LLVM 23.0.0git
SIInstrInfo.cpp
Go to the documentation of this file.
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p Op, or if both nodes do not have this operand.
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
93
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
106 --Op0Idx;
107 --Op1Idx;
108
109 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
110}
111
// Opcode-class based test for whether \p MI is a candidate for
// rematerialization. (Note: the initial opcode-class condition preceding the
// first 'return true' is not visible in this excerpt — verify upstream.)
112static bool canRemat(const MachineInstr &MI) {
113
117    return true;
118
  // An SMRD (scalar memory read) may be rematerialized only when every memory
  // operand is an invariant load, i.e. the loaded value cannot differ between
  // the original and the rematerialized execution point.
119  if (SIInstrInfo::isSMRD(MI)) {
120    return !MI.memoperands_empty() &&
121           llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
122             return MMO->isLoad() && MMO->isInvariant();
123           });
124  }
125
  // Conservative default: anything else is not rematerializable here.
126  return false;
127}
128
130 const MachineInstr &MI) const {
131
132 if (canRemat(MI)) {
133 // Normally VALU use of exec would block the rematerialization, but that
134 // is OK in this case to have an implicit exec read as all VALU do.
135 // We really want all of the generic logic for this except for this.
136
137 // Another potential implicit use is mode register. The core logic of
138 // the RA will not attempt rematerialization if mode is set anywhere
139 // in the function, otherwise it is safe since mode is not changed.
140
141 // There is difference to generic method which does not allow
142 // rematerialization if there are virtual register uses. We allow this,
143 // therefore this method includes SOP instructions as well.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
151}
152
153// Returns true if the scalar result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 // Ignore comparisons which are only used masked with exec.
156 // This allows some hoisting/sinking of VALU comparisons.
157 if (MI.isCompare()) {
158 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
159 if (!Dst)
160 return true;
161
162 Register DstReg = Dst->getReg();
163 if (!DstReg.isVirtual())
164 return true;
165
166 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
167 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
168 switch (Use.getOpcode()) {
169 case AMDGPU::S_AND_SAVEEXEC_B32:
170 case AMDGPU::S_AND_SAVEEXEC_B64:
171 break;
172 case AMDGPU::S_AND_B32:
173 case AMDGPU::S_AND_B64:
174 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
175 return true;
176 break;
177 default:
178 return true;
179 }
180 }
181 return false;
182 }
183
184 // If it is not convergent it does not depend on EXEC.
185 if (!MI.isConvergent())
186 return false;
187
188 switch (MI.getOpcode()) {
189 default:
190 break;
191 case AMDGPU::V_READFIRSTLANE_B32:
192 return true;
193 }
194
195 return false;
196}
197
199 // Any implicit use of exec by VALU is not a real register read.
200 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
201 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
202}
203
205 MachineBasicBlock *SuccToSinkTo,
206 MachineCycleInfo *CI) const {
207 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
208 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
209 return true;
210
211 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
212 // Check if sinking of MI would create temporal divergent use.
213 for (auto Op : MI.uses()) {
214 if (Op.isReg() && Op.getReg().isVirtual() &&
215 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
216 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
217
218 // SgprDef defined inside cycle
219 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
220 if (FromCycle == nullptr)
221 continue;
222
223 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
224 // Check if there is a FromCycle that contains SgprDef's basic block but
225 // does not contain SuccToSinkTo and also has divergent exit condition.
226 while (FromCycle && !FromCycle->contains(ToCycle)) {
228 FromCycle->getExitingBlocks(ExitingBlocks);
229
230 // FromCycle has divergent exit condition.
231 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
232 if (hasDivergentBranch(ExitingBlock))
233 return false;
234 }
235
236 FromCycle = FromCycle->getParentCycle();
237 }
238 }
239 }
240
241 return true;
242}
243
245 int64_t &Offset0,
246 int64_t &Offset1) const {
247 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
248 return false;
249
250 unsigned Opc0 = Load0->getMachineOpcode();
251 unsigned Opc1 = Load1->getMachineOpcode();
252
253 // Make sure both are actually loads.
254 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
255 return false;
256
257 // A mayLoad instruction without a def is not a load. Likely a prefetch.
258 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
259 return false;
260
261 if (isDS(Opc0) && isDS(Opc1)) {
262
263 // FIXME: Handle this case:
264 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
265 return false;
266
267 // Check base reg.
268 if (Load0->getOperand(0) != Load1->getOperand(0))
269 return false;
270
271 // Skip read2 / write2 variants for simplicity.
272 // TODO: We should report true if the used offsets are adjacent (excluded
273 // st64 versions).
274 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
275 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
276 if (Offset0Idx == -1 || Offset1Idx == -1)
277 return false;
278
279 // XXX - be careful of dataless loads
280 // getNamedOperandIdx returns the index for MachineInstrs. Since they
281 // include the output in the operand list, but SDNodes don't, we need to
282 // subtract the index by one.
283 Offset0Idx -= get(Opc0).NumDefs;
284 Offset1Idx -= get(Opc1).NumDefs;
285 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
286 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
287 return true;
288 }
289
290 if (isSMRD(Opc0) && isSMRD(Opc1)) {
291 // Skip time and cache invalidation instructions.
292 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
293 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
294 return false;
295
296 unsigned NumOps = getNumOperandsNoGlue(Load0);
297 if (NumOps != getNumOperandsNoGlue(Load1))
298 return false;
299
300 // Check base reg.
301 if (Load0->getOperand(0) != Load1->getOperand(0))
302 return false;
303
304 // Match register offsets, if both register and immediate offsets present.
305 assert(NumOps == 4 || NumOps == 5);
306 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
307 return false;
308
309 const ConstantSDNode *Load0Offset =
311 const ConstantSDNode *Load1Offset =
313
314 if (!Load0Offset || !Load1Offset)
315 return false;
316
317 Offset0 = Load0Offset->getZExtValue();
318 Offset1 = Load1Offset->getZExtValue();
319 return true;
320 }
321
322 // MUBUF and MTBUF can access the same addresses.
323 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
324
325 // MUBUF and MTBUF have vaddr at different indices.
326 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
327 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
328 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
329 return false;
330
331 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
332 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
333
334 if (OffIdx0 == -1 || OffIdx1 == -1)
335 return false;
336
337 // getNamedOperandIdx returns the index for MachineInstrs. Since they
338 // include the output in the operand list, but SDNodes don't, we need to
339 // subtract the index by one.
340 OffIdx0 -= get(Opc0).NumDefs;
341 OffIdx1 -= get(Opc1).NumDefs;
342
343 SDValue Off0 = Load0->getOperand(OffIdx0);
344 SDValue Off1 = Load1->getOperand(OffIdx1);
345
346 // The offset might be a FrameIndexSDNode.
347 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
348 return false;
349
350 Offset0 = Off0->getAsZExtVal();
351 Offset1 = Off1->getAsZExtVal();
352 return true;
353 }
354
355 return false;
356}
357
358static bool isStride64(unsigned Opc) {
359 switch (Opc) {
360 case AMDGPU::DS_READ2ST64_B32:
361 case AMDGPU::DS_READ2ST64_B64:
362 case AMDGPU::DS_WRITE2ST64_B32:
363 case AMDGPU::DS_WRITE2ST64_B64:
364 return true;
365 default:
366 return false;
367 }
368}
369
372 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
373 const TargetRegisterInfo *TRI) const {
374 if (!LdSt.mayLoadOrStore())
375 return false;
376
377 unsigned Opc = LdSt.getOpcode();
378 OffsetIsScalable = false;
379 const MachineOperand *BaseOp, *OffsetOp;
380 int DataOpIdx;
381
382 if (isDS(LdSt)) {
383 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
384 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
385 if (OffsetOp) {
386 // Normal, single offset LDS instruction.
387 if (!BaseOp) {
388 // DS_CONSUME/DS_APPEND use M0 for the base address.
389 // TODO: find the implicit use operand for M0 and use that as BaseOp?
390 return false;
391 }
392 BaseOps.push_back(BaseOp);
393 Offset = OffsetOp->getImm();
394 // Get appropriate operand, and compute width accordingly.
395 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
396 if (DataOpIdx == -1)
397 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
398 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
399 Width = LocationSize::precise(64);
400 else
401 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
402 } else {
403 // The 2 offset instructions use offset0 and offset1 instead. We can treat
404 // these as a load with a single offset if the 2 offsets are consecutive.
405 // We will use this for some partially aligned loads.
406 const MachineOperand *Offset0Op =
407 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
408 const MachineOperand *Offset1Op =
409 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
410
411 unsigned Offset0 = Offset0Op->getImm() & 0xff;
412 unsigned Offset1 = Offset1Op->getImm() & 0xff;
413 if (Offset0 + 1 != Offset1)
414 return false;
415
416 // Each of these offsets is in element sized units, so we need to convert
417 // to bytes of the individual reads.
418
419 unsigned EltSize;
420 if (LdSt.mayLoad())
421 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
422 else {
423 assert(LdSt.mayStore());
424 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
425 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
426 }
427
428 if (isStride64(Opc))
429 EltSize *= 64;
430
431 BaseOps.push_back(BaseOp);
432 Offset = EltSize * Offset0;
433 // Get appropriate operand(s), and compute width accordingly.
434 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
435 if (DataOpIdx == -1) {
436 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
439 Width = LocationSize::precise(
440 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
441 } else {
442 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
443 }
444 }
445 return true;
446 }
447
448 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
449 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
450 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
451 return false;
452 BaseOps.push_back(RSrc);
453 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
454 if (BaseOp && !BaseOp->isFI())
455 BaseOps.push_back(BaseOp);
456 const MachineOperand *OffsetImm =
457 getNamedOperand(LdSt, AMDGPU::OpName::offset);
458 Offset = OffsetImm->getImm();
459 const MachineOperand *SOffset =
460 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
461 if (SOffset) {
462 if (SOffset->isReg())
463 BaseOps.push_back(SOffset);
464 else
465 Offset += SOffset->getImm();
466 }
467 // Get appropriate operand, and compute width accordingly.
468 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
469 if (DataOpIdx == -1)
470 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
471 if (DataOpIdx == -1) // LDS DMA
472 return false;
473 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
474 return true;
475 }
476
477 if (isImage(LdSt)) {
478 auto RsrcOpName =
479 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
480 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
481 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
482 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
483 if (VAddr0Idx >= 0) {
484 // GFX10 possible NSA encoding.
485 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
486 BaseOps.push_back(&LdSt.getOperand(I));
487 } else {
488 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
489 }
490 Offset = 0;
491 // Get appropriate operand, and compute width accordingly.
492 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
493 if (DataOpIdx == -1)
494 return false; // no return sampler
495 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
496 return true;
497 }
498
499 if (isSMRD(LdSt)) {
500 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
501 if (!BaseOp) // e.g. S_MEMTIME
502 return false;
503 BaseOps.push_back(BaseOp);
504 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
505 Offset = OffsetOp ? OffsetOp->getImm() : 0;
506 // Get appropriate operand, and compute width accordingly.
507 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
508 if (DataOpIdx == -1)
509 return false;
510 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
511 return true;
512 }
513
514 if (isFLAT(LdSt)) {
515 // Instructions have either vaddr or saddr or both or none.
516 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
517 if (BaseOp)
518 BaseOps.push_back(BaseOp);
519 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
520 if (BaseOp)
521 BaseOps.push_back(BaseOp);
522 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
523 // Get appropriate operand, and compute width accordingly.
524 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
525 if (DataOpIdx == -1)
526 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
527 if (DataOpIdx == -1) // LDS DMA
528 return false;
529 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
530 return true;
531 }
532
533 return false;
534}
535
// Returns true when the two memory instructions are known to address memory
// relative to the same base pointer, judged first by their explicit base
// operands and then by the underlying IR objects of their memory operands.
536static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
538                                  const MachineInstr &MI2,
540  // Only examine the first "base" operand of each instruction, on the
541  // assumption that it represents the real base address of the memory access.
542  // Other operands are typically offsets or indices from this base address.
543  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
544    return true;
545
  // The explicit base operands differ; fall back to the IR-level memory
  // operands. This is only meaningful with exactly one memoperand per side.
546  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
547    return false;
548
549  auto *MO1 = *MI1.memoperands_begin();
550  auto *MO2 = *MI2.memoperands_begin();
  // Accesses in different address spaces cannot share a base pointer.
551  if (MO1->getAddrSpace() != MO2->getAddrSpace())
552    return false;
553
  // Strip offsets/GEPs to compare the underlying IR objects.
554  const auto *Base1 = MO1->getValue();
555  const auto *Base2 = MO2->getValue();
556  if (!Base1 || !Base2)
557    return false;
558  Base1 = getUnderlyingObject(Base1);
559  Base2 = getUnderlyingObject(Base2);
560
  // An undef base carries no identity; conservatively report different bases.
561  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
562    return false;
563
564  return Base1 == Base2;
565}
566
568 int64_t Offset1, bool OffsetIsScalable1,
570 int64_t Offset2, bool OffsetIsScalable2,
571 unsigned ClusterSize,
572 unsigned NumBytes) const {
573 // If the mem ops (to be clustered) do not have the same base ptr, then they
574 // should not be clustered
575 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
576 if (!BaseOps1.empty() && !BaseOps2.empty()) {
577 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
578 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
579 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
580 return false;
581
582 const SIMachineFunctionInfo *MFI =
583 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
584 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
585 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
586 // If only one base op is empty, they do not have the same base ptr
587 return false;
588 }
589
590 // In order to avoid register pressure, on an average, the number of DWORDS
591 // loaded together by all clustered mem ops should not exceed
592 // MaxMemoryClusterDWords. This is an empirical value based on certain
593 // observations and performance related experiments.
594 // The good thing about this heuristic is - it avoids clustering of too many
595 // sub-word loads, and also avoids clustering of wide loads. Below is the
596 // brief summary of how the heuristic behaves for various `LoadSize` when
597 // MaxMemoryClusterDWords is 8.
598 //
599 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
600 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
601 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
602 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
603 // (5) LoadSize >= 17: do not cluster
604 const unsigned LoadSize = NumBytes / ClusterSize;
605 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
606 return NumDWords <= MaxMemoryClusterDWords;
607}
608
609// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
610// the first 16 loads will be interleaved with the stores, and the next 16 will
611// be clustered as expected. It should really split into 2 16 store batches.
612//
613// Loads are clustered until this returns false, rather than trying to schedule
614// groups of stores. This also means we have to deal with saying different
615// address space loads should be clustered, and ones which might cause bank
616// conflicts.
617//
618// This might be deprecated so it might not be worth that much effort to fix.
620 int64_t Offset0, int64_t Offset1,
621 unsigned NumLoads) const {
622 assert(Offset1 > Offset0 &&
623 "Second offset should be larger than first offset!");
624 // If we have less than 16 loads in a row, and the offsets are within 64
625 // bytes, then schedule together.
626
627 // A cacheline is 64 bytes (for global memory).
628 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
629}
630
633 const DebugLoc &DL, MCRegister DestReg,
634 MCRegister SrcReg, bool KillSrc,
635 const char *Msg = "illegal VGPR to SGPR copy") {
636 MachineFunction *MF = MBB.getParent();
637
639 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
640
641 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
642 .addReg(SrcReg, getKillRegState(KillSrc));
643}
644
645/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
646/// possible to have a direct copy in these cases on GFX908, so an intermediate
647/// VGPR copy is required.
651 const DebugLoc &DL, MCRegister DestReg,
652 MCRegister SrcReg, bool KillSrc,
653 RegScavenger &RS, bool RegsOverlap,
654 Register ImpDefSuperReg = Register(),
655 Register ImpUseSuperReg = Register()) {
656 assert((TII.getSubtarget().hasMAIInsts() &&
657 !TII.getSubtarget().hasGFX90AInsts()) &&
658 "Expected GFX908 subtarget.");
659
660 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
661 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
662 "Source register of the copy should be either an SGPR or an AGPR.");
663
664 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
665 "Destination register of the copy should be an AGPR.");
666
667 const SIRegisterInfo &RI = TII.getRegisterInfo();
668
669 // First try to find defining accvgpr_write to avoid temporary registers.
670 // In the case of copies of overlapping AGPRs, we conservatively do not
671 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
672 // an accvgpr_write used for this same copy due to implicit-defs
673 if (!RegsOverlap) {
674 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
675 --Def;
676
677 if (!Def->modifiesRegister(SrcReg, &RI))
678 continue;
679
680 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
681 Def->getOperand(0).getReg() != SrcReg)
682 break;
683
684 MachineOperand &DefOp = Def->getOperand(1);
685 assert(DefOp.isReg() || DefOp.isImm());
686
687 if (DefOp.isReg()) {
688 bool SafeToPropagate = true;
689 // Check that register source operand is not clobbered before MI.
690 // Immediate operands are always safe to propagate.
691 for (auto I = Def; I != MI && SafeToPropagate; ++I)
692 if (I->modifiesRegister(DefOp.getReg(), &RI))
693 SafeToPropagate = false;
694
695 if (!SafeToPropagate)
696 break;
697
698 for (auto I = Def; I != MI; ++I)
699 I->clearRegisterKills(DefOp.getReg(), &RI);
700 }
701
702 MachineInstrBuilder Builder =
703 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
704 .add(DefOp);
705 if (ImpDefSuperReg)
706 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
707
708 if (ImpUseSuperReg) {
709 Builder.addReg(ImpUseSuperReg,
711 }
712
713 return;
714 }
715 }
716
717 RS.enterBasicBlockEnd(MBB);
718 RS.backward(std::next(MI));
719
720 // Ideally we want to have three registers for a long reg_sequence copy
721 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
722 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
723 *MBB.getParent());
724
725 // Registers in the sequence are allocated contiguously so we can just
726 // use register number to pick one of three round-robin temps.
727 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
728 Register Tmp =
729 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
730 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
731 "VGPR used for an intermediate copy should have been reserved.");
732
733 // Only loop through if there are any free registers left. We don't want to
734 // spill.
735 while (RegNo--) {
736 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
737 /* RestoreAfter */ false, 0,
738 /* AllowSpill */ false);
739 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
740 break;
741 Tmp = Tmp2;
742 RS.setRegUsed(Tmp);
743 }
744
745 // Insert copy to temporary VGPR.
746 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
747 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
748 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
749 } else {
750 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
751 }
752
753 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
754 .addReg(SrcReg, getKillRegState(KillSrc));
755 if (ImpUseSuperReg) {
756 UseBuilder.addReg(ImpUseSuperReg,
758 }
759
760 MachineInstrBuilder DefBuilder
761 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
762 .addReg(Tmp, RegState::Kill);
763
764 if (ImpDefSuperReg)
765 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
766}
767
770 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
771 const TargetRegisterClass *RC, bool Forward) {
772 const SIRegisterInfo &RI = TII.getRegisterInfo();
773 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
775 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
776
777 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
778 int16_t SubIdx = BaseIndices[Idx];
779 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
780 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
781 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
782 unsigned Opcode = AMDGPU::S_MOV_B32;
783
784 // Is SGPR aligned? If so try to combine with next.
785 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
786 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
787 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
788 // Can use SGPR64 copy
789 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
790 SubIdx = RI.getSubRegFromChannel(Channel, 2);
791 DestSubReg = RI.getSubReg(DestReg, SubIdx);
792 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
793 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
794 Opcode = AMDGPU::S_MOV_B64;
795 Idx++;
796 }
797
798 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
799 .addReg(SrcSubReg)
800 .addReg(SrcReg, RegState::Implicit);
801
802 if (!FirstMI)
803 FirstMI = LastMI;
804
805 if (!Forward)
806 I--;
807 }
808
809 assert(FirstMI && LastMI);
810 if (!Forward)
811 std::swap(FirstMI, LastMI);
812
813 FirstMI->addOperand(
814 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
815
816 if (KillSrc)
817 LastMI->addRegisterKilled(SrcReg, &RI);
818}
819
822 const DebugLoc &DL, Register DestReg,
823 Register SrcReg, bool KillSrc, bool RenamableDest,
824 bool RenamableSrc) const {
825 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
826 unsigned Size = RI.getRegSizeInBits(*RC);
827 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
828 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
829
830 // The rest of copyPhysReg assumes Src and Dst size are the same size.
831 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
832 // we remove Fix16BitCopies and this code block?
833 if (Fix16BitCopies) {
834 if (((Size == 16) != (SrcSize == 16))) {
835 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
836 assert(ST.useRealTrue16Insts());
837 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
838 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
839 RegToFix = SubReg;
840
841 if (DestReg == SrcReg) {
842 // Identity copy. Insert empty bundle since ExpandPostRA expects an
843 // instruction here.
844 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
845 return;
846 }
847 RC = RI.getPhysRegBaseClass(DestReg);
848 Size = RI.getRegSizeInBits(*RC);
849 SrcRC = RI.getPhysRegBaseClass(SrcReg);
850 SrcSize = RI.getRegSizeInBits(*SrcRC);
851 }
852 }
853
854 if (RC == &AMDGPU::VGPR_32RegClass) {
855 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
856 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
857 AMDGPU::AGPR_32RegClass.contains(SrcReg));
858 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
859 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
860 BuildMI(MBB, MI, DL, get(Opc), DestReg)
861 .addReg(SrcReg, getKillRegState(KillSrc));
862 return;
863 }
864
865 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
866 RC == &AMDGPU::SReg_32RegClass) {
867 if (SrcReg == AMDGPU::SCC) {
868 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
869 .addImm(1)
870 .addImm(0);
871 return;
872 }
873
874 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
875 if (DestReg == AMDGPU::VCC_LO) {
876 // FIXME: Hack until VReg_1 removed.
877 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
878 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
879 .addImm(0)
880 .addReg(SrcReg, getKillRegState(KillSrc));
881 return;
882 }
883
884 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
885 return;
886 }
887
888 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
889 .addReg(SrcReg, getKillRegState(KillSrc));
890 return;
891 }
892
893 if (RC == &AMDGPU::SReg_64RegClass) {
894 if (SrcReg == AMDGPU::SCC) {
895 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
896 .addImm(1)
897 .addImm(0);
898 return;
899 }
900
901 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
902 if (DestReg == AMDGPU::VCC) {
903 // FIXME: Hack until VReg_1 removed.
904 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
905 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
906 .addImm(0)
907 .addReg(SrcReg, getKillRegState(KillSrc));
908 return;
909 }
910
911 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
912 return;
913 }
914
915 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
916 .addReg(SrcReg, getKillRegState(KillSrc));
917 return;
918 }
919
920 if (DestReg == AMDGPU::SCC) {
921 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
922 // but SelectionDAG emits such copies for i1 sources.
923 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
924 // This copy can only be produced by patterns
925 // with explicit SCC, which are known to be enabled
926 // only for subtargets with S_CMP_LG_U64 present.
927 assert(ST.hasScalarCompareEq64());
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 } else {
932 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
933 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
934 .addReg(SrcReg, getKillRegState(KillSrc))
935 .addImm(0);
936 }
937
938 return;
939 }
940
941 if (RC == &AMDGPU::AGPR_32RegClass) {
942 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
943 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
944 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
945 .addReg(SrcReg, getKillRegState(KillSrc));
946 return;
947 }
948
949 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
950 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
951 .addReg(SrcReg, getKillRegState(KillSrc));
952 return;
953 }
954
955 // FIXME: Pass should maintain scavenger to avoid scan through the block on
956 // every AGPR spill.
957 RegScavenger RS;
958 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
959 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
960 return;
961 }
962
963 if (Size == 16) {
964 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
965 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
966 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
967
968 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
969 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
970 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
971 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
972 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
973 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
974 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
975 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
976
977 if (IsSGPRDst) {
978 if (!IsSGPRSrc) {
979 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
980 return;
981 }
982
983 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
984 .addReg(NewSrcReg, getKillRegState(KillSrc));
985 return;
986 }
987
988 if (IsAGPRDst || IsAGPRSrc) {
989 if (!DstLow || !SrcLow) {
990 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
991 "Cannot use hi16 subreg with an AGPR!");
992 }
993
994 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
995 return;
996 }
997
998 if (ST.useRealTrue16Insts()) {
999 if (IsSGPRSrc) {
1000 assert(SrcLow);
1001 SrcReg = NewSrcReg;
1002 }
1003 // Use the smaller instruction encoding if possible.
1004 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1005 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1006 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1007 .addReg(SrcReg);
1008 } else {
1009 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1010 .addImm(0) // src0_modifiers
1011 .addReg(SrcReg)
1012 .addImm(0); // op_sel
1013 }
1014 return;
1015 }
1016
1017 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1018 if (!DstLow || !SrcLow) {
1019 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1020 "Cannot use hi16 subreg on VI!");
1021 }
1022
1023 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1024 .addReg(NewSrcReg, getKillRegState(KillSrc));
1025 return;
1026 }
1027
1028 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1029 .addImm(0) // src0_modifiers
1030 .addReg(NewSrcReg)
1031 .addImm(0) // clamp
1038 // First implicit operand is $exec.
1039 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1040 return;
1041 }
1042
1043 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1044 if (ST.hasMovB64()) {
1045 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1046 .addReg(SrcReg, getKillRegState(KillSrc));
1047 return;
1048 }
1049 if (ST.hasPkMovB32()) {
1050 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1052 .addReg(SrcReg)
1054 .addReg(SrcReg)
1055 .addImm(0) // op_sel_lo
1056 .addImm(0) // op_sel_hi
1057 .addImm(0) // neg_lo
1058 .addImm(0) // neg_hi
1059 .addImm(0) // clamp
1060 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1061 return;
1062 }
1063 }
1064
1065 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1066 if (RI.isSGPRClass(RC)) {
1067 if (!RI.isSGPRClass(SrcRC)) {
1068 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1069 return;
1070 }
1071 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1072 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1073 Forward);
1074 return;
1075 }
1076
1077 unsigned EltSize = 4;
1078 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1079 if (RI.isAGPRClass(RC)) {
1080 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1081 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1082 else if (RI.hasVGPRs(SrcRC) ||
1083 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1084 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1085 else
1086 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1087 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1088 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1089 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1090 (RI.isProperlyAlignedRC(*RC) &&
1091 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1092 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1093 if (ST.hasMovB64()) {
1094 Opcode = AMDGPU::V_MOV_B64_e32;
1095 EltSize = 8;
1096 } else if (ST.hasPkMovB32()) {
1097 Opcode = AMDGPU::V_PK_MOV_B32;
1098 EltSize = 8;
1099 }
1100 }
1101
1102 // For the cases where we need an intermediate instruction/temporary register
1103 // (destination is an AGPR), we need a scavenger.
1104 //
1105 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1106 // whole block for every handled copy.
1107 std::unique_ptr<RegScavenger> RS;
1108 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1109 RS = std::make_unique<RegScavenger>();
1110
1111 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1112
1113 // If there is an overlap, we can't kill the super-register on the last
1114 // instruction, since it will also kill the components made live by this def.
1115 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1116 const bool CanKillSuperReg = KillSrc && !Overlap;
1117
1118 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1119 unsigned SubIdx;
1120 if (Forward)
1121 SubIdx = SubIndices[Idx];
1122 else
1123 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1124 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1125 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1126 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1127
1128 bool IsFirstSubreg = Idx == 0;
1129 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1130
1131 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1132 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1133 Register ImpUseSuper = SrcReg;
1134 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1135 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1136 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1138 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1140 .addReg(SrcSubReg)
1142 .addReg(SrcSubReg)
1143 .addImm(0) // op_sel_lo
1144 .addImm(0) // op_sel_hi
1145 .addImm(0) // neg_lo
1146 .addImm(0) // neg_hi
1147 .addImm(0) // clamp
1148 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1149 if (IsFirstSubreg)
1151 } else {
1152 MachineInstrBuilder Builder =
1153 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1154 if (IsFirstSubreg)
1155 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1156
1157 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1158 }
1159 }
1160}
1161
1162int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1163 int32_t NewOpc;
1164
1165 // Try to map original to commuted opcode
1166 NewOpc = AMDGPU::getCommuteRev(Opcode);
1167 if (NewOpc != -1)
1168 // Check if the commuted (REV) opcode exists on the target.
1169 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1170
1171 // Try to map commuted to original opcode
1172 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1173 if (NewOpc != -1)
1174 // Check if the original (non-REV) opcode exists on the target.
1175 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1176
1177 return Opcode;
1178}
1179
1180const TargetRegisterClass *
1182 return &AMDGPU::VGPR_32RegClass;
1183}
1184
1187 const DebugLoc &DL, Register DstReg,
1189 Register TrueReg,
1190 Register FalseReg) const {
1191 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1192 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1194 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1195 "Not a VGPR32 reg");
1196
1197 if (Cond.size() == 1) {
1198 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1199 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1200 .add(Cond[0]);
1201 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1202 .addImm(0)
1203 .addReg(FalseReg)
1204 .addImm(0)
1205 .addReg(TrueReg)
1206 .addReg(SReg);
1207 } else if (Cond.size() == 2) {
1208 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1209 switch (Cond[0].getImm()) {
1210 case SIInstrInfo::SCC_TRUE: {
1211 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1212 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1213 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1214 .addImm(0)
1215 .addReg(FalseReg)
1216 .addImm(0)
1217 .addReg(TrueReg)
1218 .addReg(SReg);
1219 break;
1220 }
1221 case SIInstrInfo::SCC_FALSE: {
1222 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1223 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1224 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1225 .addImm(0)
1226 .addReg(FalseReg)
1227 .addImm(0)
1228 .addReg(TrueReg)
1229 .addReg(SReg);
1230 break;
1231 }
1232 case SIInstrInfo::VCCNZ: {
1233 MachineOperand RegOp = Cond[1];
1234 RegOp.setImplicit(false);
1235 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1236 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1237 .add(RegOp);
1238 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1239 .addImm(0)
1240 .addReg(FalseReg)
1241 .addImm(0)
1242 .addReg(TrueReg)
1243 .addReg(SReg);
1244 break;
1245 }
1246 case SIInstrInfo::VCCZ: {
1247 MachineOperand RegOp = Cond[1];
1248 RegOp.setImplicit(false);
1249 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1250 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1251 .add(RegOp);
1252 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1253 .addImm(0)
1254 .addReg(TrueReg)
1255 .addImm(0)
1256 .addReg(FalseReg)
1257 .addReg(SReg);
1258 break;
1259 }
1260 case SIInstrInfo::EXECNZ: {
1261 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1262 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1263 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1264 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1265 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1266 .addImm(0)
1267 .addReg(FalseReg)
1268 .addImm(0)
1269 .addReg(TrueReg)
1270 .addReg(SReg);
1271 break;
1272 }
1273 case SIInstrInfo::EXECZ: {
1274 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1275 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1276 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1277 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1278 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1279 .addImm(0)
1280 .addReg(FalseReg)
1281 .addImm(0)
1282 .addReg(TrueReg)
1283 .addReg(SReg);
1284 llvm_unreachable("Unhandled branch predicate EXECZ");
1285 break;
1286 }
1287 default:
1288 llvm_unreachable("invalid branch predicate");
1289 }
1290 } else {
1291 llvm_unreachable("Can only handle Cond size 1 or 2");
1292 }
1293}
1294
1297 const DebugLoc &DL,
1298 Register SrcReg, int Value) const {
1299 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1300 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1301 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1302 .addImm(Value)
1303 .addReg(SrcReg);
1304
1305 return Reg;
1306}
1307
1310 const DebugLoc &DL,
1311 Register SrcReg, int Value) const {
1312 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1313 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1314 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1315 .addImm(Value)
1316 .addReg(SrcReg);
1317
1318 return Reg;
1319}
1320
1322 const Register Reg,
1323 int64_t &ImmVal) const {
1324 switch (MI.getOpcode()) {
1325 case AMDGPU::V_MOV_B32_e32:
1326 case AMDGPU::S_MOV_B32:
1327 case AMDGPU::S_MOVK_I32:
1328 case AMDGPU::S_MOV_B64:
1329 case AMDGPU::V_MOV_B64_e32:
1330 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1331 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1332 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1333 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1334 case AMDGPU::V_MOV_B64_PSEUDO:
1335 case AMDGPU::V_MOV_B16_t16_e32: {
1336 const MachineOperand &Src0 = MI.getOperand(1);
1337 if (Src0.isImm()) {
1338 ImmVal = Src0.getImm();
1339 return MI.getOperand(0).getReg() == Reg;
1340 }
1341
1342 return false;
1343 }
1344 case AMDGPU::V_MOV_B16_t16_e64: {
1345 const MachineOperand &Src0 = MI.getOperand(2);
1346 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1347 ImmVal = Src0.getImm();
1348 return MI.getOperand(0).getReg() == Reg;
1349 }
1350
1351 return false;
1352 }
1353 case AMDGPU::S_BREV_B32:
1354 case AMDGPU::V_BFREV_B32_e32:
1355 case AMDGPU::V_BFREV_B32_e64: {
1356 const MachineOperand &Src0 = MI.getOperand(1);
1357 if (Src0.isImm()) {
1358 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1359 return MI.getOperand(0).getReg() == Reg;
1360 }
1361
1362 return false;
1363 }
1364 case AMDGPU::S_NOT_B32:
1365 case AMDGPU::V_NOT_B32_e32:
1366 case AMDGPU::V_NOT_B32_e64: {
1367 const MachineOperand &Src0 = MI.getOperand(1);
1368 if (Src0.isImm()) {
1369 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1370 return MI.getOperand(0).getReg() == Reg;
1371 }
1372
1373 return false;
1374 }
1375 default:
1376 return false;
1377 }
1378}
1379
1380std::optional<int64_t>
1382 if (Op.isImm())
1383 return Op.getImm();
1384
1385 if (!Op.isReg() || !Op.getReg().isVirtual())
1386 return std::nullopt;
1387 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1388 const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1389 if (Def && Def->isMoveImmediate()) {
1390 const MachineOperand &ImmSrc = Def->getOperand(1);
1391 if (ImmSrc.isImm())
1392 return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1393 }
1394
1395 return std::nullopt;
1396}
1397
1399
1400 if (RI.isAGPRClass(DstRC))
1401 return AMDGPU::COPY;
1402 if (RI.getRegSizeInBits(*DstRC) == 16) {
1403 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1404 // before RA.
1405 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1406 }
1407 if (RI.getRegSizeInBits(*DstRC) == 32)
1408 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1409 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1410 return AMDGPU::S_MOV_B64;
1411 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1412 return AMDGPU::V_MOV_B64_PSEUDO;
1413 return AMDGPU::COPY;
1414}
1415
1416const MCInstrDesc &
1418 bool IsIndirectSrc) const {
1419 if (IsIndirectSrc) {
1420 if (VecSize <= 32) // 4 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1422 if (VecSize <= 64) // 8 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1424 if (VecSize <= 96) // 12 bytes
1425 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1426 if (VecSize <= 128) // 16 bytes
1427 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1428 if (VecSize <= 160) // 20 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1430 if (VecSize <= 192) // 24 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1432 if (VecSize <= 224) // 28 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1434 if (VecSize <= 256) // 32 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1436 if (VecSize <= 288) // 36 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1438 if (VecSize <= 320) // 40 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1440 if (VecSize <= 352) // 44 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1442 if (VecSize <= 384) // 48 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1444 if (VecSize <= 512) // 64 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1446 if (VecSize <= 1024) // 128 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1448
1449 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1450 }
1451
1452 if (VecSize <= 32) // 4 bytes
1453 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1454 if (VecSize <= 64) // 8 bytes
1455 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1456 if (VecSize <= 96) // 12 bytes
1457 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1458 if (VecSize <= 128) // 16 bytes
1459 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1460 if (VecSize <= 160) // 20 bytes
1461 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1462 if (VecSize <= 192) // 24 bytes
1463 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1464 if (VecSize <= 224) // 28 bytes
1465 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1466 if (VecSize <= 256) // 32 bytes
1467 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1468 if (VecSize <= 288) // 36 bytes
1469 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1470 if (VecSize <= 320) // 40 bytes
1471 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1472 if (VecSize <= 352) // 44 bytes
1473 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1474 if (VecSize <= 384) // 48 bytes
1475 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1476 if (VecSize <= 512) // 64 bytes
1477 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1478 if (VecSize <= 1024) // 128 bytes
1479 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1480
1481 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1482}
1483
1484static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1485 if (VecSize <= 32) // 4 bytes
1486 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1487 if (VecSize <= 64) // 8 bytes
1488 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1489 if (VecSize <= 96) // 12 bytes
1490 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1491 if (VecSize <= 128) // 16 bytes
1492 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1493 if (VecSize <= 160) // 20 bytes
1494 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1495 if (VecSize <= 192) // 24 bytes
1496 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1497 if (VecSize <= 224) // 28 bytes
1498 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1499 if (VecSize <= 256) // 32 bytes
1500 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1501 if (VecSize <= 288) // 36 bytes
1502 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1503 if (VecSize <= 320) // 40 bytes
1504 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1505 if (VecSize <= 352) // 44 bytes
1506 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1507 if (VecSize <= 384) // 48 bytes
1508 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1509 if (VecSize <= 512) // 64 bytes
1510 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1511 if (VecSize <= 1024) // 128 bytes
1512 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1513
1514 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1515}
1516
1517static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1518 if (VecSize <= 32) // 4 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1520 if (VecSize <= 64) // 8 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1522 if (VecSize <= 96) // 12 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1524 if (VecSize <= 128) // 16 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1526 if (VecSize <= 160) // 20 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1528 if (VecSize <= 192) // 24 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1530 if (VecSize <= 224) // 28 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1534 if (VecSize <= 288) // 36 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1536 if (VecSize <= 320) // 40 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1538 if (VecSize <= 352) // 44 bytes
1539 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1540 if (VecSize <= 384) // 48 bytes
1541 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1542 if (VecSize <= 512) // 64 bytes
1543 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1544 if (VecSize <= 1024) // 128 bytes
1545 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1546
1547 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1548}
1549
1550static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1551 if (VecSize <= 64) // 8 bytes
1552 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1553 if (VecSize <= 128) // 16 bytes
1554 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1555 if (VecSize <= 256) // 32 bytes
1556 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1557 if (VecSize <= 512) // 64 bytes
1558 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1559 if (VecSize <= 1024) // 128 bytes
1560 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1561
1562 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1563}
1564
1565const MCInstrDesc &
1566SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1567 bool IsSGPR) const {
1568 if (IsSGPR) {
1569 switch (EltSize) {
1570 case 32:
1571 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1572 case 64:
1573 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1574 default:
1575 llvm_unreachable("invalid reg indexing elt size");
1576 }
1577 }
1578
1579 assert(EltSize == 32 && "invalid reg indexing elt size");
1581}
1582
1583static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1584 switch (Size) {
1585 case 4:
1586 return AMDGPU::SI_SPILL_S32_SAVE;
1587 case 8:
1588 return AMDGPU::SI_SPILL_S64_SAVE;
1589 case 12:
1590 return AMDGPU::SI_SPILL_S96_SAVE;
1591 case 16:
1592 return AMDGPU::SI_SPILL_S128_SAVE;
1593 case 20:
1594 return AMDGPU::SI_SPILL_S160_SAVE;
1595 case 24:
1596 return AMDGPU::SI_SPILL_S192_SAVE;
1597 case 28:
1598 return AMDGPU::SI_SPILL_S224_SAVE;
1599 case 32:
1600 return AMDGPU::SI_SPILL_S256_SAVE;
1601 case 36:
1602 return AMDGPU::SI_SPILL_S288_SAVE;
1603 case 40:
1604 return AMDGPU::SI_SPILL_S320_SAVE;
1605 case 44:
1606 return AMDGPU::SI_SPILL_S352_SAVE;
1607 case 48:
1608 return AMDGPU::SI_SPILL_S384_SAVE;
1609 case 64:
1610 return AMDGPU::SI_SPILL_S512_SAVE;
1611 case 128:
1612 return AMDGPU::SI_SPILL_S1024_SAVE;
1613 default:
1614 llvm_unreachable("unknown register size");
1615 }
1616}
1617
1618static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1619 switch (Size) {
1620 case 2:
1621 return AMDGPU::SI_SPILL_V16_SAVE;
1622 case 4:
1623 return AMDGPU::SI_SPILL_V32_SAVE;
1624 case 8:
1625 return AMDGPU::SI_SPILL_V64_SAVE;
1626 case 12:
1627 return AMDGPU::SI_SPILL_V96_SAVE;
1628 case 16:
1629 return AMDGPU::SI_SPILL_V128_SAVE;
1630 case 20:
1631 return AMDGPU::SI_SPILL_V160_SAVE;
1632 case 24:
1633 return AMDGPU::SI_SPILL_V192_SAVE;
1634 case 28:
1635 return AMDGPU::SI_SPILL_V224_SAVE;
1636 case 32:
1637 return AMDGPU::SI_SPILL_V256_SAVE;
1638 case 36:
1639 return AMDGPU::SI_SPILL_V288_SAVE;
1640 case 40:
1641 return AMDGPU::SI_SPILL_V320_SAVE;
1642 case 44:
1643 return AMDGPU::SI_SPILL_V352_SAVE;
1644 case 48:
1645 return AMDGPU::SI_SPILL_V384_SAVE;
1646 case 64:
1647 return AMDGPU::SI_SPILL_V512_SAVE;
1648 case 128:
1649 return AMDGPU::SI_SPILL_V1024_SAVE;
1650 default:
1651 llvm_unreachable("unknown register size");
1652 }
1653}
1654
1655static unsigned getAVSpillSaveOpcode(unsigned Size) {
1656 switch (Size) {
1657 case 4:
1658 return AMDGPU::SI_SPILL_AV32_SAVE;
1659 case 8:
1660 return AMDGPU::SI_SPILL_AV64_SAVE;
1661 case 12:
1662 return AMDGPU::SI_SPILL_AV96_SAVE;
1663 case 16:
1664 return AMDGPU::SI_SPILL_AV128_SAVE;
1665 case 20:
1666 return AMDGPU::SI_SPILL_AV160_SAVE;
1667 case 24:
1668 return AMDGPU::SI_SPILL_AV192_SAVE;
1669 case 28:
1670 return AMDGPU::SI_SPILL_AV224_SAVE;
1671 case 32:
1672 return AMDGPU::SI_SPILL_AV256_SAVE;
1673 case 36:
1674 return AMDGPU::SI_SPILL_AV288_SAVE;
1675 case 40:
1676 return AMDGPU::SI_SPILL_AV320_SAVE;
1677 case 44:
1678 return AMDGPU::SI_SPILL_AV352_SAVE;
1679 case 48:
1680 return AMDGPU::SI_SPILL_AV384_SAVE;
1681 case 64:
1682 return AMDGPU::SI_SPILL_AV512_SAVE;
1683 case 128:
1684 return AMDGPU::SI_SPILL_AV1024_SAVE;
1685 default:
1686 llvm_unreachable("unknown register size");
1687 }
1688}
1689
1690static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1691 bool IsVectorSuperClass) {
1692 // Currently, there is only 32-bit WWM register spills needed.
1693 if (Size != 4)
1694 llvm_unreachable("unknown wwm register spill size");
1695
1696 if (IsVectorSuperClass)
1697 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1698
1699 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1700}
1701
1703 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1704 const SIMachineFunctionInfo &MFI) const {
1705 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1706
1707 // Choose the right opcode if spilling a WWM register.
1709 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1710
1711 // TODO: Check if AGPRs are available
1712 if (ST.hasMAIInsts())
1713 return getAVSpillSaveOpcode(Size);
1714
1716}
1717
1720 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1721 MachineInstr::MIFlag Flags) const {
1722 MachineFunction *MF = MBB.getParent();
1724 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1725 const DebugLoc &DL = MBB.findDebugLoc(MI);
1726
1727 MachinePointerInfo PtrInfo
1728 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1730 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1731 FrameInfo.getObjectAlign(FrameIndex));
1732 unsigned SpillSize = RI.getSpillSize(*RC);
1733
1734 MachineRegisterInfo &MRI = MF->getRegInfo();
1735 if (RI.isSGPRClass(RC)) {
1736 MFI->setHasSpilledSGPRs();
1737 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1738 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1739 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1740
1741 // We are only allowed to create one new instruction when spilling
1742 // registers, so we need to use pseudo instruction for spilling SGPRs.
1743 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1744
1745 // The SGPR spill/restore instructions only work on number sgprs, so we need
1746 // to make sure we are using the correct register class.
1747 if (SrcReg.isVirtual() && SpillSize == 4) {
1748 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1749 }
1750
1751 BuildMI(MBB, MI, DL, OpDesc)
1752 .addReg(SrcReg, getKillRegState(isKill)) // data
1753 .addFrameIndex(FrameIndex) // addr
1754 .addMemOperand(MMO)
1756
1757 if (RI.spillSGPRToVGPR())
1758 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1759 return;
1760 }
1761
1762 unsigned Opcode =
1763 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1764 MFI->setHasSpilledVGPRs();
1765
1766 BuildMI(MBB, MI, DL, get(Opcode))
1767 .addReg(SrcReg, getKillRegState(isKill)) // data
1768 .addFrameIndex(FrameIndex) // addr
1769 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1770 .addImm(0) // offset
1771 .addMemOperand(MMO);
1772}
1773
1774static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1775 switch (Size) {
1776 case 4:
1777 return AMDGPU::SI_SPILL_S32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_S64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_S96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_S128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_S160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_S192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_S224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_S256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_S288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_S320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_S352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_S384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_S512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_S1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 2:
1812 return AMDGPU::SI_SPILL_V16_RESTORE;
1813 case 4:
1814 return AMDGPU::SI_SPILL_V32_RESTORE;
1815 case 8:
1816 return AMDGPU::SI_SPILL_V64_RESTORE;
1817 case 12:
1818 return AMDGPU::SI_SPILL_V96_RESTORE;
1819 case 16:
1820 return AMDGPU::SI_SPILL_V128_RESTORE;
1821 case 20:
1822 return AMDGPU::SI_SPILL_V160_RESTORE;
1823 case 24:
1824 return AMDGPU::SI_SPILL_V192_RESTORE;
1825 case 28:
1826 return AMDGPU::SI_SPILL_V224_RESTORE;
1827 case 32:
1828 return AMDGPU::SI_SPILL_V256_RESTORE;
1829 case 36:
1830 return AMDGPU::SI_SPILL_V288_RESTORE;
1831 case 40:
1832 return AMDGPU::SI_SPILL_V320_RESTORE;
1833 case 44:
1834 return AMDGPU::SI_SPILL_V352_RESTORE;
1835 case 48:
1836 return AMDGPU::SI_SPILL_V384_RESTORE;
1837 case 64:
1838 return AMDGPU::SI_SPILL_V512_RESTORE;
1839 case 128:
1840 return AMDGPU::SI_SPILL_V1024_RESTORE;
1841 default:
1842 llvm_unreachable("unknown register size");
1843 }
1844}
1845
1846static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1847 switch (Size) {
1848 case 4:
1849 return AMDGPU::SI_SPILL_AV32_RESTORE;
1850 case 8:
1851 return AMDGPU::SI_SPILL_AV64_RESTORE;
1852 case 12:
1853 return AMDGPU::SI_SPILL_AV96_RESTORE;
1854 case 16:
1855 return AMDGPU::SI_SPILL_AV128_RESTORE;
1856 case 20:
1857 return AMDGPU::SI_SPILL_AV160_RESTORE;
1858 case 24:
1859 return AMDGPU::SI_SPILL_AV192_RESTORE;
1860 case 28:
1861 return AMDGPU::SI_SPILL_AV224_RESTORE;
1862 case 32:
1863 return AMDGPU::SI_SPILL_AV256_RESTORE;
1864 case 36:
1865 return AMDGPU::SI_SPILL_AV288_RESTORE;
1866 case 40:
1867 return AMDGPU::SI_SPILL_AV320_RESTORE;
1868 case 44:
1869 return AMDGPU::SI_SPILL_AV352_RESTORE;
1870 case 48:
1871 return AMDGPU::SI_SPILL_AV384_RESTORE;
1872 case 64:
1873 return AMDGPU::SI_SPILL_AV512_RESTORE;
1874 case 128:
1875 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1876 default:
1877 llvm_unreachable("unknown register size");
1878 }
1879}
1880
1881static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1882 bool IsVectorSuperClass) {
1883 // Currently, there is only 32-bit WWM register spills needed.
1884 if (Size != 4)
1885 llvm_unreachable("unknown wwm register spill size");
1886
1887 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1888 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1889
1890 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1891}
1892
1894 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1895 const SIMachineFunctionInfo &MFI) const {
1896 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1897
1898 // Choose the right opcode if restoring a WWM register.
1900 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1901
1902 // TODO: Check if AGPRs are available
1903 if (ST.hasMAIInsts())
1905
1906 assert(!RI.isAGPRClass(RC));
1908}
1909
1912 Register DestReg, int FrameIndex,
1913 const TargetRegisterClass *RC,
1914 Register VReg, unsigned SubReg,
1915 MachineInstr::MIFlag Flags) const {
1916 MachineFunction *MF = MBB.getParent();
1918 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1919 const DebugLoc &DL = MBB.findDebugLoc(MI);
1920 unsigned SpillSize = RI.getSpillSize(*RC);
1921
1922 MachinePointerInfo PtrInfo
1923 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1924
1926 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1927 FrameInfo.getObjectAlign(FrameIndex));
1928
1929 if (RI.isSGPRClass(RC)) {
1930 MFI->setHasSpilledSGPRs();
1931 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1932 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1933 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1934
1935 // FIXME: Maybe this should not include a memoperand because it will be
1936 // lowered to non-memory instructions.
1937 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1938 if (DestReg.isVirtual() && SpillSize == 4) {
1939 MachineRegisterInfo &MRI = MF->getRegInfo();
1940 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1941 }
1942
1943 if (RI.spillSGPRToVGPR())
1944 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1945 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1946 .addFrameIndex(FrameIndex) // addr
1947 .addMemOperand(MMO)
1949
1950 return;
1951 }
1952
1953 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1954 SpillSize, *MFI);
1955 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1956 .addFrameIndex(FrameIndex) // vaddr
1957 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1958 .addImm(0) // offset
1959 .addMemOperand(MMO);
1960}
1961
1966
1969 unsigned Quantity) const {
1970 DebugLoc DL = MBB.findDebugLoc(MI);
1971 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1972 while (Quantity > 0) {
1973 unsigned Arg = std::min(Quantity, MaxSNopCount);
1974 Quantity -= Arg;
1975 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1976 }
1977}
1978
1980 auto *MF = MBB.getParent();
1981 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1982
1983 assert(Info->isEntryFunction());
1984
1985 if (MBB.succ_empty()) {
1986 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1987 if (HasNoTerminator) {
1988 if (Info->returnsVoid()) {
1989 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1990 } else {
1991 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1992 }
1993 }
1994 }
1995}
1996
2000 const DebugLoc &DL) const {
2001 MachineFunction *MF = MBB.getParent();
2002 constexpr unsigned DoorbellIDMask = 0x3ff;
2003 constexpr unsigned ECQueueWaveAbort = 0x400;
2004
2005 MachineBasicBlock *TrapBB = &MBB;
2006 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2007
2008 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2009 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2010 TrapBB = MF->CreateMachineBasicBlock();
2011 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2012 MF->push_back(TrapBB);
2013 MBB.addSuccessor(TrapBB);
2014 }
2015 // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
2016 // will be a nop.
2017 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2018 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2019 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2020 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2021 DoorbellReg)
2023 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2024 .addUse(AMDGPU::M0);
2025 Register DoorbellRegMasked =
2026 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2027 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2028 .addUse(DoorbellReg)
2029 .addImm(DoorbellIDMask);
2030 Register SetWaveAbortBit =
2031 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2032 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2033 .addUse(DoorbellRegMasked)
2034 .addImm(ECQueueWaveAbort);
2035 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2036 .addUse(SetWaveAbortBit);
2037 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2039 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2040 .addUse(AMDGPU::TTMP2);
2041 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2042 TrapBB->addSuccessor(HaltLoopBB);
2043
2044 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2045 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2046 .addMBB(HaltLoopBB);
2047 MF->push_back(HaltLoopBB);
2048 HaltLoopBB->addSuccessor(HaltLoopBB);
2049
2050 return MBB.getNextNode();
2051}
2052
2054 switch (MI.getOpcode()) {
2055 default:
2056 if (MI.isMetaInstruction())
2057 return 0;
2058 return 1; // FIXME: Do wait states equal cycles?
2059
2060 case AMDGPU::S_NOP:
2061 return MI.getOperand(0).getImm() + 1;
2062 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
 2063 // hazard, even if one exists, won't really be visible. Should we handle it?
2064 }
2065}
2066
2068 MachineBasicBlock &MBB = *MI.getParent();
2069 DebugLoc DL = MBB.findDebugLoc(MI);
2071 switch (MI.getOpcode()) {
2072 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2073 case AMDGPU::S_MOV_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_MOV_B64));
2077 break;
2078
2079 case AMDGPU::S_MOV_B32_term:
2080 // This is only a terminator to get the correct spill code placement during
2081 // register allocation.
2082 MI.setDesc(get(AMDGPU::S_MOV_B32));
2083 break;
2084
2085 case AMDGPU::S_XOR_B64_term:
2086 // This is only a terminator to get the correct spill code placement during
2087 // register allocation.
2088 MI.setDesc(get(AMDGPU::S_XOR_B64));
2089 break;
2090
2091 case AMDGPU::S_XOR_B32_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_XOR_B32));
2095 break;
2096 case AMDGPU::S_OR_B64_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_OR_B64));
2100 break;
2101 case AMDGPU::S_OR_B32_term:
2102 // This is only a terminator to get the correct spill code placement during
2103 // register allocation.
2104 MI.setDesc(get(AMDGPU::S_OR_B32));
2105 break;
2106
2107 case AMDGPU::S_ANDN2_B64_term:
2108 // This is only a terminator to get the correct spill code placement during
2109 // register allocation.
2110 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2111 break;
2112
2113 case AMDGPU::S_ANDN2_B32_term:
2114 // This is only a terminator to get the correct spill code placement during
2115 // register allocation.
2116 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2117 break;
2118
2119 case AMDGPU::S_AND_B64_term:
2120 // This is only a terminator to get the correct spill code placement during
2121 // register allocation.
2122 MI.setDesc(get(AMDGPU::S_AND_B64));
2123 break;
2124
2125 case AMDGPU::S_AND_B32_term:
2126 // This is only a terminator to get the correct spill code placement during
2127 // register allocation.
2128 MI.setDesc(get(AMDGPU::S_AND_B32));
2129 break;
2130
2131 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2132 // This is only a terminator to get the correct spill code placement during
2133 // register allocation.
2134 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2135 break;
2136
2137 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2138 // This is only a terminator to get the correct spill code placement during
2139 // register allocation.
2140 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2141 break;
2142
2143 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2144 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2145 break;
2146
2147 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2148 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2149 break;
2150 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2151 Register Dst = MI.getOperand(0).getReg();
2152 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2153 MI.setDesc(
2154 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2155 break;
2156 }
2157 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2158 Register Dst = MI.getOperand(0).getReg();
2159 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2160 int64_t Imm = MI.getOperand(1).getImm();
2161
2162 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2163 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2164 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2165 .addImm(SignExtend64<32>(Imm));
2166 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2167 .addImm(SignExtend64<32>(Imm >> 32));
2168 MI.eraseFromParent();
2169 break;
2170 }
2171
2172 [[fallthrough]];
2173 }
2174 case AMDGPU::V_MOV_B64_PSEUDO: {
2175 Register Dst = MI.getOperand(0).getReg();
2176 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2177 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2178
2179 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2180 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2181
2182 const MachineOperand &SrcOp = MI.getOperand(1);
2183 // FIXME: Will this work for 64-bit floating point immediates?
2184 assert(!SrcOp.isFPImm());
2185 if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
2186 MI.setDesc(Mov64Desc);
2187 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2188 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2189 break;
2190 }
2191 if (SrcOp.isImm()) {
2192 APInt Imm(64, SrcOp.getImm());
2193 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2194 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2195 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2196 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2197
2198 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2199 PkMovRC->contains(Dst)) {
2200 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2202 .addImm(Lo.getSExtValue())
2204 .addImm(Lo.getSExtValue())
2205 .addImm(0) // op_sel_lo
2206 .addImm(0) // op_sel_hi
2207 .addImm(0) // neg_lo
2208 .addImm(0) // neg_hi
2209 .addImm(0); // clamp
2210 } else {
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2212 .addImm(Lo.getSExtValue());
2213 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2214 .addImm(Hi.getSExtValue());
2215 }
2216 } else {
2217 assert(SrcOp.isReg());
2218 if (ST.hasPkMovB32() &&
2219 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2220 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2221 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2222 .addReg(SrcOp.getReg())
2224 .addReg(SrcOp.getReg())
2225 .addImm(0) // op_sel_lo
2226 .addImm(0) // op_sel_hi
2227 .addImm(0) // neg_lo
2228 .addImm(0) // neg_hi
2229 .addImm(0); // clamp
2230 } else {
2231 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2232 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
2233 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2234 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
2235 }
2236 }
2237 MI.eraseFromParent();
2238 break;
2239 }
2240 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2242 break;
2243 }
2244 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2245 const MachineOperand &SrcOp = MI.getOperand(1);
2246 assert(!SrcOp.isFPImm());
2247
2248 if (ST.has64BitLiterals()) {
2249 MI.setDesc(get(AMDGPU::S_MOV_B64));
2250 break;
2251 }
2252
2253 APInt Imm(64, SrcOp.getImm());
2254 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2255 MI.setDesc(get(AMDGPU::S_MOV_B64));
2256 break;
2257 }
2258
2259 Register Dst = MI.getOperand(0).getReg();
2260 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2261 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2262
2263 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2264 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2265 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2266 .addImm(Lo.getSExtValue());
2267 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2268 .addImm(Hi.getSExtValue());
2269 MI.eraseFromParent();
2270 break;
2271 }
2272 case AMDGPU::V_SET_INACTIVE_B32: {
2273 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2274 Register DstReg = MI.getOperand(0).getReg();
2275 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2276 .add(MI.getOperand(3))
2277 .add(MI.getOperand(4))
2278 .add(MI.getOperand(1))
2279 .add(MI.getOperand(2))
2280 .add(MI.getOperand(5));
2281 MI.eraseFromParent();
2282 break;
2283 }
2284 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2285 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2286 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2287 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2288 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2289 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2290 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2291 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2292 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2298 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2299 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2300 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2301 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2302 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2303 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2304 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2317 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2318
2319 unsigned Opc;
2320 if (RI.hasVGPRs(EltRC)) {
2321 Opc = AMDGPU::V_MOVRELD_B32_e32;
2322 } else {
2323 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2324 : AMDGPU::S_MOVRELD_B32;
2325 }
2326
2327 const MCInstrDesc &OpDesc = get(Opc);
2328 Register VecReg = MI.getOperand(0).getReg();
2329 bool IsUndef = MI.getOperand(1).isUndef();
2330 unsigned SubReg = MI.getOperand(3).getImm();
2331 assert(VecReg == MI.getOperand(1).getReg());
2332
2334 BuildMI(MBB, MI, DL, OpDesc)
2335 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2336 .add(MI.getOperand(2))
2338 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2339
2340 const int ImpDefIdx =
2341 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2342 const int ImpUseIdx = ImpDefIdx + 1;
2343 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2344 MI.eraseFromParent();
2345 break;
2346 }
2347 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2348 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2349 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2350 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2351 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2352 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2353 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2361 assert(ST.useVGPRIndexMode());
2362 Register VecReg = MI.getOperand(0).getReg();
2363 bool IsUndef = MI.getOperand(1).isUndef();
2364 MachineOperand &Idx = MI.getOperand(3);
2365 Register SubReg = MI.getOperand(4).getImm();
2366
2367 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2368 .add(Idx)
2370 SetOn->getOperand(3).setIsUndef();
2371
2372 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2374 BuildMI(MBB, MI, DL, OpDesc)
2375 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2376 .add(MI.getOperand(2))
2378 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2379
2380 const int ImpDefIdx =
2381 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2382 const int ImpUseIdx = ImpDefIdx + 1;
2383 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2384
2385 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2386
2387 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2388
2389 MI.eraseFromParent();
2390 break;
2391 }
2392 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2393 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2394 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2395 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2396 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2397 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2406 assert(ST.useVGPRIndexMode());
2407 Register Dst = MI.getOperand(0).getReg();
2408 Register VecReg = MI.getOperand(1).getReg();
2409 bool IsUndef = MI.getOperand(1).isUndef();
2410 Register SubReg = MI.getOperand(3).getImm();
2411
2412 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2413 .add(MI.getOperand(2))
2415 SetOn->getOperand(3).setIsUndef();
2416
2417 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2418 .addDef(Dst)
2419 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2420 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2421
2422 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2423
2424 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2425
2426 MI.eraseFromParent();
2427 break;
2428 }
2429 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2430 MachineFunction &MF = *MBB.getParent();
2431 Register Reg = MI.getOperand(0).getReg();
2432 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2433 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2434 MachineOperand OpLo = MI.getOperand(1);
2435 MachineOperand OpHi = MI.getOperand(2);
2436
2437 // Create a bundle so these instructions won't be re-ordered by the
2438 // post-RA scheduler.
2439 MIBundleBuilder Bundler(MBB, MI);
2440 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2441
2442 // What we want here is an offset from the value returned by s_getpc (which
2443 // is the address of the s_add_u32 instruction) to the global variable, but
2444 // since the encoding of $symbol starts 4 bytes after the start of the
2445 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2446 // small. This requires us to add 4 to the global variable offset in order
2447 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2448 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2449 // instruction.
2450
2451 int64_t Adjust = 0;
2452 if (ST.hasGetPCZeroExtension()) {
2453 // Fix up hardware that does not sign-extend the 48-bit PC value by
2454 // inserting: s_sext_i32_i16 reghi, reghi
2455 Bundler.append(
2456 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2457 Adjust += 4;
2458 }
2459
2460 if (OpLo.isGlobal())
2461 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2462 Bundler.append(
2463 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2464
2465 if (OpHi.isGlobal())
2466 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2467 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2468 .addReg(RegHi)
2469 .add(OpHi));
2470
2471 finalizeBundle(MBB, Bundler.begin());
2472
2473 MI.eraseFromParent();
2474 break;
2475 }
2476 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2477 MachineFunction &MF = *MBB.getParent();
2478 Register Reg = MI.getOperand(0).getReg();
2479 MachineOperand Op = MI.getOperand(1);
2480
2481 // Create a bundle so these instructions won't be re-ordered by the
2482 // post-RA scheduler.
2483 MIBundleBuilder Bundler(MBB, MI);
2484 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2485 if (Op.isGlobal())
2486 Op.setOffset(Op.getOffset() + 4);
2487 Bundler.append(
2488 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2489
2490 finalizeBundle(MBB, Bundler.begin());
2491
2492 MI.eraseFromParent();
2493 break;
2494 }
2495 case AMDGPU::ENTER_STRICT_WWM: {
2496 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2497 // Whole Wave Mode is entered.
2498 MI.setDesc(get(LMC.OrSaveExecOpc));
2499 break;
2500 }
2501 case AMDGPU::ENTER_STRICT_WQM: {
2502 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2503 // STRICT_WQM is entered.
2504 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2505 .addReg(LMC.ExecReg);
2506 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2507
2508 MI.eraseFromParent();
2509 break;
2510 }
2511 case AMDGPU::EXIT_STRICT_WWM:
2512 case AMDGPU::EXIT_STRICT_WQM: {
2513 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
 2514 // WWM/STRICT_WQM is exited.
2515 MI.setDesc(get(LMC.MovOpc));
2516 break;
2517 }
2518 case AMDGPU::SI_RETURN: {
2519 const MachineFunction *MF = MBB.getParent();
2520 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2521 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2522 // Hiding the return address use with SI_RETURN may lead to extra kills in
2523 // the function and missing live-ins. We are fine in practice because callee
2524 // saved register handling ensures the register value is restored before
2525 // RET, but we need the undef flag here to appease the MachineVerifier
2526 // liveness checks.
2528 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2529 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2530
2531 MIB.copyImplicitOps(MI);
2532 MI.eraseFromParent();
2533 break;
2534 }
2535
2536 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2537 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2538 MI.setDesc(get(AMDGPU::S_MUL_U64));
2539 break;
2540
2541 case AMDGPU::S_GETPC_B64_pseudo:
2542 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2543 if (ST.hasGetPCZeroExtension()) {
2544 Register Dst = MI.getOperand(0).getReg();
2545 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2546 // Fix up hardware that does not sign-extend the 48-bit PC value by
2547 // inserting: s_sext_i32_i16 dsthi, dsthi
2548 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2549 DstHi)
2550 .addReg(DstHi);
2551 }
2552 break;
2553
2554 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2555 assert(ST.hasBF16PackedInsts());
2556 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2557 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2558 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2559 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2560 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2561 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2562 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2563 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2564 break;
2565 }
2566
2567 case AMDGPU::GET_STACK_BASE:
2568 // The stack starts at offset 0 unless we need to reserve some space at the
2569 // bottom.
2570 if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2571 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2572 // some of the VGPRs. The size of the required scratch space has already
2573 // been computed by prolog epilog insertion.
2574 const SIMachineFunctionInfo *MFI =
2575 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2576 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2577 Register DestReg = MI.getOperand(0).getReg();
2578 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2581 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2582 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2583 // SCC, so we need to check for 0 manually.
2584 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
 2585 // Change the implicit-def of SCC to an explicit use (but first remove
2586 // the dead flag if present).
2587 MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2588 MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2589 MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2590 MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2591 } else {
2592 MI.setDesc(get(AMDGPU::S_MOV_B32));
2593 MI.addOperand(MachineOperand::CreateImm(0));
2594 MI.removeOperand(
2595 MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2596 }
2597 break;
2598 }
2599
2600 return true;
2601}
2602
2605 unsigned SubIdx, const MachineInstr &Orig,
2606 LaneBitmask UsedLanes) const {
2607
2608 // Try shrinking the instruction to remat only the part needed for current
2609 // context.
2610 // TODO: Handle more cases.
2611 unsigned Opcode = Orig.getOpcode();
2612 switch (Opcode) {
2613 case AMDGPU::S_MOV_B64:
2614 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2615 if (SubIdx != 0)
2616 break;
2617
2618 if (!Orig.getOperand(1).isImm())
2619 break;
2620
2621 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2622 // 32-bit lane of the 64-bit value is live at the rematerialization point.
2623 if (UsedLanes.all())
2624 break;
2625
2626 // Determine which half of the 64-bit immediate corresponds to the use.
2627 unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
2628 unsigned LoSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub0);
2629 unsigned HiSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub1);
2630
2631 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(LoSubReg)).any();
2632 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(HiSubReg)).any();
2633
2634 if (NeedLo && NeedHi)
2635 break;
2636
2637 int64_t Imm64 = Orig.getOperand(1).getImm();
2638 int32_t Imm32 = NeedLo ? Lo_32(Imm64) : Hi_32(Imm64);
2639
2640 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2641
2642 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2643 BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
2644 .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
2645 .addImm(Imm32);
2646 return;
2647 }
2648
2649 case AMDGPU::S_LOAD_DWORDX16_IMM:
2650 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2651 if (SubIdx != 0)
2652 break;
2653
2654 if (I == MBB.end())
2655 break;
2656
2657 if (I->isBundled())
2658 break;
2659
2660 // Look for a single use of the register that is also a subreg.
2661 Register RegToFind = Orig.getOperand(0).getReg();
2662 MachineOperand *UseMO = nullptr;
2663 for (auto &CandMO : I->operands()) {
2664 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2665 continue;
2666 if (UseMO) {
2667 UseMO = nullptr;
2668 break;
2669 }
2670 UseMO = &CandMO;
2671 }
2672 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2673 break;
2674
2675 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2676 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2677
2678 MachineFunction *MF = MBB.getParent();
2679 MachineRegisterInfo &MRI = MF->getRegInfo();
2680 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2681
2682 unsigned NewOpcode = -1;
2683 if (SubregSize == 256)
2684 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2685 else if (SubregSize == 128)
2686 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2687 else
2688 break;
2689
2690 const MCInstrDesc &TID = get(NewOpcode);
2691 const TargetRegisterClass *NewRC =
2692 RI.getAllocatableClass(getRegClass(TID, 0));
2693 MRI.setRegClass(DestReg, NewRC);
2694
2695 UseMO->setReg(DestReg);
2696 UseMO->setSubReg(AMDGPU::NoSubRegister);
2697
2698 // Use a smaller load with the desired size, possibly with updated offset.
2699 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2700 MI->setDesc(TID);
2701 MI->getOperand(0).setReg(DestReg);
2702 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2703 if (Offset) {
2704 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2705 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2706 OffsetMO->setImm(FinalOffset);
2707 }
2709 for (const MachineMemOperand *MemOp : Orig.memoperands())
2710 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2711 SubregSize / 8));
2712 MI->setMemRefs(*MF, NewMMOs);
2713
2714 MBB.insert(I, MI);
2715 return;
2716 }
2717
2718 default:
2719 break;
2720 }
2721
2722 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, UsedLanes);
2723}
2724
2725std::pair<MachineInstr*, MachineInstr*>
2727 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2728
2729 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2731 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2732 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2733 return std::pair(&MI, nullptr);
2734 }
2735
2736 MachineBasicBlock &MBB = *MI.getParent();
2737 DebugLoc DL = MBB.findDebugLoc(MI);
2738 MachineFunction *MF = MBB.getParent();
2739 MachineRegisterInfo &MRI = MF->getRegInfo();
2740 Register Dst = MI.getOperand(0).getReg();
2741 unsigned Part = 0;
2742 MachineInstr *Split[2];
2743
2744 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2745 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2746 if (Dst.isPhysical()) {
2747 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2748 } else {
2749 assert(MRI.isSSA());
2750 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2751 MovDPP.addDef(Tmp);
2752 }
2753
2754 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2755 const MachineOperand &SrcOp = MI.getOperand(I);
2756 assert(!SrcOp.isFPImm());
2757 if (SrcOp.isImm()) {
2758 APInt Imm(64, SrcOp.getImm());
2759 Imm.ashrInPlace(Part * 32);
2760 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2761 } else {
2762 assert(SrcOp.isReg());
2763 Register Src = SrcOp.getReg();
2764 if (Src.isPhysical())
2765 MovDPP.addReg(RI.getSubReg(Src, Sub));
2766 else
2767 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2768 }
2769 }
2770
2771 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2772 MovDPP.addImm(MO.getImm());
2773
2774 Split[Part] = MovDPP;
2775 ++Part;
2776 }
2777
2778 if (Dst.isVirtual())
2779 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2780 .addReg(Split[0]->getOperand(0).getReg())
2781 .addImm(AMDGPU::sub0)
2782 .addReg(Split[1]->getOperand(0).getReg())
2783 .addImm(AMDGPU::sub1);
2784
2785 MI.eraseFromParent();
2786 return std::pair(Split[0], Split[1]);
2787}
2788
2789std::optional<DestSourcePair>
2791 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2792 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2793
2794 return std::nullopt;
2795}
2796
2798 AMDGPU::OpName Src0OpName,
2799 MachineOperand &Src1,
2800 AMDGPU::OpName Src1OpName) const {
2801 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2802 if (!Src0Mods)
2803 return false;
2804
2805 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2806 assert(Src1Mods &&
2807 "All commutable instructions have both src0 and src1 modifiers");
2808
2809 int Src0ModsVal = Src0Mods->getImm();
2810 int Src1ModsVal = Src1Mods->getImm();
2811
2812 Src1Mods->setImm(Src0ModsVal);
2813 Src0Mods->setImm(Src1ModsVal);
2814 return true;
2815}
2816
                                          MachineOperand &RegOp,
                                          MachineOperand &NonRegOp) {
  // Commute a register operand with a non-register operand (immediate,
  // frame index, or global address) by rewriting both in place.
  // Returns &MI on success, nullptr when NonRegOp's kind is unsupported.
  // Save the register operand's state before it is overwritten.
  Register Reg = RegOp.getReg();
  unsigned SubReg = RegOp.getSubReg();
  bool IsKill = RegOp.isKill();
  bool IsDead = RegOp.isDead();
  bool IsUndef = RegOp.isUndef();
  bool IsDebug = RegOp.isDebug();

  if (NonRegOp.isImm())
    RegOp.ChangeToImmediate(NonRegOp.getImm());
  else if (NonRegOp.isFI())
    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
  else if (NonRegOp.isGlobal()) {
    RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
                     NonRegOp.getTargetFlags());
  } else
    return nullptr;

  // Make sure we don't reinterpret a subreg index in the target flags.
  RegOp.setTargetFlags(NonRegOp.getTargetFlags());

  // Turn the old non-register slot into the saved register.
  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
  NonRegOp.setSubReg(SubReg);

  return &MI;
}
2845
                                  MachineOperand &NonRegOp1,
                                  MachineOperand &NonRegOp2) {
  // Swap two immediate operands, exchanging their target flags along with
  // their values so each flag set stays attached to its original value.
  unsigned TargetFlags = NonRegOp1.getTargetFlags();
  int64_t NonRegVal = NonRegOp1.getImm();

  NonRegOp1.setImm(NonRegOp2.getImm());
  NonRegOp2.setImm(NonRegVal);
  NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
  NonRegOp2.setTargetFlags(TargetFlags);
  return &MI;
}
2858
2859bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2860 unsigned OpIdx1) const {
2861 const MCInstrDesc &InstDesc = MI.getDesc();
2862 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2863 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2864
2865 unsigned Opc = MI.getOpcode();
2866 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2867
2868 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2869 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2870
2871 // Swap doesn't breach constant bus or literal limits
2872 // It may move literal to position other than src0, this is not allowed
2873 // pre-gfx10 However, most test cases need literals in Src0 for VOP
2874 // FIXME: After gfx9, literal can be in place other than Src0
2875 if (isVALU(MI)) {
2876 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2877 !isInlineConstant(MO0, OpInfo1))
2878 return false;
2879 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2880 !isInlineConstant(MO1, OpInfo0))
2881 return false;
2882 }
2883
2884 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2885 if (OpInfo1.RegClass == -1)
2886 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2887 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2888 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2889 }
2890 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2891 if (OpInfo0.RegClass == -1)
2892 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2893 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2894 isLegalRegOperand(MI, OpIdx0, MO1);
2895 }
2896
2897 // No need to check 64-bit literals since swapping does not bring new
2898 // 64-bit literals into current instruction to fold to 32-bit
2899
2900 return isImmOperandLegal(MI, OpIdx1, MO0);
2901}
2902
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  // Commute MI's src0/src1 in place (the NewMI form is unsupported).
  // Handles reg/reg, reg/non-reg, and imm/imm operand pairs, then swaps the
  // associated source modifiers and installs the commuted opcode.
  // Returns nullptr when the opcode has no commuted form or the swap is
  // illegal.
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  // Normalize so Src0Idx is the smaller index.
  if (Src0Idx > Src1Idx)
    std::swap(Src0Idx, Src1Idx);

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
             static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
             static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
    return nullptr;

  MachineInstr *CommutedMI = nullptr;
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);
  if (Src0.isReg() && Src1.isReg()) {
    // Be sure to copy the source modifiers to the right place.
    CommutedMI =
        TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
  } else if (Src0.isReg() && !Src1.isReg()) {
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else if (Src0.isImm() && Src1.isImm()) {
    CommutedMI = swapImmOperands(MI, Src0, Src1);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    // Modifier and sel operands must travel with their source operands.
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
                        AMDGPU::OpName::src1_sel);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}
2955
// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  // Delegate to the MCInstrDesc-based overload below.
  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}
2964
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  // Locate the commutable operand pair (src0/src1) for a commutable
  // instruction description. Returns false when the description is not
  // commutable or either named operand is absent.
  if (!Desc.isCommutable())
    return false;

  unsigned Opc = Desc.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  // Reconcile the caller-provided indices with the discovered pair.
  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}
2982
                                        int64_t BrOffset) const {
  // Whether a branch of the given opcode can encode BrOffset (in bytes)
  // in its signed immediate field.
  // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
  // because its dest block is unanalyzable.
  assert(isSOPP(BranchOp) || isSOPK(BranchOp));

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  // BranchOffsetBits is the (test-overridable) encodable field width.
  return isIntN(BranchOffsetBits, BrOffset);
}
2998
  // The branch target is always operand 0 for the opcodes routed here.
  return MI.getOperand(0).getMBB();
}
3003
  // A block terminated by SI_IF/SI_ELSE/SI_LOOP carries not-yet-lowered
  // divergent structured control flow.
  for (const MachineInstr &MI : MBB->terminators()) {
    if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
        MI.getOpcode() == AMDGPU::SI_LOOP)
      return true;
  }
  return false;
}
3012
                                       MachineBasicBlock &DestBB,
                                       MachineBasicBlock &RestoreBB,
                                       const DebugLoc &DL, int64_t BrOffset,
                                       RegScavenger *RS) const {
  // Expand a branch whose offset exceeds the direct-branch range: either a
  // single S_ADD_PC_I64 on subtargets that have it, or an
  // S_GETPC/S_ADD/S_ADDC/S_SETPC sequence whose 64-bit offset is filled in
  // via MC symbol arithmetic.
  // NOTE(review): several statement-continuation lines in this function were
  // lost in this file's extraction (e.g. the operand added to AddPC, the
  // immediate on S_WAITCNT_DEPCTR, the MFI definition, and the
  // MCBinaryExpr::createSub defining Offset) — restore from upstream before
  // compiling.
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto I = MBB.end();
  auto &MCCtx = MF->getContext();

  if (ST.useAddPC64Inst()) {
    // Offset symbol resolves to (DestBB - label just after the add).
    MCSymbol *Offset =
        MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
    auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
    MCSymbol *PostAddPCLabel =
        MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
    AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
    auto *OffsetExpr = MCBinaryExpr::createSub(
        MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
        MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
    Offset->setVariableValue(OffsetExpr);
    return;
  }

  assert(RS && "RegScavenger required for long branching");

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // Note: as this is used after hazard recognizer we need to apply some hazard
  // workarounds directly.
  const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
                               ST.hasVALUReadSGPRHazard();
  auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
    if (FlushSGPRWrites)
      BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
  };

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
  ApplyHazardWorkarounds();

  MCSymbol *PostGetPCLabel =
      MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
  GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);

  MCSymbol *OffsetLo =
      MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
  MCSymbol *OffsetHi =
      MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
  // 64-bit PC-relative add: low half with carry-out, high half with carry-in.
  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, {}, AMDGPU::sub0)
      .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, {}, AMDGPU::sub1)
      .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
  ApplyHazardWorkarounds();

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
      .addReg(PCReg);

  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  // bar;
  // s_branch dest_bb
  //
  // restore_bb:
  //  restore s[8:9]
  //  fallthrough dest_bb
  //
  // dest_bb:
  //   buzz;

  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
  Register Scav;

  // If we've previously reserved a register for long branches
  // avoid running the scavenger and just use those registers
  if (LongBranchReservedReg) {
    RS->enterBasicBlock(MBB);
    Scav = LongBranchReservedReg;
  } else {
    RS->enterBasicBlockEnd(MBB);
    Scav = RS->scavengeRegisterBackwards(
        AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
        /* RestoreAfter */ false, 0, /* AllowSpill */ false);
  }
  if (Scav) {
    RS->setRegUsed(Scav);
    MRI.replaceRegWith(PCReg, Scav);
    MRI.clearVirtRegs();
  } else {
    // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
    // SGPR spill.
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
    MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
    MRI.clearVirtRegs();
  }

  // When spilling, the branch targets the restore block instead of DestBB.
  MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
  // Now, the distance could be defined.
      MCSymbolRefExpr::create(DestLabel, MCCtx),
      MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
  // Add offset assignments.
  auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
  OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
  auto *ShAmt = MCConstantExpr::create(32, MCCtx);
  OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
}
3157
3158unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3159 switch (Cond) {
3160 case SIInstrInfo::SCC_TRUE:
3161 return AMDGPU::S_CBRANCH_SCC1;
3162 case SIInstrInfo::SCC_FALSE:
3163 return AMDGPU::S_CBRANCH_SCC0;
3164 case SIInstrInfo::VCCNZ:
3165 return AMDGPU::S_CBRANCH_VCCNZ;
3166 case SIInstrInfo::VCCZ:
3167 return AMDGPU::S_CBRANCH_VCCZ;
3168 case SIInstrInfo::EXECNZ:
3169 return AMDGPU::S_CBRANCH_EXECNZ;
3170 case SIInstrInfo::EXECZ:
3171 return AMDGPU::S_CBRANCH_EXECZ;
3172 default:
3173 llvm_unreachable("invalid branch predicate");
3174 }
3175}
3176
3177SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3178 switch (Opcode) {
3179 case AMDGPU::S_CBRANCH_SCC0:
3180 return SCC_FALSE;
3181 case AMDGPU::S_CBRANCH_SCC1:
3182 return SCC_TRUE;
3183 case AMDGPU::S_CBRANCH_VCCNZ:
3184 return VCCNZ;
3185 case AMDGPU::S_CBRANCH_VCCZ:
3186 return VCCZ;
3187 case AMDGPU::S_CBRANCH_EXECNZ:
3188 return EXECNZ;
3189 case AMDGPU::S_CBRANCH_EXECZ:
3190 return EXECZ;
3191 default:
3192 return INVALID_BR;
3193 }
3194}
3195
                                            MachineBasicBlock *&FBB,
                                            bool AllowModify) const {
  // Decode the branch structure starting at terminator I. Recognizes an
  // unconditional S_BRANCH, a lone conditional branch (fall-through false
  // path), and a conditional + unconditional pair. Returns true when the
  // terminators cannot be analyzed.
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
  if (Pred == INVALID_BR)
    return true;

  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
  Cond.push_back(MachineOperand::CreateImm(Pred));
  Cond.push_back(I->getOperand(1)); // Save the branch register.

  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Conditional branch followed by an unconditional branch.
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}
3232
                                MachineBasicBlock *&FBB,
                                bool AllowModify) const {
  // TargetInstrInfo hook: skip the exec-mask-manipulation pseudo
  // terminators, then delegate the actual branch decoding to
  // analyzeBranchImpl. Returns true when the block is unanalyzable.
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  auto E = MBB.end();
  if (I == E)
    return false;

  // Skip over the instructions that are artificially terminators for special
  // exec management.
  while (I != E && !I->isBranch() && !I->isReturn()) {
    switch (I->getOpcode()) {
    case AMDGPU::S_MOV_B64_term:
    case AMDGPU::S_XOR_B64_term:
    case AMDGPU::S_OR_B64_term:
    case AMDGPU::S_ANDN2_B64_term:
    case AMDGPU::S_AND_B64_term:
    case AMDGPU::S_AND_SAVEEXEC_B64_term:
    case AMDGPU::S_MOV_B32_term:
    case AMDGPU::S_XOR_B32_term:
    case AMDGPU::S_OR_B32_term:
    case AMDGPU::S_ANDN2_B32_term:
    case AMDGPU::S_AND_B32_term:
    case AMDGPU::S_AND_SAVEEXEC_B32_term:
      break;
    case AMDGPU::SI_IF:
    case AMDGPU::SI_ELSE:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      // FIXME: It's messy that these need to be considered here at all.
      return true;
    default:
      llvm_unreachable("unexpected non-branch terminator inst");
    }

    ++I;
  }

  // Only artificial terminators: treated as no branch (fall-through).
  if (I == E)
    return false;

  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
}
3277
                                   int *BytesRemoved) const {
  // Erase all real branch/return terminators from MBB, leaving the
  // artificial exec-management terminators in place. Returns the number of
  // instructions removed and optionally their total size in bytes.
  unsigned Count = 0;
  unsigned RemovedSize = 0;
  for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
    // Skip over artificial terminators when removing instructions.
    if (MI.isBranch() || MI.isReturn()) {
      RemovedSize += getInstSizeInBytes(MI);
      MI.eraseFromParent();
      ++Count;
    }
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}
3296
// Copy the flags onto the implicit condition register operand.
                                 const MachineOperand &OrigCond) {
  // Mirror undef/kill state so liveness stays accurate after rewriting.
  CondReg.setIsUndef(OrigCond.isUndef());
  CondReg.setIsKill(OrigCond.isKill());
}
3303
                                   MachineBasicBlock *FBB,
                                   const DebugLoc &DL,
                                   int *BytesAdded) const {
  // Insert branches per the TargetInstrInfo contract: unconditional (no
  // FBB, empty Cond), conditional-only, or conditional + unconditional
  // pair. Returns the number of instructions added; byte sizes double when
  // ST.hasOffset3fBug() is set.
  if (!FBB && Cond.empty()) {
    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
      .addMBB(TBB);
    if (BytesAdded)
      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
    return 1;
  }

  assert(TBB && Cond[0].isImm());

  unsigned Opcode
    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));

  if (!FBB) {
    // Conditional branch with fall-through false path.
    MachineInstr *CondBr =
      BuildMI(&MBB, DL, get(Opcode))
      .addMBB(TBB);

    // Copy the flags onto the implicit condition register operand.
    preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
    fixImplicitOperands(*CondBr);

    if (BytesAdded)
      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
    return 1;
  }

  assert(TBB && FBB);

  MachineInstr *CondBr =
    BuildMI(&MBB, DL, get(Opcode))
    .addMBB(TBB);
  fixImplicitOperands(*CondBr);
  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
    .addMBB(FBB);

  // Propagate undef/kill onto the implicit condition register operand.
  MachineOperand &CondReg = CondBr->getOperand(1);
  CondReg.setIsUndef(Cond[1].isUndef());
  CondReg.setIsKill(Cond[1].isKill());

  if (BytesAdded)
    *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;

  return 2;
}
3355
  // Branch predicates are encoded as signed immediates whose inverse is the
  // negated value (see the -Pred usage in insertSelect), so negating the
  // immediate reverses the condition. Returns false on success per the
  // TargetInstrInfo contract.
  if (Cond.size() != 2) {
    return true;
  }

  if (Cond[0].isImm()) {
    Cond[0].setImm(-Cond[0].getImm());
    return false;
  }

  return true;
}
3369
                                   Register DstReg, Register TrueReg,
                                   Register FalseReg, int &CondCycles,
                                   int &TrueCycles, int &FalseCycles) const {
  // Report whether a branchless select between TrueReg/FalseReg is
  // profitable for the given predicate, and estimate its cost in cycles.
  switch (Cond[0].getImm()) {
  case VCCNZ:
  case VCCZ: {
    // VCC-based select lowers to per-32-bit-lane v_cndmask_b32.
    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    if (MRI.getRegClass(FalseReg) != RC)
      return false;

    int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???

    // Limit to equal cost for branch vs. N v_cndmask_b32s.
    return RI.hasVGPRs(RC) && NumInsts <= 6;
  }
  case SCC_TRUE:
  case SCC_FALSE: {
    // FIXME: We could insert for VGPRs if we could replace the original compare
    // with a vector one.
    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    if (MRI.getRegClass(FalseReg) != RC)
      return false;

    int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;

    // Multiples of 8 can do s_cselect_b64
    if (NumInsts % 2 == 0)
      NumInsts /= 2;

    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
    return RI.isSGPRClass(RC);
  }
  default:
    return false;
  }
}
3411
                               Register TrueReg, Register FalseReg) const {
  // Emit a (possibly multi-instruction) select of TrueReg/FalseReg into
  // DstReg according to the branch predicate carried in Cond.
  // NOTE(review): a few declaration lines (the `Select` temporaries and the
  // REG_SEQUENCE builder) were lost in this file's extraction — restore
  // from upstream before compiling.
  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
  if (Pred == VCCZ || Pred == SCC_FALSE) {
    // Canonicalize to the "true" predicate by negating it and swapping the
    // two value operands.
    Pred = static_cast<BranchPredicate>(-Pred);
    std::swap(TrueReg, FalseReg);
  }

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
  unsigned DstSize = RI.getRegSizeInBits(*DstRC);

  if (DstSize == 32) {
    if (Pred == SCC_TRUE) {
      Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
        .addReg(TrueReg)
        .addReg(FalseReg);
    } else {
      // Instruction's operands are backwards from what is expected.
      Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
        .addReg(FalseReg)
        .addReg(TrueReg);
    }

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  if (DstSize == 64 && Pred == SCC_TRUE) {
    BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  // Wider selects are split into 32- or 64-bit pieces reassembled with a
  // REG_SEQUENCE, using the subregister index tables below.
  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
  const int16_t *SubIndices = Sub0_15;
  int NElts = DstSize / 32;

  // 64-bit select is only available for SALU.
  // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
  if (Pred == SCC_TRUE) {
    if (NElts % 2) {
      SelOp = AMDGPU::S_CSELECT_B32;
      EltRC = &AMDGPU::SGPR_32RegClass;
    } else {
      SelOp = AMDGPU::S_CSELECT_B64;
      EltRC = &AMDGPU::SGPR_64RegClass;
      SubIndices = Sub0_15_64;
      NElts /= 2;
    }
  }

    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);

  I = MIB->getIterator();

  for (int Idx = 0; Idx != NElts; ++Idx) {
    Register DstElt = MRI.createVirtualRegister(EltRC);
    Regs.push_back(DstElt);

    unsigned SubIdx = SubIndices[Idx];

    if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
      // v_cndmask operands are (false, true) — reversed from s_cselect.
      Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
        .addReg(FalseReg, {}, SubIdx)
        .addReg(TrueReg, {}, SubIdx);
    } else {
      Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
        .addReg(TrueReg, {}, SubIdx)
        .addReg(FalseReg, {}, SubIdx);
    }

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

    MIB.addReg(DstElt)
       .addImm(SubIdx);
  }
}
3516
  // True for mov-like instructions (register/immediate moves, copies, and
  // AGPR moves) whose source is a candidate for folding into users.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B16_t16_e32:
  case AMDGPU::V_MOV_B16_t16_e64:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
  case AMDGPU::COPY:
  case AMDGPU::WWM_COPY:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
  case AMDGPU::V_ACCVGPR_READ_B32_e64:
  case AMDGPU::V_ACCVGPR_MOV_B32:
  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
    return true;
  default:
    return false;
  }
}
3541
  // Returns 2 for the true16 B16 movs and 1 for every other foldable copy —
  // presumably the operand index of the copy's source (the B16 forms carry
  // an extra operand before it); TODO confirm, the signature line was lost
  // in this file's extraction. Unreachable for non-foldable copies.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B16_t16_e32:
  case AMDGPU::V_MOV_B16_t16_e64:
    return 2;
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
  case AMDGPU::COPY:
  case AMDGPU::WWM_COPY:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
  case AMDGPU::V_ACCVGPR_READ_B32_e64:
  case AMDGPU::V_ACCVGPR_MOV_B32:
  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
    return 1;
  default:
    llvm_unreachable("MI is not a foldable copy");
  }
}
3567
// Named modifier/auxiliary operands stripped by the removal helper below
// when an instruction is rewritten to a form that does not carry them.
static constexpr AMDGPU::OpName ModifierOpNames[] = {
    AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
    AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
    AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3572
  // Drop every modifier operand present on MI. Iterated in reverse —
  // presumably so that removing one operand does not shift the indices of
  // entries not yet processed.
  unsigned Opc = MI.getOpcode();
  for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
    int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
    if (Idx >= 0)
      MI.removeOperand(Idx);
  }
}
3581
                                        const MCInstrDesc &NewDesc) const {
  // Switch MI over to NewDesc and drop any trailing operands beyond what
  // the new description (explicit + implicit) declares.
  MI.setDesc(NewDesc);

  // Remove any leftover implicit operands from mutating the instruction. e.g.
  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
  // anymore.
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  // Walk backwards so removal does not disturb lower operand indices.
  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}
3596
3597std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3598 unsigned SubRegIndex) {
3599 switch (SubRegIndex) {
3600 case AMDGPU::NoSubRegister:
3601 return Imm;
3602 case AMDGPU::sub0:
3603 return SignExtend64<32>(Imm);
3604 case AMDGPU::sub1:
3605 return SignExtend64<32>(Imm >> 32);
3606 case AMDGPU::lo16:
3607 return SignExtend64<16>(Imm);
3608 case AMDGPU::hi16:
3609 return SignExtend64<16>(Imm >> 16);
3610 case AMDGPU::sub1_lo16:
3611 return SignExtend64<16>(Imm >> 32);
3612 case AMDGPU::sub1_hi16:
3613 return SignExtend64<16>(Imm >> 48);
3614 default:
3615 return std::nullopt;
3616 }
3617
3618 llvm_unreachable("covered subregister switch");
3619}
3620
3621static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3622 switch (Opc) {
3623 case AMDGPU::V_MAC_F16_e32:
3624 case AMDGPU::V_MAC_F16_e64:
3625 case AMDGPU::V_MAD_F16_e64:
3626 return AMDGPU::V_MADAK_F16;
3627 case AMDGPU::V_MAC_F32_e32:
3628 case AMDGPU::V_MAC_F32_e64:
3629 case AMDGPU::V_MAD_F32_e64:
3630 return AMDGPU::V_MADAK_F32;
3631 case AMDGPU::V_FMAC_F32_e32:
3632 case AMDGPU::V_FMAC_F32_e64:
3633 case AMDGPU::V_FMA_F32_e64:
3634 return AMDGPU::V_FMAAK_F32;
3635 case AMDGPU::V_FMAC_F16_e32:
3636 case AMDGPU::V_FMAC_F16_e64:
3637 case AMDGPU::V_FMAC_F16_t16_e64:
3638 case AMDGPU::V_FMAC_F16_fake16_e64:
3639 case AMDGPU::V_FMAC_F16_t16_e32:
3640 case AMDGPU::V_FMAC_F16_fake16_e32:
3641 case AMDGPU::V_FMA_F16_e64:
3642 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3643 ? AMDGPU::V_FMAAK_F16_t16
3644 : AMDGPU::V_FMAAK_F16_fake16
3645 : AMDGPU::V_FMAAK_F16;
3646 case AMDGPU::V_FMAC_F64_e32:
3647 case AMDGPU::V_FMAC_F64_e64:
3648 case AMDGPU::V_FMA_F64_e64:
3649 return AMDGPU::V_FMAAK_F64;
3650 default:
3651 llvm_unreachable("invalid instruction");
3652 }
3653}
3654
3655static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3656 switch (Opc) {
3657 case AMDGPU::V_MAC_F16_e32:
3658 case AMDGPU::V_MAC_F16_e64:
3659 case AMDGPU::V_MAD_F16_e64:
3660 return AMDGPU::V_MADMK_F16;
3661 case AMDGPU::V_MAC_F32_e32:
3662 case AMDGPU::V_MAC_F32_e64:
3663 case AMDGPU::V_MAD_F32_e64:
3664 return AMDGPU::V_MADMK_F32;
3665 case AMDGPU::V_FMAC_F32_e32:
3666 case AMDGPU::V_FMAC_F32_e64:
3667 case AMDGPU::V_FMA_F32_e64:
3668 return AMDGPU::V_FMAMK_F32;
3669 case AMDGPU::V_FMAC_F16_e32:
3670 case AMDGPU::V_FMAC_F16_e64:
3671 case AMDGPU::V_FMAC_F16_t16_e64:
3672 case AMDGPU::V_FMAC_F16_fake16_e64:
3673 case AMDGPU::V_FMAC_F16_t16_e32:
3674 case AMDGPU::V_FMAC_F16_fake16_e32:
3675 case AMDGPU::V_FMA_F16_e64:
3676 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3677 ? AMDGPU::V_FMAMK_F16_t16
3678 : AMDGPU::V_FMAMK_F16_fake16
3679 : AMDGPU::V_FMAMK_F16;
3680 case AMDGPU::V_FMAC_F64_e32:
3681 case AMDGPU::V_FMAC_F64_e64:
3682 case AMDGPU::V_FMA_F64_e64:
3683 return AMDGPU::V_FMAMK_F64;
3684 default:
3685 llvm_unreachable("invalid instruction");
3686 }
3687}
3688
3690 Register Reg, MachineRegisterInfo *MRI) const {
3691 int64_t Imm;
3692 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3693 return false;
3694
3695 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3696
3697 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3698
3699 unsigned Opc = UseMI.getOpcode();
3700 if (Opc == AMDGPU::COPY) {
3701 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3702
3703 Register DstReg = UseMI.getOperand(0).getReg();
3704 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3705
3706 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3707
3708 if (HasMultipleUses) {
3709 // TODO: This should fold in more cases with multiple use, but we need to
3710 // more carefully consider what those uses are.
3711 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3712
3713 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3714 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3715 return false;
3716
3717 // Most of the time folding a 32-bit inline constant is free (though this
3718 // might not be true if we can't later fold it into a real user).
3719 //
3720 // FIXME: This isInlineConstant check is imprecise if
3721 // getConstValDefinedInReg handled the tricky non-mov cases.
3722 if (ImmDefSize == 32 &&
3724 return false;
3725 }
3726
3727 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3728 RI.getSubRegIdxSize(UseSubReg) == 16;
3729
3730 if (Is16Bit) {
3731 if (RI.hasVGPRs(DstRC))
3732 return false; // Do not clobber vgpr_hi16
3733
3734 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3735 return false;
3736 }
3737
3738 MachineFunction *MF = UseMI.getMF();
3739
3740 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3741 MCRegister MovDstPhysReg =
3742 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3743
3744 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3745
3746 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3747 for (unsigned MovOp :
3748 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3749 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3750 const MCInstrDesc &MovDesc = get(MovOp);
3751
3752 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3753 if (Is16Bit) {
3754 // We just need to find a correctly sized register class, so the
3755 // subregister index compatibility doesn't matter since we're statically
3756 // extracting the immediate value.
3757 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3758 if (!MovDstRC)
3759 continue;
3760
3761 if (MovDstPhysReg) {
3762 // FIXME: We probably should not do this. If there is a live value in
3763 // the high half of the register, it will be corrupted.
3764 MovDstPhysReg =
3765 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3766 if (!MovDstPhysReg)
3767 continue;
3768 }
3769 }
3770
3771 // Result class isn't the right size, try the next instruction.
3772 if (MovDstPhysReg) {
3773 if (!MovDstRC->contains(MovDstPhysReg))
3774 return false;
3775 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3776 // TODO: This will be overly conservative in the case of 16-bit virtual
3777 // SGPRs. We could hack up the virtual register uses to use a compatible
3778 // 32-bit class.
3779 continue;
3780 }
3781
3782 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3783
3784 // Ensure the interpreted immediate value is a valid operand in the new
3785 // mov.
3786 //
3787 // FIXME: isImmOperandLegal should have form that doesn't require existing
3788 // MachineInstr or MachineOperand
3789 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3790 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3791 break;
3792
3793 NewOpc = MovOp;
3794 break;
3795 }
3796
3797 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3798 return false;
3799
3800 if (Is16Bit) {
3801 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3802 if (MovDstPhysReg)
3803 UseMI.getOperand(0).setReg(MovDstPhysReg);
3804 assert(UseMI.getOperand(1).getReg().isVirtual());
3805 }
3806
3807 const MCInstrDesc &NewMCID = get(NewOpc);
3808 UseMI.setDesc(NewMCID);
3809 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3810 UseMI.addImplicitDefUseOperands(*MF);
3811 return true;
3812 }
3813
3814 if (HasMultipleUses)
3815 return false;
3816
3817 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3818 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3819 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3820 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3821 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3822 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3823 Opc == AMDGPU::V_FMAC_F64_e64) {
3824 // Don't fold if we are using source or output modifiers. The new VOP2
3825 // instructions don't have them.
3827 return false;
3828
3829 // If this is a free constant, there's no reason to do this.
3830 // TODO: We could fold this here instead of letting SIFoldOperands do it
3831 // later.
3832 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3833
3834 // Any src operand can be used for the legality check.
3835 if (isInlineConstant(UseMI, Src0Idx, Imm))
3836 return false;
3837
3838 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3839
3840 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3841 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3842
3843 auto CopyRegOperandToNarrowerRC =
3844 [MRI, this](MachineInstr &MI, unsigned OpNo,
3845 const TargetRegisterClass *NewRC) -> void {
3846 if (!MI.getOperand(OpNo).isReg())
3847 return;
3848 Register Reg = MI.getOperand(OpNo).getReg();
3849 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3850 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3851 return;
3852 Register Tmp = MRI->createVirtualRegister(NewRC);
3853 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3854 get(AMDGPU::COPY), Tmp)
3855 .addReg(Reg);
3856 MI.getOperand(OpNo).setReg(Tmp);
3857 MI.getOperand(OpNo).setIsKill();
3858 };
3859
3860 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3861 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3862 (Src1->isReg() && Src1->getReg() == Reg)) {
3863 MachineOperand *RegSrc =
3864 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3865 if (!RegSrc->isReg())
3866 return false;
3867 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3868 ST.getConstantBusLimit(Opc) < 2)
3869 return false;
3870
3871 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3872 return false;
3873
3874 // If src2 is also a literal constant then we have to choose which one to
3875 // fold. In general it is better to choose madak so that the other literal
3876 // can be materialized in an sgpr instead of a vgpr:
3877 // s_mov_b32 s0, literal
3878 // v_madak_f32 v0, s0, v0, literal
3879 // Instead of:
3880 // v_mov_b32 v1, literal
3881 // v_madmk_f32 v0, v0, literal, v1
3882 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3883 if (Def && Def->isMoveImmediate() &&
3884 !isInlineConstant(Def->getOperand(1)))
3885 return false;
3886
3887 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3888 if (pseudoToMCOpcode(NewOpc) == -1)
3889 return false;
3890
3891 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3892 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3893
3894 // FIXME: This would be a lot easier if we could return a new instruction
3895 // instead of having to modify in place.
3896
3897 Register SrcReg = RegSrc->getReg();
3898 unsigned SrcSubReg = RegSrc->getSubReg();
3899 Src0->setReg(SrcReg);
3900 Src0->setSubReg(SrcSubReg);
3901 Src0->setIsKill(RegSrc->isKill());
3902
3903 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3904 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3905 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3906 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3907 UseMI.untieRegOperand(
3908 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3909
3910 Src1->ChangeToImmediate(*SubRegImm);
3911
3913 UseMI.setDesc(get(NewOpc));
3914
3915 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3916 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3917 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3918 Register Tmp = MRI->createVirtualRegister(NewRC);
3919 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3920 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3921 UseMI.getOperand(0).getReg())
3922 .addReg(Tmp, RegState::Kill);
3923 UseMI.getOperand(0).setReg(Tmp);
3924 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3925 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3926 }
3927
3928 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3929 if (DeleteDef)
3930 DefMI.eraseFromParent();
3931
3932 return true;
3933 }
3934
3935 // Added part is the constant: Use v_madak_{f16, f32}.
3936 if (Src2->isReg() && Src2->getReg() == Reg) {
3937 if (ST.getConstantBusLimit(Opc) < 2) {
3938 // Not allowed to use constant bus for another operand.
3939 // We can however allow an inline immediate as src0.
3940 bool Src0Inlined = false;
3941 if (Src0->isReg()) {
3942 // Try to inline constant if possible.
3943 // If the Def moves immediate and the use is single
3944 // We are saving VGPR here.
3945 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3946 if (Def && Def->isMoveImmediate() &&
3947 isInlineConstant(Def->getOperand(1)) &&
3948 MRI->hasOneNonDBGUse(Src0->getReg())) {
3949 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3950 Src0Inlined = true;
3951 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3952 RI.isSGPRReg(*MRI, Src0->getReg())) {
3953 return false;
3954 }
3955 // VGPR is okay as Src0 - fallthrough
3956 }
3957
3958 if (Src1->isReg() && !Src0Inlined) {
3959 // We have one slot for inlinable constant so far - try to fill it
3960 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3961 if (Def && Def->isMoveImmediate() &&
3962 isInlineConstant(Def->getOperand(1)) &&
3963 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3964 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3965 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3966 return false;
3967 // VGPR is okay as Src1 - fallthrough
3968 }
3969 }
3970
3971 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3972 if (pseudoToMCOpcode(NewOpc) == -1)
3973 return false;
3974
3975 // FIXME: This would be a lot easier if we could return a new instruction
3976 // instead of having to modify in place.
3977
3978 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3979 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3980 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3981 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3982 UseMI.untieRegOperand(
3983 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3984
3985 const std::optional<int64_t> SubRegImm =
3986 extractSubregFromImm(Imm, Src2->getSubReg());
3987
3988 // ChangingToImmediate adds Src2 back to the instruction.
3989 Src2->ChangeToImmediate(*SubRegImm);
3990
3991 // These come before src2.
3993 UseMI.setDesc(get(NewOpc));
3994
3995 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3996 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3997 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3998 Register Tmp = MRI->createVirtualRegister(NewRC);
3999 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
4000 UseMI.getDebugLoc(), get(AMDGPU::COPY),
4001 UseMI.getOperand(0).getReg())
4002 .addReg(Tmp, RegState::Kill);
4003 UseMI.getOperand(0).setReg(Tmp);
4004 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
4005 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
4006 }
4007
4008 // It might happen that UseMI was commuted
4009 // and we now have SGPR as SRC1. If so 2 inlined
4010 // constant and SGPR are illegal.
4012
4013 bool DeleteDef = MRI->use_nodbg_empty(Reg);
4014 if (DeleteDef)
4015 DefMI.eraseFromParent();
4016
4017 return true;
4018 }
4019 }
4020
4021 return false;
4022}
4023
4024static bool
4027 if (BaseOps1.size() != BaseOps2.size())
4028 return false;
4029 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
4030 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
4031 return false;
4032 }
4033 return true;
4034}
4035
4036static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4037 LocationSize WidthB, int OffsetB) {
4038 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4039 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4040 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4041 return LowWidth.hasValue() &&
4042 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4043}
4044
4045bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4046 const MachineInstr &MIb) const {
4047 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4048 int64_t Offset0, Offset1;
4049 LocationSize Dummy0 = LocationSize::precise(0);
4050 LocationSize Dummy1 = LocationSize::precise(0);
4051 bool Offset0IsScalable, Offset1IsScalable;
4052 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
4053 Dummy0, &RI) ||
4054 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
4055 Dummy1, &RI))
4056 return false;
4057
4058 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
4059 return false;
4060
4061 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4062 // FIXME: Handle ds_read2 / ds_write2.
4063 return false;
4064 }
4065 LocationSize Width0 = MIa.memoperands().front()->getSize();
4066 LocationSize Width1 = MIb.memoperands().front()->getSize();
4067 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4068}
4069
4071 const MachineInstr &MIb) const {
4072 assert(MIa.mayLoadOrStore() &&
4073 "MIa must load from or modify a memory location");
4074 assert(MIb.mayLoadOrStore() &&
4075 "MIb must load from or modify a memory location");
4076
4078 return false;
4079
4080 // XXX - Can we relax this between address spaces?
4081 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4082 return false;
4083
4084 if (isLDSDMA(MIa) || isLDSDMA(MIb))
4085 return false;
4086
4087 if (MIa.isBundle() || MIb.isBundle())
4088 return false;
4089
4090 // TODO: Should we check the address space from the MachineMemOperand? That
4091 // would allow us to distinguish objects we know don't alias based on the
4092 // underlying address space, even if it was lowered to a different one,
4093 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4094 // buffer.
4095 if (isDS(MIa)) {
4096 if (isDS(MIb))
4097 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4098
4099 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4100 }
4101
4102 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4103 if (isMUBUF(MIb) || isMTBUF(MIb))
4104 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4105
4106 if (isFLAT(MIb))
4107 return isFLATScratch(MIb);
4108
4109 return !isSMRD(MIb);
4110 }
4111
4112 if (isSMRD(MIa)) {
4113 if (isSMRD(MIb))
4114 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4115
4116 if (isFLAT(MIb))
4117 return isFLATScratch(MIb);
4118
4119 return !isMUBUF(MIb) && !isMTBUF(MIb);
4120 }
4121
4122 if (isFLAT(MIa)) {
4123 if (isFLAT(MIb)) {
4124 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4125 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4126 return true;
4127
4128 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4129 }
4130
4131 return false;
4132 }
4133
4134 return false;
4135}
4136
4138 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4139 if (Reg.isPhysical())
4140 return false;
4141 auto *Def = MRI.getUniqueVRegDef(Reg);
4142 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4143 Imm = Def->getOperand(1).getImm();
4144 if (DefMI)
4145 *DefMI = Def;
4146 return true;
4147 }
4148 return false;
4149}
4150
4151static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4152 MachineInstr **DefMI = nullptr) {
4153 if (!MO->isReg())
4154 return false;
4155 const MachineFunction *MF = MO->getParent()->getMF();
4156 const MachineRegisterInfo &MRI = MF->getRegInfo();
4157 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4158}
4159
4161 MachineInstr &NewMI) {
4162 if (LV) {
4163 unsigned NumOps = MI.getNumOperands();
4164 for (unsigned I = 1; I < NumOps; ++I) {
4165 MachineOperand &Op = MI.getOperand(I);
4166 if (Op.isReg() && Op.isKill())
4167 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4168 }
4169 }
4170}
4171
4172static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4173 switch (Opc) {
4174 case AMDGPU::V_MAC_F16_e32:
4175 case AMDGPU::V_MAC_F16_e64:
4176 return AMDGPU::V_MAD_F16_e64;
4177 case AMDGPU::V_MAC_F32_e32:
4178 case AMDGPU::V_MAC_F32_e64:
4179 return AMDGPU::V_MAD_F32_e64;
4180 case AMDGPU::V_MAC_LEGACY_F32_e32:
4181 case AMDGPU::V_MAC_LEGACY_F32_e64:
4182 return AMDGPU::V_MAD_LEGACY_F32_e64;
4183 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4184 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4185 return AMDGPU::V_FMA_LEGACY_F32_e64;
4186 case AMDGPU::V_FMAC_F16_e32:
4187 case AMDGPU::V_FMAC_F16_e64:
4188 case AMDGPU::V_FMAC_F16_t16_e64:
4189 case AMDGPU::V_FMAC_F16_fake16_e64:
4190 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4191 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4192 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4193 : AMDGPU::V_FMA_F16_gfx9_e64;
4194 case AMDGPU::V_FMAC_F32_e32:
4195 case AMDGPU::V_FMAC_F32_e64:
4196 return AMDGPU::V_FMA_F32_e64;
4197 case AMDGPU::V_FMAC_F64_e32:
4198 case AMDGPU::V_FMAC_F64_e64:
4199 return AMDGPU::V_FMA_F64_e64;
4200 default:
4201 llvm_unreachable("invalid instruction");
4202 }
4203}
4204
4205/// Helper struct for the implementation of 3-address conversion to communicate
4206/// updates made to instruction operands.
4208 /// Other instruction whose def is no longer used by the converted
4209 /// instruction.
4211};
4212
4214 LiveVariables *LV,
4215 LiveIntervals *LIS) const {
4216 MachineBasicBlock &MBB = *MI.getParent();
4217 MachineInstr *CandidateMI = &MI;
4218
4219 if (MI.isBundle()) {
4220 // This is a temporary placeholder for bundle handling that enables us to
4221 // exercise the relevant code paths in the two-address instruction pass.
4222 if (MI.getBundleSize() != 1)
4223 return nullptr;
4224 CandidateMI = MI.getNextNode();
4225 }
4226
4228 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4229 if (!NewMI)
4230 return nullptr;
4231
4232 if (MI.isBundle()) {
4233 CandidateMI->eraseFromBundle();
4234
4235 for (MachineOperand &MO : MI.all_defs()) {
4236 if (MO.isTied())
4237 MI.untieRegOperand(MO.getOperandNo());
4238 }
4239 } else {
4240 updateLiveVariables(LV, MI, *NewMI);
4241 if (LIS) {
4242 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4243 // SlotIndex of defs needs to be updated when converting to early-clobber
4244 MachineOperand &Def = NewMI->getOperand(0);
4245 if (Def.isEarlyClobber() && Def.isReg() &&
4246 LIS->hasInterval(Def.getReg())) {
4247 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4248 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4249 auto &LI = LIS->getInterval(Def.getReg());
4250 auto UpdateDefIndex = [&](LiveRange &LR) {
4251 auto *S = LR.find(OldIndex);
4252 if (S != LR.end() && S->start == OldIndex) {
4253 assert(S->valno && S->valno->def == OldIndex);
4254 S->start = NewIndex;
4255 S->valno->def = NewIndex;
4256 }
4257 };
4258 UpdateDefIndex(LI);
4259 for (auto &SR : LI.subranges())
4260 UpdateDefIndex(SR);
4261 }
4262 }
4263 }
4264
4265 if (U.RemoveMIUse) {
4266 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4267 // The only user is the instruction which will be killed.
4268 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4269
4270 if (MRI.hasOneNonDBGUse(DefReg)) {
4271 // We cannot just remove the DefMI here, calling pass will crash.
4272 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4273 U.RemoveMIUse->getOperand(0).setIsDead(true);
4274 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4275 U.RemoveMIUse->removeOperand(I);
4276 if (LV)
4277 LV->getVarInfo(DefReg).AliveBlocks.clear();
4278 }
4279
4280 if (MI.isBundle()) {
4281 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4282 if (!VRI.Reads && !VRI.Writes) {
4283 for (MachineOperand &MO : MI.all_uses()) {
4284 if (MO.isReg() && MO.getReg() == DefReg) {
4285 assert(MO.getSubReg() == 0 &&
4286 "tied sub-registers in bundles currently not supported");
4287 MI.removeOperand(MO.getOperandNo());
4288 break;
4289 }
4290 }
4291
4292 if (LIS)
4293 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4294 }
4295 } else if (LIS) {
4296 LiveInterval &DefLI = LIS->getInterval(DefReg);
4297
4298 // We cannot delete the original instruction here, so hack out the use
4299 // in the original instruction with a dummy register so we can use
4300 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4301 // not have the complexity of deleting a use to consider here.
4302 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4303 for (MachineOperand &MIOp : MI.uses()) {
4304 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4305 MIOp.setIsUndef(true);
4306 MIOp.setReg(DummyReg);
4307 }
4308 }
4309
4310 if (MI.isBundle()) {
4311 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4312 if (!VRI.Reads && !VRI.Writes) {
4313 for (MachineOperand &MIOp : MI.uses()) {
4314 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4315 MIOp.setIsUndef(true);
4316 MIOp.setReg(DummyReg);
4317 }
4318 }
4319 }
4320
4321 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4322 false, /*isUndef=*/true));
4323 }
4324
4325 LIS->shrinkToUses(&DefLI);
4326 }
4327 }
4328
4329 return MI.isBundle() ? &MI : NewMI;
4330}
4331
4333SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4334 ThreeAddressUpdates &U) const {
4335 MachineBasicBlock &MBB = *MI.getParent();
4336 unsigned Opc = MI.getOpcode();
4337
4338 // Handle MFMA.
4339 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4340 if (NewMFMAOpc != -1) {
4342 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4343 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4344 MIB.add(MI.getOperand(I));
4345 return MIB;
4346 }
4347
4348 if (SIInstrInfo::isWMMA(MI)) {
4349 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4350 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4351 .setMIFlags(MI.getFlags());
4352 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4353 MIB->addOperand(MI.getOperand(I));
4354 return MIB;
4355 }
4356
4357 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4358 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4359 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4360 "present pre-RA");
4361
4362 // Handle MAC/FMAC.
4363 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4364 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4365 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4366 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4367 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4368 bool Src0Literal = false;
4369
4370 switch (Opc) {
4371 default:
4372 return nullptr;
4373 case AMDGPU::V_MAC_F16_e64:
4374 case AMDGPU::V_FMAC_F16_e64:
4375 case AMDGPU::V_FMAC_F16_t16_e64:
4376 case AMDGPU::V_FMAC_F16_fake16_e64:
4377 case AMDGPU::V_MAC_F32_e64:
4378 case AMDGPU::V_MAC_LEGACY_F32_e64:
4379 case AMDGPU::V_FMAC_F32_e64:
4380 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4381 case AMDGPU::V_FMAC_F64_e64:
4382 break;
4383 case AMDGPU::V_MAC_F16_e32:
4384 case AMDGPU::V_FMAC_F16_e32:
4385 case AMDGPU::V_MAC_F32_e32:
4386 case AMDGPU::V_MAC_LEGACY_F32_e32:
4387 case AMDGPU::V_FMAC_F32_e32:
4388 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4389 case AMDGPU::V_FMAC_F64_e32: {
4390 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4391 AMDGPU::OpName::src0);
4392 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4393 if (!Src0->isReg() && !Src0->isImm())
4394 return nullptr;
4395
4396 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4397 Src0Literal = true;
4398
4399 break;
4400 }
4401 }
4402
4403 MachineInstrBuilder MIB;
4404 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4405 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4406 const MachineOperand *Src0Mods =
4407 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4408 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4409 const MachineOperand *Src1Mods =
4410 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4411 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4412 const MachineOperand *Src2Mods =
4413 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4414 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4415 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4416 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4417
4418 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4419 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4420 // If we have an SGPR input, we will violate the constant bus restriction.
4421 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4422 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4423 MachineInstr *DefMI;
4424
4425 int64_t Imm;
4426 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4427 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4428 if (pseudoToMCOpcode(NewOpc) != -1) {
4429 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4430 .add(*Dst)
4431 .add(*Src0)
4432 .add(*Src1)
4433 .addImm(Imm)
4434 .setMIFlags(MI.getFlags());
4435 U.RemoveMIUse = DefMI;
4436 return MIB;
4437 }
4438 }
4439 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4440 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4441 if (pseudoToMCOpcode(NewOpc) != -1) {
4442 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4443 .add(*Dst)
4444 .add(*Src0)
4445 .addImm(Imm)
4446 .add(*Src2)
4447 .setMIFlags(MI.getFlags());
4448 U.RemoveMIUse = DefMI;
4449 return MIB;
4450 }
4451 }
4452 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4453 if (Src0Literal) {
4454 Imm = Src0->getImm();
4455 DefMI = nullptr;
4456 }
4457 if (pseudoToMCOpcode(NewOpc) != -1 &&
4459 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4460 Src1)) {
4461 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4462 .add(*Dst)
4463 .add(*Src1)
4464 .addImm(Imm)
4465 .add(*Src2)
4466 .setMIFlags(MI.getFlags());
4467 U.RemoveMIUse = DefMI;
4468 return MIB;
4469 }
4470 }
4471 }
4472
4473 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4474 // if VOP3 does not allow a literal operand.
4475 if (Src0Literal && !ST.hasVOP3Literal())
4476 return nullptr;
4477
4478 unsigned NewOpc = getNewFMAInst(ST, Opc);
4479
4480 if (pseudoToMCOpcode(NewOpc) == -1)
4481 return nullptr;
4482
4483 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4484 .add(*Dst)
4485 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4486 .add(*Src0)
4487 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4488 .add(*Src1)
4489 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4490 .add(*Src2)
4491 .addImm(Clamp ? Clamp->getImm() : 0)
4492 .addImm(Omod ? Omod->getImm() : 0)
4493 .setMIFlags(MI.getFlags());
4494 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4495 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4496 return MIB;
4497}
4498
4499// It's not generally safe to move VALU instructions across these since it will
4500// start using the register as a base index rather than directly.
4501// XXX - Why isn't hasSideEffects sufficient for these?
4503 switch (MI.getOpcode()) {
4504 case AMDGPU::S_SET_GPR_IDX_ON:
4505 case AMDGPU::S_SET_GPR_IDX_MODE:
4506 case AMDGPU::S_SET_GPR_IDX_OFF:
4507 return true;
4508 default:
4509 return false;
4510 }
4511}
4512
4514 const MachineBasicBlock *MBB,
4515 const MachineFunction &MF) const {
4516 // Skipping the check for SP writes in the base implementation. The reason it
4517 // was added was apparently due to compile time concerns.
4518 //
4519 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4520 // but is probably avoidable.
4521
4522 // Copied from base implementation.
4523 // Terminators and labels can't be scheduled around.
4524 if (MI.isTerminator() || MI.isPosition())
4525 return true;
4526
4527 // INLINEASM_BR can jump to another block
4528 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4529 return true;
4530
4531 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4532 return true;
4533
4534 // Target-independent instructions do not have an implicit-use of EXEC, even
4535 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4536 // boundaries prevents incorrect movements of such instructions.
4537 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4538 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4539 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4540 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4541 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4543}
4544
4546 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4547 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4548 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4549}
4550
4552 // Instructions that access scratch use FLAT encoding or BUF encodings.
4553 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4554 return false;
4555
4556 // SCRATCH instructions always access scratch.
4557 if (isFLATScratch(MI))
4558 return true;
4559
4560 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4561 // via the aperture.
4562 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4563 return false;
4564
4565 // If there are no memory operands then conservatively assume the flat
4566 // operation may access scratch.
4567 if (MI.memoperands_empty())
4568 return true;
4569
4570 // See if any memory operand specifies an address space that involves scratch.
4571 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4572 unsigned AS = Memop->getAddrSpace();
4573 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4574 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4575 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4576 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4577 }
4578 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4579 });
4580}
4581
4583 assert(isFLAT(MI));
4584
4585 // All flat instructions use the VMEM counter except prefetch.
4586 if (!usesVM_CNT(MI))
4587 return false;
4588
4589 // If there are no memory operands then conservatively assume the flat
4590 // operation may access VMEM.
4591 if (MI.memoperands_empty())
4592 return true;
4593
4594 // See if any memory operand specifies an address space that involves VMEM.
4595 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
4596 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4597 // (GDS) address space is not supported by flat operations. Therefore, simply
4598 // return true unless only the LDS address space is found.
4599 for (const MachineMemOperand *Memop : MI.memoperands()) {
4600 unsigned AS = Memop->getAddrSpace();
4602 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4603 return true;
4604 }
4605
4606 return false;
4607}
4608
4610 assert(isFLAT(MI));
4611
4612 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
4613 if (!usesLGKM_CNT(MI))
4614 return false;
4615
4616 // If in tgsplit mode then there can be no use of LDS.
4617 if (ST.isTgSplitEnabled())
4618 return false;
4619
4620 // If there are no memory operands then conservatively assume the flat
4621 // operation may access LDS.
4622 if (MI.memoperands_empty())
4623 return true;
4624
4625 // See if any memory operand specifies an address space that involves LDS.
4626 for (const MachineMemOperand *Memop : MI.memoperands()) {
4627 unsigned AS = Memop->getAddrSpace();
4629 return true;
4630 }
4631
4632 return false;
4633}
4634
4636 // Skip the full operand and register alias search modifiesRegister
4637 // does. There's only a handful of instructions that touch this, it's only an
4638 // implicit def, and doesn't alias any other registers.
4639 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4640}
4641
4643 unsigned Opcode = MI.getOpcode();
4644
4645 if (MI.mayStore() && isSMRD(MI))
4646 return true; // scalar store or atomic
4647
4648 // This will terminate the function when other lanes may need to continue.
4649 if (MI.isReturn())
4650 return true;
4651
4652 // These instructions cause shader I/O that may cause hardware lockups
4653 // when executed with an empty EXEC mask.
4654 //
4655 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4656 // EXEC = 0, but checking for that case here seems not worth it
4657 // given the typical code patterns.
4658 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4659 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4660 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4661 Opcode == AMDGPU::S_SETHALT)
4662 return true;
4663
4664 if (MI.isCall() || MI.isInlineAsm())
4665 return true; // conservative assumption
4666
4667 // Assume that barrier interactions are only intended with active lanes.
4668 if (isBarrier(Opcode))
4669 return true;
4670
4671 // A mode change is a scalar operation that influences vector instructions.
4673 return true;
4674
4675 // These are like SALU instructions in terms of effects, so it's questionable
4676 // whether we should return true for those.
4677 //
4678 // However, executing them with EXEC = 0 causes them to operate on undefined
4679 // data, which we avoid by returning true here.
4680 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4681 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4682 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4683 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4684 return true;
4685
4686 return false;
4687}
4688
4690 const MachineInstr &MI) const {
4691 if (MI.isMetaInstruction())
4692 return false;
4693
4694 // This won't read exec if this is an SGPR->SGPR copy.
4695 if (MI.isCopyLike()) {
4696 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4697 return true;
4698
4699 // Make sure this isn't copying exec as a normal operand
4700 return MI.readsRegister(AMDGPU::EXEC, &RI);
4701 }
4702
4703 // Make a conservative assumption about the callee.
4704 if (MI.isCall())
4705 return true;
4706
4707 // Be conservative with any unhandled generic opcodes.
4708 if (!isTargetSpecificOpcode(MI.getOpcode()))
4709 return true;
4710
4711 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4712}
4713
4714bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4715 switch (Imm.getBitWidth()) {
4716 case 1: // This likely will be a condition code mask.
4717 return true;
4718
4719 case 32:
4720 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4721 ST.hasInv2PiInlineImm());
4722 case 64:
4723 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4724 ST.hasInv2PiInlineImm());
4725 case 16:
4726 return ST.has16BitInsts() &&
4727 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4728 ST.hasInv2PiInlineImm());
4729 default:
4730 llvm_unreachable("invalid bitwidth");
4731 }
4732}
4733
4735 APInt IntImm = Imm.bitcastToAPInt();
4736 int64_t IntImmVal = IntImm.getSExtValue();
4737 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4738 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4739 default:
4740 llvm_unreachable("invalid fltSemantics");
4743 return isInlineConstant(IntImm);
4745 return ST.has16BitInsts() &&
4746 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4748 return ST.has16BitInsts() &&
4749 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4750 }
4751}
4752
/// Return true if \p Imm can be encoded as an inline constant for an operand
/// of the given \p OperandType.
/// NOTE(review): the AMDGPU::OPERAND_* case labels of this switch were
/// dropped by extraction; the bodies below correspond to the 32-bit, 64-bit,
/// 16-bit, packed, and misc operand-type groups — restore labels from the
/// checked-in file before relying on this text.
bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
  // MachineOperand provides no way to tell the true operand size, since it only
  // records a 64-bit value. We need to know the size to determine if a 32-bit
  // floating point immediate bit pattern is legal for an integer immediate. It
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
  switch (OperandType) {
    // (32-bit operand types) — truncate and test the 32-bit encoding.
    int32_t Trunc = static_cast<int32_t>(Imm);
    return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
  }
    // (64-bit operand types)
    return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
    // We would expect inline immediates to not be concerned with an integer/fp
    // distinction. However, in the case of 16-bit integer operations, the
    // "floating point" values appear to not work. It seems read the low 16-bits
    // of 32-bit immediates, which happens to always work for the integer
    // values.
    //
    // See llvm bugzilla 46302.
    //
    // TODO: Theoretically we could use op-sel to use the high bits of the
    // 32-bit FP values.
    return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
    return false;
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      // A few special case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
    }

    return false;
  }
    // (bf16 operand types)
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
    }
    return false;
  }
    return false;
    return isLegalAV64PseudoImm(Imm);
    // Always embedded in the instruction for free.
    return true;
    // Just ignore anything else.
    return true;
  default:
    llvm_unreachable("invalid operand type");
  }
}
4851
4852static bool compareMachineOp(const MachineOperand &Op0,
4853 const MachineOperand &Op1) {
4854 if (Op0.getType() != Op1.getType())
4855 return false;
4856
4857 switch (Op0.getType()) {
4859 return Op0.getReg() == Op1.getReg();
4861 return Op0.getImm() == Op1.getImm();
4862 default:
4863 llvm_unreachable("Didn't expect to be comparing these operand types");
4864 }
4865}
4866
4868 const MCOperandInfo &OpInfo) const {
4869 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4870 return true;
4871
4872 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4873 return false;
4874
4875 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4876 return true;
4877
4878 return ST.hasVOP3Literal();
4879}
4880
4881bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4882 int64_t ImmVal) const {
4883 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4884 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4885 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4886 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4887 AMDGPU::OpName::src2))
4888 return false;
4889 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4890 }
4891
4892 return isLiteralOperandLegal(InstDesc, OpInfo);
4893}
4894
4895bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4896 const MachineOperand &MO) const {
4897 if (MO.isImm())
4898 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4899
4900 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4901 "unexpected imm-like operand kind");
4902 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4903 return isLiteralOperandLegal(InstDesc, OpInfo);
4904}
4905
4907 // 2 32-bit inline constants packed into one.
4908 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4909 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4910}
4911
4912bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4913 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4914 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4915 return false;
4916
4917 int Op32 = AMDGPU::getVOPe32(Opcode);
4918 if (Op32 == -1)
4919 return false;
4920
4921 return pseudoToMCOpcode(Op32) != -1;
4922}
4923
4924bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4925 // The src0_modifier operand is present on all instructions
4926 // that have modifiers.
4927
4928 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4929}
4930
4932 AMDGPU::OpName OpName) const {
4933 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4934 return Mods && Mods->getImm();
4935}
4936
4938 return any_of(ModifierOpNames,
4939 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4940}
4941
4943 const MachineRegisterInfo &MRI) const {
4944 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4945 // Can't shrink instruction with three operands.
4946 if (Src2) {
4947 switch (MI.getOpcode()) {
4948 default: return false;
4949
4950 case AMDGPU::V_ADDC_U32_e64:
4951 case AMDGPU::V_SUBB_U32_e64:
4952 case AMDGPU::V_SUBBREV_U32_e64: {
4953 const MachineOperand *Src1
4954 = getNamedOperand(MI, AMDGPU::OpName::src1);
4955 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4956 return false;
4957 // Additional verification is needed for sdst/src2.
4958 return true;
4959 }
4960 case AMDGPU::V_MAC_F16_e64:
4961 case AMDGPU::V_MAC_F32_e64:
4962 case AMDGPU::V_MAC_LEGACY_F32_e64:
4963 case AMDGPU::V_FMAC_F16_e64:
4964 case AMDGPU::V_FMAC_F16_t16_e64:
4965 case AMDGPU::V_FMAC_F16_fake16_e64:
4966 case AMDGPU::V_FMAC_F32_e64:
4967 case AMDGPU::V_FMAC_F64_e64:
4968 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4969 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4970 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4971 return false;
4972 break;
4973
4974 case AMDGPU::V_CNDMASK_B32_e64:
4975 break;
4976 }
4977 }
4978
4979 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4980 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4981 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4982 return false;
4983
4984 // We don't need to check src0, all input types are legal, so just make sure
4985 // src0 isn't using any modifiers.
4986 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4987 return false;
4988
4989 // Can it be shrunk to a valid 32 bit opcode?
4990 if (!hasVALU32BitEncoding(MI.getOpcode()))
4991 return false;
4992
4993 // Check output modifiers
4994 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4995 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4996 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4997 // TODO: Can we avoid checking bound_ctrl/fi here?
4998 // They are only used by permlane*_swap special case.
4999 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
5000 !hasModifiersSet(MI, AMDGPU::OpName::fi);
5001}
5002
5003// Set VCC operand with all flags from \p Orig, except for setting it as
5004// implicit.
5006 const MachineOperand &Orig) {
5007
5008 for (MachineOperand &Use : MI.implicit_operands()) {
5009 if (Use.isUse() &&
5010 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
5011 Use.setIsUndef(Orig.isUndef());
5012 Use.setIsKill(Orig.isKill());
5013 return;
5014 }
5015 }
5016}
5017
5019 unsigned Op32) const {
5020 MachineBasicBlock *MBB = MI.getParent();
5021
5022 const MCInstrDesc &Op32Desc = get(Op32);
5023 MachineInstrBuilder Inst32 =
5024 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
5025 .setMIFlags(MI.getFlags());
5026
5027 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
5028 // For VOPC instructions, this is replaced by an implicit def of vcc.
5029
5030 // We assume the defs of the shrunk opcode are in the same order, and the
5031 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5032 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5033 Inst32.add(MI.getOperand(I));
5034
5035 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
5036
5037 int Idx = MI.getNumExplicitDefs();
5038 for (const MachineOperand &Use : MI.explicit_uses()) {
5039 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
5041 continue;
5042
5043 if (&Use == Src2) {
5044 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
5045 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5046 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5047 // of vcc was already added during the initial BuildMI, but we
5048 // 1) may need to change vcc to vcc_lo to preserve the original register
5049 // 2) have to preserve the original flags.
5050 copyFlagsToImplicitVCC(*Inst32, *Src2);
5051 continue;
5052 }
5053 }
5054
5055 Inst32.add(Use);
5056 }
5057
5058 // FIXME: Losing implicit operands
5059 fixImplicitOperands(*Inst32);
5060 return Inst32;
5061}
5062
5064 // Null is free
5065 Register Reg = RegOp.getReg();
5066 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5067 return false;
5068
5069 // SGPRs use the constant bus
5070
5071 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5072 // physical register operands should also count, except for exec.
5073 if (RegOp.isImplicit())
5074 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5075
5076 // SGPRs use the constant bus
5077 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5078 AMDGPU::SReg_64RegClass.contains(Reg);
5079}
5080
5082 const MachineRegisterInfo &MRI) const {
5083 Register Reg = RegOp.getReg();
5084 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5085 : physRegUsesConstantBus(RegOp);
5086}
5087
5089 const MachineOperand &MO,
5090 const MCOperandInfo &OpInfo) const {
5091 // Literal constants use the constant bus.
5092 if (!MO.isReg())
5093 return !isInlineConstant(MO, OpInfo);
5094
5095 Register Reg = MO.getReg();
5096 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5098}
5099
5101 for (const MachineOperand &MO : MI.implicit_operands()) {
5102 // We only care about reads.
5103 if (MO.isDef())
5104 continue;
5105
5106 switch (MO.getReg()) {
5107 case AMDGPU::VCC:
5108 case AMDGPU::VCC_LO:
5109 case AMDGPU::VCC_HI:
5110 case AMDGPU::M0:
5111 case AMDGPU::FLAT_SCR:
5112 return MO.getReg();
5113
5114 default:
5115 break;
5116 }
5117 }
5118
5119 return Register();
5120}
5121
5122static bool shouldReadExec(const MachineInstr &MI) {
5123 if (SIInstrInfo::isVALU(MI)) {
5124 switch (MI.getOpcode()) {
5125 case AMDGPU::V_READLANE_B32:
5126 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5127 case AMDGPU::V_WRITELANE_B32:
5128 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5129 return false;
5130 }
5131
5132 return true;
5133 }
5134
5135 if (MI.isPreISelOpcode() ||
5136 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5139 return false;
5140
5141 return true;
5142}
5143
5144static bool isRegOrFI(const MachineOperand &MO) {
5145 return MO.isReg() || MO.isFI();
5146}
5147
5148static bool isSubRegOf(const SIRegisterInfo &TRI,
5149 const MachineOperand &SuperVec,
5150 const MachineOperand &SubReg) {
5151 if (SubReg.getReg().isPhysical())
5152 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5153
5154 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5155 SubReg.getReg() == SuperVec.getReg();
5156}
5157
5158// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5159bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5160 const MachineRegisterInfo &MRI,
5161 StringRef &ErrInfo) const {
5162 Register DstReg = MI.getOperand(0).getReg();
5163 Register SrcReg = MI.getOperand(1).getReg();
5164 // This is a check for copy from vector register to SGPR
5165 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5166 ErrInfo = "illegal copy from vector register to SGPR";
5167 return false;
5168 }
5169 return true;
5170}
5171
5173 StringRef &ErrInfo) const {
5174 uint32_t Opcode = MI.getOpcode();
5175 const MachineFunction *MF = MI.getMF();
5176 const MachineRegisterInfo &MRI = MF->getRegInfo();
5177
5178 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5179 // Find a better property to recognize the point where instruction selection
5180 // is just done.
5181 // We can only enforce this check after SIFixSGPRCopies pass so that the
5182 // illegal copies are legalized and thereafter we don't expect a pass
5183 // inserting similar copies.
5184 if (!MRI.isSSA() && MI.isCopy())
5185 return verifyCopy(MI, MRI, ErrInfo);
5186
5187 if (SIInstrInfo::isGenericOpcode(Opcode))
5188 return true;
5189
5190 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5191 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5192 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5193 int Src3Idx = -1;
5194 if (Src0Idx == -1) {
5195 // VOPD V_DUAL_* instructions use different operand names.
5196 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5197 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5198 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5199 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5200 }
5201
5202 // Make sure the number of operands is correct.
5203 const MCInstrDesc &Desc = get(Opcode);
5204 if (!Desc.isVariadic() &&
5205 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5206 ErrInfo = "Instruction has wrong number of operands.";
5207 return false;
5208 }
5209
5210 if (MI.isInlineAsm()) {
5211 // Verify register classes for inlineasm constraints.
5212 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5213 I != E; ++I) {
5214 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5215 if (!RC)
5216 continue;
5217
5218 const MachineOperand &Op = MI.getOperand(I);
5219 if (!Op.isReg())
5220 continue;
5221
5222 Register Reg = Op.getReg();
5223 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5224 ErrInfo = "inlineasm operand has incorrect register class.";
5225 return false;
5226 }
5227 }
5228
5229 return true;
5230 }
5231
5232 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5233 ErrInfo = "missing memory operand from image instruction.";
5234 return false;
5235 }
5236
5237 // Make sure the register classes are correct.
5238 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5239 const MachineOperand &MO = MI.getOperand(i);
5240 if (MO.isFPImm()) {
5241 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5242 "all fp values to integers.";
5243 return false;
5244 }
5245
5246 const MCOperandInfo &OpInfo = Desc.operands()[i];
5247 int16_t RegClass = getOpRegClassID(OpInfo);
5248
5249 switch (OpInfo.OperandType) {
5251 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5252 ErrInfo = "Illegal immediate value for operand.";
5253 return false;
5254 }
5255 break;
5269 break;
5271 break;
5272 break;
5286 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5287 ErrInfo = "Illegal immediate value for operand.";
5288 return false;
5289 }
5290 break;
5291 }
5294 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5295 ErrInfo = "Expected inline constant for operand.";
5296 return false;
5297 }
5298 break;
5301 break;
5306 // Check if this operand is an immediate.
5307 // FrameIndex operands will be replaced by immediates, so they are
5308 // allowed.
5309 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5310 ErrInfo = "Expected immediate, but got non-immediate";
5311 return false;
5312 }
5313 break;
5317 break;
5318 default:
5319 if (OpInfo.isGenericType())
5320 continue;
5321 break;
5322 }
5323
5324 if (!MO.isReg())
5325 continue;
5326 Register Reg = MO.getReg();
5327 if (!Reg)
5328 continue;
5329
5330 // FIXME: Ideally we would have separate instruction definitions with the
5331 // aligned register constraint.
5332 // FIXME: We do not verify inline asm operands, but custom inline asm
5333 // verification is broken anyway
5334 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5335 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5336 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5337 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5338 if (const TargetRegisterClass *SubRC =
5339 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5340 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5341 if (RC)
5342 RC = SubRC;
5343 }
5344 }
5345
5346 // Check that this is the aligned version of the class.
5347 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5348 ErrInfo = "Subtarget requires even aligned vector registers";
5349 return false;
5350 }
5351 }
5352
5353 if (RegClass != -1) {
5354 if (Reg.isVirtual())
5355 continue;
5356
5357 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5358 if (!RC->contains(Reg)) {
5359 ErrInfo = "Operand has incorrect register class.";
5360 return false;
5361 }
5362 }
5363 }
5364
5365 // Verify SDWA
5366 if (isSDWA(MI)) {
5367 if (!ST.hasSDWA()) {
5368 ErrInfo = "SDWA is not supported on this target";
5369 return false;
5370 }
5371
5372 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5373 AMDGPU::OpName::dst_sel}) {
5374 const MachineOperand *MO = getNamedOperand(MI, Op);
5375 if (!MO)
5376 continue;
5377 int64_t Imm = MO->getImm();
5378 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5379 ErrInfo = "Invalid SDWA selection";
5380 return false;
5381 }
5382 }
5383
5384 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5385
5386 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5387 if (OpIdx == -1)
5388 continue;
5389 const MachineOperand &MO = MI.getOperand(OpIdx);
5390
5391 if (!ST.hasSDWAScalar()) {
5392 // Only VGPRS on VI
5393 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5394 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5395 return false;
5396 }
5397 } else {
5398 // No immediates on GFX9
5399 if (!MO.isReg()) {
5400 ErrInfo =
5401 "Only reg allowed as operands in SDWA instructions on GFX9+";
5402 return false;
5403 }
5404 }
5405 }
5406
5407 if (!ST.hasSDWAOmod()) {
5408 // No omod allowed on VI
5409 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5410 if (OMod != nullptr &&
5411 (!OMod->isImm() || OMod->getImm() != 0)) {
5412 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5413 return false;
5414 }
5415 }
5416
5417 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5418 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5419 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5420 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5421 const MachineOperand *Src0ModsMO =
5422 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5423 unsigned Mods = Src0ModsMO->getImm();
5424 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5425 Mods & SISrcMods::SEXT) {
5426 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5427 return false;
5428 }
5429 }
5430
5431 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5432 if (isVOPC(BasicOpcode)) {
5433 if (!ST.hasSDWASdst() && DstIdx != -1) {
5434 // Only vcc allowed as dst on VI for VOPC
5435 const MachineOperand &Dst = MI.getOperand(DstIdx);
5436 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5437 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5438 return false;
5439 }
5440 } else if (!ST.hasSDWAOutModsVOPC()) {
5441 // No clamp allowed on GFX9 for VOPC
5442 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5443 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5444 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5445 return false;
5446 }
5447
5448 // No omod allowed on GFX9 for VOPC
5449 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5450 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5451 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5452 return false;
5453 }
5454 }
5455 }
5456
5457 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5458 if (DstUnused && DstUnused->isImm() &&
5459 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5460 const MachineOperand &Dst = MI.getOperand(DstIdx);
5461 if (!Dst.isReg() || !Dst.isTied()) {
5462 ErrInfo = "Dst register should have tied register";
5463 return false;
5464 }
5465
5466 const MachineOperand &TiedMO =
5467 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5468 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5469 ErrInfo =
5470 "Dst register should be tied to implicit use of preserved register";
5471 return false;
5472 }
5473 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5474 ErrInfo = "Dst register should use same physical register as preserved";
5475 return false;
5476 }
5477 }
5478 }
5479
5480 // Verify MIMG / VIMAGE / VSAMPLE
5481 if (isImage(Opcode) && !MI.mayStore()) {
5482 // Ensure that the return type used is large enough for all the options
5483 // being used TFE/LWE require an extra result register.
5484 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5485 if (DMask) {
5486 uint64_t DMaskImm = DMask->getImm();
5487 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5488 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5489 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5490 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5491
5492 // Adjust for packed 16 bit values
5493 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5494 RegCount = divideCeil(RegCount, 2);
5495
5496 // Adjust if using LWE or TFE
5497 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5498 RegCount += 1;
5499
5500 const uint32_t DstIdx =
5501 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5502 const MachineOperand &Dst = MI.getOperand(DstIdx);
5503 if (Dst.isReg()) {
5504 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5505 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5506 if (RegCount > DstSize) {
5507 ErrInfo = "Image instruction returns too many registers for dst "
5508 "register class";
5509 return false;
5510 }
5511 }
5512 }
5513 }
5514
5515 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5516 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5517 unsigned ConstantBusCount = 0;
5518 bool UsesLiteral = false;
5519 const MachineOperand *LiteralVal = nullptr;
5520
5521 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5522 if (ImmIdx != -1) {
5523 ++ConstantBusCount;
5524 UsesLiteral = true;
5525 LiteralVal = &MI.getOperand(ImmIdx);
5526 }
5527
5528 SmallVector<Register, 2> SGPRsUsed;
5529 Register SGPRUsed;
5530
5531 // Only look at the true operands. Only a real operand can use the constant
5532 // bus, and we don't want to check pseudo-operands like the source modifier
5533 // flags.
5534 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5535 if (OpIdx == -1)
5536 continue;
5537 const MachineOperand &MO = MI.getOperand(OpIdx);
5538 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5539 if (MO.isReg()) {
5540 SGPRUsed = MO.getReg();
5541 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5542 ++ConstantBusCount;
5543 SGPRsUsed.push_back(SGPRUsed);
5544 }
5545 } else if (!MO.isFI()) { // Treat FI like a register.
5546 if (!UsesLiteral) {
5547 ++ConstantBusCount;
5548 UsesLiteral = true;
5549 LiteralVal = &MO;
5550 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5551 assert(isVOP2(MI) || isVOP3(MI));
5552 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5553 return false;
5554 }
5555 }
5556 }
5557 }
5558
5559 SGPRUsed = findImplicitSGPRRead(MI);
5560 if (SGPRUsed) {
5561 // Implicit uses may safely overlap true operands
5562 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5563 return !RI.regsOverlap(SGPRUsed, SGPR);
5564 })) {
5565 ++ConstantBusCount;
5566 SGPRsUsed.push_back(SGPRUsed);
5567 }
5568 }
5569
5570 // v_writelane_b32 is an exception from constant bus restriction:
5571 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5572 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5573 Opcode != AMDGPU::V_WRITELANE_B32) {
5574 ErrInfo = "VOP* instruction violates constant bus restriction";
5575 return false;
5576 }
5577
5578 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5579 ErrInfo = "VOP3 instruction uses literal";
5580 return false;
5581 }
5582 }
5583
5584 // Special case for writelane - this can break the multiple constant bus rule,
5585 // but still can't use more than one SGPR register
5586 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5587 unsigned SGPRCount = 0;
5588 Register SGPRUsed;
5589
5590 for (int OpIdx : {Src0Idx, Src1Idx}) {
5591 if (OpIdx == -1)
5592 break;
5593
5594 const MachineOperand &MO = MI.getOperand(OpIdx);
5595
5596 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5597 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5598 if (MO.getReg() != SGPRUsed)
5599 ++SGPRCount;
5600 SGPRUsed = MO.getReg();
5601 }
5602 }
5603 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5604 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5605 return false;
5606 }
5607 }
5608 }
5609
5610 // Verify misc. restrictions on specific instructions.
5611 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5612 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5613 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5614 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5615 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5616 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5617 if (!compareMachineOp(Src0, Src1) &&
5618 !compareMachineOp(Src0, Src2)) {
5619 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5620 return false;
5621 }
5622 }
5623 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5624 SISrcMods::ABS) ||
5625 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5626 SISrcMods::ABS) ||
5627 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5628 SISrcMods::ABS)) {
5629 ErrInfo = "ABS not allowed in VOP3B instructions";
5630 return false;
5631 }
5632 }
5633
5634 if (isSOP2(MI) || isSOPC(MI)) {
5635 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5636 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5637
5638 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5639 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5640 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5641 !Src0.isIdenticalTo(Src1)) {
5642 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5643 return false;
5644 }
5645 }
5646
5647 if (isSOPK(MI)) {
5648 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5649 if (Desc.isBranch()) {
5650 if (!Op->isMBB()) {
5651 ErrInfo = "invalid branch target for SOPK instruction";
5652 return false;
5653 }
5654 } else {
5655 uint64_t Imm = Op->getImm();
5656 if (sopkIsZext(Opcode)) {
5657 if (!isUInt<16>(Imm)) {
5658 ErrInfo = "invalid immediate for SOPK instruction";
5659 return false;
5660 }
5661 } else {
5662 if (!isInt<16>(Imm)) {
5663 ErrInfo = "invalid immediate for SOPK instruction";
5664 return false;
5665 }
5666 }
5667 }
5668 }
5669
5670 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5671 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5672 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5673 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5674 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5675 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5676
5677 const unsigned StaticNumOps =
5678 Desc.getNumOperands() + Desc.implicit_uses().size();
5679 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5680
5681 // Require additional implicit operands. This allows a fixup done by the
5682 // post RA scheduler where the main implicit operand is killed and
5683 // implicit-defs are added for sub-registers that remain live after this
5684 // instruction.
5685 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5686 ErrInfo = "missing implicit register operands";
5687 return false;
5688 }
5689
5690 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5691 if (IsDst) {
5692 if (!Dst->isUse()) {
5693 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5694 return false;
5695 }
5696
5697 unsigned UseOpIdx;
5698 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5699 UseOpIdx != StaticNumOps + 1) {
5700 ErrInfo = "movrel implicit operands should be tied";
5701 return false;
5702 }
5703 }
5704
5705 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5706 const MachineOperand &ImpUse
5707 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5708 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5709 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5710 ErrInfo = "src0 should be subreg of implicit vector use";
5711 return false;
5712 }
5713 }
5714
5715 // Make sure we aren't losing exec uses in the td files. This mostly requires
5716 // being careful when using let Uses to try to add other use registers.
5717 if (shouldReadExec(MI)) {
5718 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5719 ErrInfo = "VALU instruction does not implicitly read exec mask";
5720 return false;
5721 }
5722 }
5723
5724 if (isSMRD(MI)) {
5725 if (MI.mayStore() &&
5726 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5727 // The register offset form of scalar stores may only use m0 as the
5728 // soffset register.
5729 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5730 if (Soff && Soff->getReg() != AMDGPU::M0) {
5731 ErrInfo = "scalar stores must use m0 as offset register";
5732 return false;
5733 }
5734 }
5735 }
5736
5737 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5738 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5739 if (Offset->getImm() != 0) {
5740 ErrInfo = "subtarget does not support offsets in flat instructions";
5741 return false;
5742 }
5743 }
5744
5745 if (isDS(MI) && !ST.hasGDS()) {
5746 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5747 if (GDSOp && GDSOp->getImm() != 0) {
5748 ErrInfo = "GDS is not supported on this subtarget";
5749 return false;
5750 }
5751 }
5752
5753 if (isImage(MI)) {
5754 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5755 if (DimOp) {
5756 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5757 AMDGPU::OpName::vaddr0);
5758 AMDGPU::OpName RSrcOpName =
5759 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5760 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5761 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5762 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5763 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5764 const AMDGPU::MIMGDimInfo *Dim =
5766
5767 if (!Dim) {
5768 ErrInfo = "dim is out of range";
5769 return false;
5770 }
5771
5772 bool IsA16 = false;
5773 if (ST.hasR128A16()) {
5774 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5775 IsA16 = R128A16->getImm() != 0;
5776 } else if (ST.hasA16()) {
5777 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5778 IsA16 = A16->getImm() != 0;
5779 }
5780
5781 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5782
5783 unsigned AddrWords =
5784 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5785
5786 unsigned VAddrWords;
5787 if (IsNSA) {
5788 VAddrWords = RsrcIdx - VAddr0Idx;
5789 if (ST.hasPartialNSAEncoding() &&
5790 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5791 unsigned LastVAddrIdx = RsrcIdx - 1;
5792 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5793 }
5794 } else {
5795 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5796 if (AddrWords > 12)
5797 AddrWords = 16;
5798 }
5799
5800 if (VAddrWords != AddrWords) {
5801 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5802 << " but got " << VAddrWords << "\n");
5803 ErrInfo = "bad vaddr size";
5804 return false;
5805 }
5806 }
5807 }
5808
5809 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5810 if (DppCt) {
5811 using namespace AMDGPU::DPP;
5812
5813 unsigned DC = DppCt->getImm();
5814 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5815 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5816 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5817 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5818 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5819 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5820 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5821 ErrInfo = "Invalid dpp_ctrl value";
5822 return false;
5823 }
5824 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5825 !ST.hasDPPWavefrontShifts()) {
5826 ErrInfo = "Invalid dpp_ctrl value: "
5827 "wavefront shifts are not supported on GFX10+";
5828 return false;
5829 }
5830 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5831 !ST.hasDPPBroadcasts()) {
5832 ErrInfo = "Invalid dpp_ctrl value: "
5833 "broadcasts are not supported on GFX10+";
5834 return false;
5835 }
5836 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5837 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5838 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5839 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5840 !ST.hasGFX90AInsts()) {
5841 ErrInfo = "Invalid dpp_ctrl value: "
5842 "row_newbroadcast/row_share is not supported before "
5843 "GFX90A/GFX10";
5844 return false;
5845 }
5846 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5847 ErrInfo = "Invalid dpp_ctrl value: "
5848 "row_share and row_xmask are not supported before GFX10";
5849 return false;
5850 }
5851 }
5852
5853 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5855 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5856 ErrInfo = "Invalid dpp_ctrl value: "
5857 "DP ALU dpp only support row_newbcast";
5858 return false;
5859 }
5860 }
5861
5862 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5863 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5864 AMDGPU::OpName DataName =
5865 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5866 const MachineOperand *Data = getNamedOperand(MI, DataName);
5867 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5868 if (Data && !Data->isReg())
5869 Data = nullptr;
5870
5871 if (ST.hasGFX90AInsts()) {
5872 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5873 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5874 ErrInfo = "Invalid register class: "
5875 "vdata and vdst should be both VGPR or AGPR";
5876 return false;
5877 }
5878 if (Data && Data2 &&
5879 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5880 ErrInfo = "Invalid register class: "
5881 "both data operands should be VGPR or AGPR";
5882 return false;
5883 }
5884 } else {
5885 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5886 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5887 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5888 ErrInfo = "Invalid register class: "
5889 "agpr loads and stores not supported on this GPU";
5890 return false;
5891 }
5892 }
5893 }
5894
5895 if (ST.needsAlignedVGPRs()) {
5896 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5898 if (!Op)
5899 return true;
5900 Register Reg = Op->getReg();
5901 if (Reg.isPhysical())
5902 return !(RI.getHWRegIndex(Reg) & 1);
5903 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5904 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5905 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5906 };
5907
5908 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5909 Opcode == AMDGPU::DS_GWS_BARRIER) {
5910
5911 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5912 ErrInfo = "Subtarget requires even aligned vector registers "
5913 "for DS_GWS instructions";
5914 return false;
5915 }
5916 }
5917
5918 if (isMIMG(MI)) {
5919 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5920 ErrInfo = "Subtarget requires even aligned vector registers "
5921 "for vaddr operand of image instructions";
5922 return false;
5923 }
5924 }
5925 }
5926
5927 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5928 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5929 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5930 ErrInfo = "Invalid register class: "
5931 "v_accvgpr_write with an SGPR is not supported on this GPU";
5932 return false;
5933 }
5934 }
5935
5936 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5937 const MachineOperand &SrcOp = MI.getOperand(1);
5938 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5939 ErrInfo = "pseudo expects only physical SGPRs";
5940 return false;
5941 }
5942 }
5943
5944 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5945 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5946 if (!ST.hasScaleOffset()) {
5947 ErrInfo = "Subtarget does not support offset scaling";
5948 return false;
5949 }
5950 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5951 ErrInfo = "Instruction does not support offset scaling";
5952 return false;
5953 }
5954 }
5955 }
5956
5957 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5958 // information.
5959 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5960 for (unsigned I = 0; I < 3; ++I) {
5962 return false;
5963 }
5964 }
5965
5966 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5967 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5968 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5969 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5970 &AMDGPU::SReg_64RegClass) ||
5971 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5972 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5973 return false;
5974 }
5975 }
5976
5977 return true;
5978}
5979
5981 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
5982 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5983 return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
5984 ? AMDGPU::COPY
5985 : AMDGPU::V_MOV_B32_e32;
5986 }
5987 return getVALUOp(MI.getOpcode());
5988}
5989
5990// It is more readable to list mapped opcodes on the same line.
5991// clang-format off
5992
5993unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
5994 switch (Opc) {
5995 default: return AMDGPU::INSTRUCTION_LIST_END;
5996 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5997 case AMDGPU::COPY: return AMDGPU::COPY;
5998 case AMDGPU::PHI: return AMDGPU::PHI;
5999 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
6000 case AMDGPU::WQM: return AMDGPU::WQM;
6001 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
6002 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
6003 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
6004 case AMDGPU::S_ADD_I32:
6005 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
6006 case AMDGPU::S_ADDC_U32:
6007 return AMDGPU::V_ADDC_U32_e32;
6008 case AMDGPU::S_SUB_I32:
6009 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
6010 // FIXME: These are not consistently handled, and selected when the carry is
6011 // used.
6012 case AMDGPU::S_ADD_U32:
6013 return AMDGPU::V_ADD_CO_U32_e32;
6014 case AMDGPU::S_SUB_U32:
6015 return AMDGPU::V_SUB_CO_U32_e32;
6016 case AMDGPU::S_ADD_U64_PSEUDO:
6017 return AMDGPU::V_ADD_U64_PSEUDO;
6018 case AMDGPU::S_SUB_U64_PSEUDO:
6019 return AMDGPU::V_SUB_U64_PSEUDO;
6020 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
6021 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
6022 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
6023 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
6024 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
6025 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
6026 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
6027 case AMDGPU::S_XNOR_B32:
6028 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
6029 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
6030 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
6031 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
6032 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6033 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6034 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6035 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6036 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6037 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6038 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6039 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6040 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6041 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6042 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6043 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6044 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6045 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6046 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6047 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6048 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6049 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6050 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6051 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6052 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6053 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6054 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6055 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6056 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6057 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6058 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6059 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6060 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6061 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6062 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6063 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6064 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6065 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6066 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6067 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6068 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6069 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6070 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6071 case AMDGPU::S_CVT_F32_F16:
6072 case AMDGPU::S_CVT_HI_F32_F16:
6073 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6074 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6075 case AMDGPU::S_CVT_F16_F32:
6076 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6077 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6078 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6079 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6080 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6081 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6082 case AMDGPU::S_CEIL_F16:
6083 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6084 : AMDGPU::V_CEIL_F16_fake16_e64;
6085 case AMDGPU::S_FLOOR_F16:
6086 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6087 : AMDGPU::V_FLOOR_F16_fake16_e64;
6088 case AMDGPU::S_TRUNC_F16:
6089 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6090 : AMDGPU::V_TRUNC_F16_fake16_e64;
6091 case AMDGPU::S_RNDNE_F16:
6092 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6093 : AMDGPU::V_RNDNE_F16_fake16_e64;
6094 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6095 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6096 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6097 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6098 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6099 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6100 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6101 case AMDGPU::S_ADD_F16:
6102 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6103 : AMDGPU::V_ADD_F16_fake16_e64;
6104 case AMDGPU::S_SUB_F16:
6105 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6106 : AMDGPU::V_SUB_F16_fake16_e64;
6107 case AMDGPU::S_MIN_F16:
6108 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6109 : AMDGPU::V_MIN_F16_fake16_e64;
6110 case AMDGPU::S_MAX_F16:
6111 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6112 : AMDGPU::V_MAX_F16_fake16_e64;
6113 case AMDGPU::S_MINIMUM_F16:
6114 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6115 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6116 case AMDGPU::S_MAXIMUM_F16:
6117 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6118 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6119 case AMDGPU::S_MUL_F16:
6120 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6121 : AMDGPU::V_MUL_F16_fake16_e64;
6122 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6123 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6124 case AMDGPU::S_FMAC_F16:
6125 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6126 : AMDGPU::V_FMAC_F16_fake16_e64;
6127 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6128 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6129 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6130 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6131 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6132 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6133 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6134 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6135 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6136 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6137 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6138 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6139 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6140 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6141 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6142 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6143 case AMDGPU::S_CMP_LT_F16:
6144 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6145 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6146 case AMDGPU::S_CMP_EQ_F16:
6147 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6148 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6149 case AMDGPU::S_CMP_LE_F16:
6150 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6151 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6152 case AMDGPU::S_CMP_GT_F16:
6153 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6154 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6155 case AMDGPU::S_CMP_LG_F16:
6156 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6157 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6158 case AMDGPU::S_CMP_GE_F16:
6159 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6160 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6161 case AMDGPU::S_CMP_O_F16:
6162 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6163 : AMDGPU::V_CMP_O_F16_fake16_e64;
6164 case AMDGPU::S_CMP_U_F16:
6165 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6166 : AMDGPU::V_CMP_U_F16_fake16_e64;
6167 case AMDGPU::S_CMP_NGE_F16:
6168 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6169 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6170 case AMDGPU::S_CMP_NLG_F16:
6171 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6172 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6173 case AMDGPU::S_CMP_NGT_F16:
6174 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6175 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6176 case AMDGPU::S_CMP_NLE_F16:
6177 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6178 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6179 case AMDGPU::S_CMP_NEQ_F16:
6180 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6181 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6182 case AMDGPU::S_CMP_NLT_F16:
6183 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6184 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6185 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6186 case AMDGPU::V_S_EXP_F16_e64:
6187 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6188 : AMDGPU::V_EXP_F16_fake16_e64;
6189 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6190 case AMDGPU::V_S_LOG_F16_e64:
6191 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6192 : AMDGPU::V_LOG_F16_fake16_e64;
6193 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6194 case AMDGPU::V_S_RCP_F16_e64:
6195 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6196 : AMDGPU::V_RCP_F16_fake16_e64;
6197 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6198 case AMDGPU::V_S_RSQ_F16_e64:
6199 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6200 : AMDGPU::V_RSQ_F16_fake16_e64;
6201 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6202 case AMDGPU::V_S_SQRT_F16_e64:
6203 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6204 : AMDGPU::V_SQRT_F16_fake16_e64;
6205 }
6207 "Unexpected scalar opcode without corresponding vector one!");
6208}
6209
6210// clang-format on
6211
// NOTE(review): extraction artifact — the opening lines of
// SIInstrInfo::insertScratchExecCopy (function name plus the leading
// MachineFunction/MachineBasicBlock/iterator parameters) and the
// lane-mask-constants (LMC) declaration are missing from this listing, and
// every line carries a spurious source line number. Verify against the
// upstream file before editing.
// Visible behavior: save the current EXEC mask into Reg and then set all
// EXEC bits to 1. When SCC is live this is done with two moves (since the
// single OR_SAVEEXEC instruction would clobber SCC); otherwise a single
// OR_SAVEEXEC with an all-ones immediate is used and its SCC def is marked
// dead. Newly created instructions are registered in SlotIndexes when one
// is provided.
6215 const DebugLoc &DL, Register Reg,
6216 bool IsSCCLive,
6217 SlotIndexes *Indexes) const {
6218 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6219 const SIInstrInfo *TII = ST.getInstrInfo();
// NOTE(review): the declaration of LMC (wave-size dependent mov/save-exec
// opcodes and the EXEC register) was on the dropped line here.
6221 if (IsSCCLive) {
6222 // Insert two move instructions, one to save the original value of EXEC and
6223 // the other to turn on all bits in EXEC. This is required as we can't use
6224 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6225 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
// NOTE(review): the .addReg(...) source operand of StoreExecMI was on the
// dropped line here.
6227 auto FlipExecMI =
6228 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6229 if (Indexes) {
6230 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6231 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6232 }
6233 } else {
6234 auto SaveExec =
6235 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6236 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6237 if (Indexes)
6238 Indexes->insertMachineInstrInMaps(*SaveExec);
6239 }
6240}
6241
// NOTE(review): extraction artifact — the signature of
// SIInstrInfo::restoreExec and the lane-mask-constants (LMC) declaration
// are missing from this listing, and the lines carry spurious source line
// numbers. Verify against the upstream file before editing.
// Visible behavior: move Reg (marked Kill) back into EXEC, and record the
// new instruction in SlotIndexes when one is provided.
6244 const DebugLoc &DL, Register Reg,
6245 SlotIndexes *Indexes) const {
6247 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6248 .addReg(Reg, RegState::Kill);
6249 if (Indexes)
6250 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6251}
6252
6256 "Not a whole wave func");
6257 MachineBasicBlock &MBB = *MF.begin();
6258 for (MachineInstr &MI : MBB)
6259 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6260 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6261 return &MI;
6262
6263 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6264}
6265
6267 unsigned OpNo) const {
6268 const MCInstrDesc &Desc = get(MI.getOpcode());
6269 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6270 Desc.operands()[OpNo].RegClass == -1) {
6271 Register Reg = MI.getOperand(OpNo).getReg();
6272
6273 if (Reg.isVirtual()) {
6274 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6275 return MRI.getRegClass(Reg);
6276 }
6277 return RI.getPhysRegBaseClass(Reg);
6278 }
6279
6280 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6281 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6282}
6283
6286 MachineBasicBlock *MBB = MI.getParent();
6287 MachineOperand &MO = MI.getOperand(OpIdx);
6288 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6289 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6290 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6291 unsigned Size = RI.getRegSizeInBits(*RC);
6292 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6293 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6294 : AMDGPU::V_MOV_B32_e32;
6295 if (MO.isReg())
6296 Opcode = AMDGPU::COPY;
6297 else if (RI.isSGPRClass(RC))
6298 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6299
6300 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6301 Register Reg = MRI.createVirtualRegister(VRC);
6302 DebugLoc DL = MBB->findDebugLoc(I);
6303 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6304 MO.ChangeToRegister(Reg, false);
6305}
6306
6309 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6310 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6311 if (!SuperReg.getReg().isVirtual())
6312 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6313
6314 MachineBasicBlock *MBB = MI->getParent();
6315 const DebugLoc &DL = MI->getDebugLoc();
6316 Register SubReg = MRI.createVirtualRegister(SubRC);
6317
6318 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6319 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6320 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6321 return SubReg;
6322}
6323
6326 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6327 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6328 if (Op.isImm()) {
6329 if (SubIdx == AMDGPU::sub0)
6330 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6331 if (SubIdx == AMDGPU::sub1)
6332 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6333
6334 llvm_unreachable("Unhandled register index for immediate");
6335 }
6336
6337 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6338 SubIdx, SubRC);
6339 return MachineOperand::CreateReg(SubReg, false);
6340}
6341
6342// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6343void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6344 assert(Inst.getNumExplicitOperands() == 3);
6345 MachineOperand Op1 = Inst.getOperand(1);
6346 Inst.removeOperand(1);
6347 Inst.addOperand(Op1);
6348}
6349
6351 const MCOperandInfo &OpInfo,
6352 const MachineOperand &MO) const {
6353 if (!MO.isReg())
6354 return false;
6355
6356 Register Reg = MO.getReg();
6357
6358 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6359 if (Reg.isPhysical())
6360 return DRC->contains(Reg);
6361
6362 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6363
6364 if (MO.getSubReg()) {
6365 const MachineFunction *MF = MO.getParent()->getMF();
6366 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6367 if (!SuperRC)
6368 return false;
6369 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6370 }
6371
6372 return RI.getCommonSubClass(DRC, RC) != nullptr;
6373}
6374
6376 const MachineOperand &MO) const {
6377 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6378 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6379 unsigned Opc = MI.getOpcode();
6380
6381 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6382 // information.
6383 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6384 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6385 constexpr AMDGPU::OpName OpNames[] = {
6386 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6387
6388 for (auto [I, OpName] : enumerate(OpNames)) {
6389 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6390 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6392 return false;
6393 }
6394 }
6395
6396 if (!isLegalRegOperand(MRI, OpInfo, MO))
6397 return false;
6398
6399 // check Accumulate GPR operand
6400 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6401 if (IsAGPR && !ST.hasMAIInsts())
6402 return false;
6403 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6404 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6405 return false;
6406 // Atomics should have both vdst and vdata either vgpr or agpr.
6407 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6408 const int DataIdx = AMDGPU::getNamedOperandIdx(
6409 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6410 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6411 MI.getOperand(DataIdx).isReg() &&
6412 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6413 return false;
6414 if ((int)OpIdx == DataIdx) {
6415 if (VDstIdx != -1 &&
6416 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6417 return false;
6418 // DS instructions with 2 src operands also must have tied RC.
6419 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6420 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6421 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6422 return false;
6423 }
6424
6425 // Check V_ACCVGPR_WRITE_B32_e64
6426 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6427 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6428 RI.isSGPRReg(MRI, MO.getReg()))
6429 return false;
6430
6431 if (ST.hasFlatScratchHiInB64InstHazard() &&
6432 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6433 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6434 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6435 64)
6436 return false;
6437 }
6438 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6439 return false;
6440 }
6441
6442 return true;
6443}
6444
6446 const MCOperandInfo &OpInfo,
6447 const MachineOperand &MO) const {
6448 if (MO.isReg())
6449 return isLegalRegOperand(MRI, OpInfo, MO);
6450
6451 // Handle non-register types that are treated like immediates.
6452 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6453 return true;
6454}
6455
6457 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6458 const MachineOperand *MO) const {
6459 constexpr unsigned NumOps = 3;
6460 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6461 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6462 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6463 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6464
6465 assert(SrcN < NumOps);
6466
6467 if (!MO) {
6468 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6469 if (SrcIdx == -1)
6470 return true;
6471 MO = &MI.getOperand(SrcIdx);
6472 }
6473
6474 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6475 return true;
6476
6477 int ModsIdx =
6478 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6479 if (ModsIdx == -1)
6480 return true;
6481
6482 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6483 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6484 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6485
6486 return !OpSel && !OpSelHi;
6487}
6488
6490 const MachineOperand *MO) const {
6491 const MachineFunction &MF = *MI.getMF();
6492 const MachineRegisterInfo &MRI = MF.getRegInfo();
6493 const MCInstrDesc &InstDesc = MI.getDesc();
6494 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6495 int64_t RegClass = getOpRegClassID(OpInfo);
6496 const TargetRegisterClass *DefinedRC =
6497 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6498 if (!MO)
6499 MO = &MI.getOperand(OpIdx);
6500
6501 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6502
6503 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6504 const MachineOperand *UsedLiteral = nullptr;
6505
6506 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6507 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6508
6509 // TODO: Be more permissive with frame indexes.
6510 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6511 if (!LiteralLimit--)
6512 return false;
6513
6514 UsedLiteral = MO;
6515 }
6516
6518 if (MO->isReg())
6519 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6520
6521 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6522 if (i == OpIdx)
6523 continue;
6524 const MachineOperand &Op = MI.getOperand(i);
6525 if (Op.isReg()) {
6526 if (Op.isUse()) {
6527 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6528 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6529 if (--ConstantBusLimit <= 0)
6530 return false;
6531 }
6532 }
6533 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6534 !isInlineConstant(Op, InstDesc.operands()[i])) {
6535 // The same literal may be used multiple times.
6536 if (!UsedLiteral)
6537 UsedLiteral = &Op;
6538 else if (UsedLiteral->isIdenticalTo(Op))
6539 continue;
6540
6541 if (!LiteralLimit--)
6542 return false;
6543 if (--ConstantBusLimit <= 0)
6544 return false;
6545 }
6546 }
6547 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6548 // There can be at most one literal operand, but it can be repeated.
6549 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6550 if (i == OpIdx)
6551 continue;
6552 const MachineOperand &Op = MI.getOperand(i);
6553 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6554 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6555 !Op.isIdenticalTo(*MO))
6556 return false;
6557
6558 // Do not fold a non-inlineable and non-register operand into an
6559 // instruction that already has a frame index. The frame index handling
6560 // code could not handle well when a frame index co-exists with another
6561 // non-register operand, unless that operand is an inlineable immediate.
6562 if (Op.isFI())
6563 return false;
6564 }
6565 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6566 isF16PseudoScalarTrans(MI.getOpcode())) {
6567 return false;
6568 }
6569
6570 if (MO->isReg()) {
6571 if (!DefinedRC)
6572 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6573 return isLegalRegOperand(MI, OpIdx, *MO);
6574 }
6575
6576 if (MO->isImm()) {
6577 uint64_t Imm = MO->getImm();
6578 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6579 bool Is64BitOp = Is64BitFPOp ||
6580 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6581 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6582 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6583 if (Is64BitOp &&
6584 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6585 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6586 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6587 return false;
6588
6589 // FIXME: We can use sign extended 64-bit literals, but only for signed
6590 // operands. At the moment we do not know if an operand is signed.
6591 // Such operand will be encoded as its low 32 bits and then either
6592 // correctly sign extended or incorrectly zero extended by HW.
6593 // If 64-bit literals are supported and the literal will be encoded
6594 // as full 64 bit we still can use it.
6595 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6596 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6597 return false;
6598 }
6599 }
6600
6601 // Handle non-register types that are treated like immediates.
6602 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6603
6604 if (!DefinedRC) {
6605 // This operand expects an immediate.
6606 return true;
6607 }
6608
6609 return isImmOperandLegal(MI, OpIdx, *MO);
6610}
6611
6613 bool IsGFX950Only = ST.hasGFX950Insts();
6614 bool IsGFX940Only = ST.hasGFX940Insts();
6615
6616 if (!IsGFX950Only && !IsGFX940Only)
6617 return false;
6618
6619 if (!isVALU(MI))
6620 return false;
6621
6622 // V_COS, V_EXP, V_RCP, etc.
6623 if (isTRANS(MI))
6624 return true;
6625
6626 // DOT2, DOT2C, DOT4, etc.
6627 if (isDOT(MI))
6628 return true;
6629
6630 // MFMA, SMFMA
6631 if (isMFMA(MI))
6632 return true;
6633
6634 unsigned Opcode = MI.getOpcode();
6635 switch (Opcode) {
6636 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6637 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6638 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6639 case AMDGPU::V_MQSAD_U32_U8_e64:
6640 case AMDGPU::V_PK_ADD_F16:
6641 case AMDGPU::V_PK_ADD_F32:
6642 case AMDGPU::V_PK_ADD_I16:
6643 case AMDGPU::V_PK_ADD_U16:
6644 case AMDGPU::V_PK_ASHRREV_I16:
6645 case AMDGPU::V_PK_FMA_F16:
6646 case AMDGPU::V_PK_FMA_F32:
6647 case AMDGPU::V_PK_FMAC_F16_e32:
6648 case AMDGPU::V_PK_FMAC_F16_e64:
6649 case AMDGPU::V_PK_LSHLREV_B16:
6650 case AMDGPU::V_PK_LSHRREV_B16:
6651 case AMDGPU::V_PK_MAD_I16:
6652 case AMDGPU::V_PK_MAD_U16:
6653 case AMDGPU::V_PK_MAX_F16:
6654 case AMDGPU::V_PK_MAX_I16:
6655 case AMDGPU::V_PK_MAX_U16:
6656 case AMDGPU::V_PK_MIN_F16:
6657 case AMDGPU::V_PK_MIN_I16:
6658 case AMDGPU::V_PK_MIN_U16:
6659 case AMDGPU::V_PK_MOV_B32:
6660 case AMDGPU::V_PK_MUL_F16:
6661 case AMDGPU::V_PK_MUL_F32:
6662 case AMDGPU::V_PK_MUL_LO_U16:
6663 case AMDGPU::V_PK_SUB_I16:
6664 case AMDGPU::V_PK_SUB_U16:
6665 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6666 return true;
6667 default:
6668 return false;
6669 }
6670}
6671
// Legalize the source operands of a VOP2-encoded instruction: ensure src1
// satisfies the encoding's register/constant-bus constraints, preferring to
// commute src0/src1 when that alone makes the operands legal, and falling
// back to legalizeOpWithMove() otherwise.
// NOTE(review): the opening signature line (original #6672, presumably
// "void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,") was
// dropped by the doc extraction -- confirm against upstream SIInstrInfo.cpp.
6673 MachineInstr &MI) const {
6674 unsigned Opc = MI.getOpcode();
6675 const MCInstrDesc &InstrDesc = get(Opc);
6676
6677 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6678 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6679
6680 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6681 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6682
6683 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6684 // we need to only have one constant bus use before GFX10.
6685 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6686 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6687 RI.isSGPRReg(MRI, Src0.getReg()))
6688 legalizeOpWithMove(MI, Src0Idx);
6689
6690 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6691 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6692 // src0/src1 with V_READFIRSTLANE.
6693 if (Opc == AMDGPU::V_WRITELANE_B32) {
6694 const DebugLoc &DL = MI.getDebugLoc();
6695 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6696 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6697 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6698 .add(Src0);
6699 Src0.ChangeToRegister(Reg, false);
6700 }
6701 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6702 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6703 const DebugLoc &DL = MI.getDebugLoc();
6704 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6705 .add(Src1);
6706 Src1.ChangeToRegister(Reg, false);
6707 }
6708 return;
6709 }
6710
6711 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6712 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6713 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6714 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6715 legalizeOpWithMove(MI, Src2Idx);
6716 }
6717
6718 // VOP2 src0 instructions support all operand types, so we don't need to check
6719 // their legality. If src1 is already legal, we don't need to do anything.
6720 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6721 return;
6722
6723 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6724 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6725 // select is uniform.
6726 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6727 RI.isVGPR(MRI, Src1.getReg())) {
6728 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6729 const DebugLoc &DL = MI.getDebugLoc();
6730 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6731 .add(Src1);
6732 Src1.ChangeToRegister(Reg, false);
6733 return;
6734 }
6735
6736 // We do not use commuteInstruction here because it is too aggressive and will
6737 // commute if it is possible. We only want to commute here if it improves
6738 // legality. This can be called a fairly large number of times so don't waste
6739 // compile time pointlessly swapping and checking legality again.
6740 if (HasImplicitSGPR || !MI.isCommutable()) {
6741 legalizeOpWithMove(MI, Src1Idx);
6742 return;
6743 }
6744
6745 // If src0 can be used as src1, commuting will make the operands legal.
6746 // Otherwise we have to give up and insert a move.
6747 //
6748 // TODO: Other immediate-like operand kinds could be commuted if there was a
6749 // MachineOperand::ChangeTo* for them.
6750 if ((!Src1.isImm() && !Src1.isReg()) ||
6751 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6752 legalizeOpWithMove(MI, Src1Idx);
6753 return;
6754 }
6755
6756 int CommutedOpc = commuteOpcode(MI);
6757 if (CommutedOpc == -1) {
6758 legalizeOpWithMove(MI, Src1Idx);
6759 return;
6760 }
6761
6762 MI.setDesc(get(CommutedOpc));
6763
// Manually swap the two source operands, preserving subregister and kill
// state; Src0Reg/Src0SubReg/Src0Kill are snapshotted before Src0 is
// overwritten below.
6764 Register Src0Reg = Src0.getReg();
6765 unsigned Src0SubReg = Src0.getSubReg();
6766 bool Src0Kill = Src0.isKill();
6767
6768 if (Src1.isImm())
6769 Src0.ChangeToImmediate(Src1.getImm());
6770 else if (Src1.isReg()) {
6771 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6772 Src0.setSubReg(Src1.getSubReg());
6773 } else
6774 llvm_unreachable("Should only have register or immediate operands");
6775
6776 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6777 Src1.setSubReg(Src0SubReg);
// NOTE(review): original line #6778 is missing from this extraction
// (the numbering jumps 6777 -> 6779) -- check upstream for the dropped
// statement before the closing brace.
6779}
6780
6781 // Legalize VOP3 operands. All operand types are supported for any operand
6782 // but only one literal constant and only starting from GFX10.
// Walks src0/src1/src2 and enforces the constant-bus and literal limits
// reported by the subtarget, moving offending operands to VGPRs via
// legalizeOpWithMove(). Also handles PERMLANE variants (scalar-only
// src1/src2), FMAC's tied src2, and packed-FP32 register classes.
// NOTE(review): the opening signature line (original #6783, presumably
// "void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,") was
// dropped by the doc extraction -- confirm against upstream SIInstrInfo.cpp.
6784 MachineInstr &MI) const {
6785 unsigned Opc = MI.getOpcode();
6786
// Named-operand indices for src0/src1/src2; -1 when the operand is absent.
6787 int VOP3Idx[3] = {
6788 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6789 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6790 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6791 };
6792
6793 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6794 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6795 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6796 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6797 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6798 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6799 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6800 // src1 and src2 must be scalar
6801 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6802 const DebugLoc &DL = MI.getDebugLoc();
6803 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6804 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6805 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6806 .add(Src1);
6807 Src1.ChangeToRegister(Reg, false);
6808 }
6809 if (VOP3Idx[2] != -1) {
6810 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6811 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6812 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6813 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6814 .add(Src2);
6815 Src2.ChangeToRegister(Reg, false);
6816 }
6817 }
6818 }
6819
6820 // Find the one SGPR operand we are allowed to use.
6821 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6822 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6823 SmallDenseSet<unsigned> SGPRsUsed;
6824 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6825 if (SGPRReg) {
6826 SGPRsUsed.insert(SGPRReg);
6827 --ConstantBusLimit;
6828 }
6829
6830 for (int Idx : VOP3Idx) {
6831 if (Idx == -1)
6832 break;
6833 MachineOperand &MO = MI.getOperand(Idx);
6834
6835 if (!MO.isReg()) {
// Non-register operand: inline constants are always free; otherwise a
// literal consumes both a literal slot and a constant-bus slot.
6836 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6837 continue;
6838
6839 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6840 --LiteralLimit;
6841 --ConstantBusLimit;
6842 continue;
6843 }
6844
6845 --LiteralLimit;
6846 --ConstantBusLimit;
6847 legalizeOpWithMove(MI, Idx);
6848 continue;
6849 }
6850
6851 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6852 continue; // VGPRs are legal
6853
6854 // We can use one SGPR in each VOP3 instruction prior to GFX10
6855 // and two starting from GFX10.
6856 if (SGPRsUsed.count(MO.getReg()))
6857 continue;
6858 if (ConstantBusLimit > 0) {
6859 SGPRsUsed.insert(MO.getReg());
6860 --ConstantBusLimit;
6861 continue;
6862 }
6863
6864 // If we make it this far, then the operand is not legal and we must
6865 // legalize it.
6866 legalizeOpWithMove(MI, Idx);
6867 }
6868
6869 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6870 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6871 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6872 legalizeOpWithMove(MI, VOP3Idx[2]);
6873
6874 // Fix the register class of packed FP32 instructions on gfx12+. See
6875 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
// NOTE(review): original line #6876 is missing from this extraction
// (numbering jumps 6875 -> 6877); it presumably held the guarding
// "if (...)" for the loop below -- confirm against upstream.
6877 for (unsigned I = 0; I < 3; ++I) {
6878 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
6879 legalizeOpWithMove(MI, VOP3Idx[I]);
6880 }
6881 }
6882}
6883
// Copy a VGPR (or AGPR) value into a freshly created SGPR of the equivalent
// scalar register class using V_READFIRSTLANE_B32, one 32-bit piece at a
// time, and return the new SGPR virtual register. Multi-dword values are
// reassembled with a REG_SEQUENCE. An optional DstRC narrows the created
// register's class via getCommonSubClass.
// NOTE(review): the opening signature lines (originals #6884-6885,
// presumably "Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg,
// MachineInstr &UseMI, MachineRegisterInfo &MRI, ...") were dropped by the
// doc extraction -- confirm against upstream SIInstrInfo.cpp.
6886 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6887 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6888 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6889 if (DstRC)
6890 SRC = RI.getCommonSubClass(SRC, DstRC);
6891
6892 Register DstReg = MRI.createVirtualRegister(SRC);
6893 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6894
// AGPR sources are first copied to an equivalent VGPR class, since
// V_READFIRSTLANE_B32 reads from VGPRs.
6895 if (RI.hasAGPRs(VRC)) {
6896 VRC = RI.getEquivalentVGPRClass(VRC);
6897 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6898 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6899 get(TargetOpcode::COPY), NewSrcReg)
6900 .addReg(SrcReg);
6901 SrcReg = NewSrcReg;
6902 }
6903
6904 if (SubRegs == 1) {
6905 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6906 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6907 .addReg(SrcReg);
6908 return DstReg;
6909 }
6910
// NOTE(review): original line #6911 is missing from this extraction;
// it presumably declared the SRegs vector used below -- confirm upstream.
6912 for (unsigned i = 0; i < SubRegs; ++i) {
6913 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6914 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6915 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6916 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6917 SRegs.push_back(SGPR);
6918 }
6919
// NOTE(review): original line #6920 is missing from this extraction;
// it presumably declared MIB (a MachineInstrBuilder) -- confirm upstream.
6921 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6922 get(AMDGPU::REG_SEQUENCE), DstReg);
6923 for (unsigned i = 0; i < SubRegs; ++i) {
6924 MIB.addReg(SRegs[i]);
6925 MIB.addImm(RI.getSubRegFromChannel(i));
6926 }
6927 return DstReg;
6928}
6929
// Legalize the sbase/soffset operands of an SMRD instruction: any such
// operand still living in a vector register is moved to an SGPR via
// readlaneVGPRToSGPR().
// NOTE(review): the opening signature line (original #6930, presumably
// "void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,") was
// dropped by the doc extraction -- confirm against upstream SIInstrInfo.cpp.
6931 MachineInstr &MI) const {
6932
6933 // If the pointer is stored in VGPRs, then we need to move it to
6934 // SGPRs using v_readfirstlane. This is safe because we only select
6935 // loads with uniform pointers to SMRD instructions, so we know the
6936 // pointer value is uniform.
6937 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6938 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6939 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6940 SBase->setReg(SGPR);
6941 }
// The scalar offset operand gets the same treatment as the base pointer.
6942 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6943 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6944 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6945 SOff->setReg(SGPR);
6946 }
6947}
6948
// Try to rewrite a FLAT-family instruction whose saddr operand ended up in a
// VGPR into the equivalent vaddr-addressed opcode, moving the pointer from
// the saddr slot to the vaddr slot in place (so callers' iterators stay
// valid). Returns true on success, false if no rewrite is possible (no saddr
// operand, saddr already scalar, no vaddr form, or a nonzero existing vaddr).
// NOTE(review): the opening signature line (original #6949, presumably
// "bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {") was
// dropped by the doc extraction -- confirm against upstream SIInstrInfo.cpp.
6950 unsigned Opc = Inst.getOpcode();
6951 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6952 if (OldSAddrIdx < 0)
6953 return false;
6954
6955 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6956
6957 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6958 if (NewOpc < 0)
// NOTE(review): original line #6959 is missing from this extraction
// (numbering jumps 6958 -> 6960); it presumably assigned a fallback
// opcode to NewOpc -- confirm against upstream.
6960 if (NewOpc < 0)
6961 return false;
6962
6963 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6964 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
// Nothing to do if saddr is already scalar.
6965 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6966 return false;
6967
6968 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6969 if (NewVAddrIdx < 0)
6970 return false;
6971
6972 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6973
6974 // Check vaddr, it shall be zero or absent.
6975 MachineInstr *VAddrDef = nullptr;
6976 if (OldVAddrIdx >= 0) {
6977 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6978 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6979 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6980 !VAddrDef->getOperand(1).isImm() ||
6981 VAddrDef->getOperand(1).getImm() != 0)
6982 return false;
6983 }
6984
6985 const MCInstrDesc &NewDesc = get(NewOpc);
6986 Inst.setDesc(NewDesc);
6987
6988 // Callers expect iterator to be valid after this call, so modify the
6989 // instruction in place.
6990 if (OldVAddrIdx == NewVAddrIdx) {
6991 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6992 // Clear use list from the old vaddr holding a zero register.
6993 MRI.removeRegOperandFromUseList(&NewVAddr);
6994 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6995 Inst.removeOperand(OldSAddrIdx);
6996 // Update the use list with the pointer we have just moved from vaddr to
6997 // saddr position. Otherwise new vaddr will be missing from the use list.
6998 MRI.removeRegOperandFromUseList(&NewVAddr);
6999 MRI.addRegOperandToUseList(&NewVAddr);
7000 } else {
7001 assert(OldSAddrIdx == NewVAddrIdx);
7002
7003 if (OldVAddrIdx >= 0) {
7004 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
7005 AMDGPU::OpName::vdst_in);
7006
7007 // removeOperand doesn't try to fixup tied operand indexes as it goes, so
7008 // it asserts. Untie the operands for now and retie them afterwards.
7009 if (NewVDstIn != -1) {
7010 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
7011 Inst.untieRegOperand(OldVDstIn);
7012 }
7013
7014 Inst.removeOperand(OldVAddrIdx);
7015
7016 if (NewVDstIn != -1) {
7017 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
7018 Inst.tieOperands(NewVDst, NewVDstIn);
7019 }
7020 }
7021 }
7022
// Drop the now-dead zero-immediate move that fed the old vaddr, if any.
7023 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
7024 VAddrDef->eraseFromParent();
7025
7026 return true;
7027}
7028
7029 // FIXME: Remove this when SelectionDAG is obsoleted.
// Legalize the saddr operand of a FLAT-family instruction: if it was
// selected into a vector register (the DAG divergence analysis considered it
// uniform), move it back to an SGPR with readlaneVGPRToSGPR().
// NOTE(review): the opening signature line (original #7030, presumably
// "void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,") was
// dropped by the doc extraction -- confirm against upstream SIInstrInfo.cpp.
7031 MachineInstr &MI) const {
7032 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
7033 return;
7034
7035 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
7036 // thinks they are uniform, so a readfirstlane should be valid.
7037 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
7038 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
7039 return;
7040
// NOTE(review): original line #7041 is missing from this extraction
// (numbering jumps 7040 -> 7042); it presumably held the "if (...)"
// guarding this early return -- confirm against upstream.
7042 return;
7043
// Read the pointer back to an SGPR of the class the instruction declares
// for the saddr slot.
7044 const TargetRegisterClass *DeclaredRC =
7045 getRegClass(MI.getDesc(), SAddr->getOperandNo());
7046
7047 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
7048 SAddr->setReg(ToSGPR);
7049}
7050
// Rewrite operand Op to use a new virtual register of class DstRC, inserting
// a COPY at (InsertMBB, I). Attempts to fold an immediate-defining source
// through the copy, and adds an implicit EXEC use to the copy when it will
// lower to a VALU mov (so it is not illegally hoisted/sunk across EXEC
// changes).
// NOTE(review): the opening signature lines (originals #7051-7052 and
// #7054-7055, presumably declaring
// "void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
// MachineBasicBlock::iterator I, ..., MachineOperand &Op,
// MachineRegisterInfo &MRI, ...)") were dropped by the doc extraction --
// confirm against upstream SIInstrInfo.cpp.
7053 const TargetRegisterClass *DstRC,
7056 const DebugLoc &DL) const {
7057 Register OpReg = Op.getReg();
7058 unsigned OpSubReg = Op.getSubReg();
7059
7060 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7061 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
7062
7063 // Check if operand is already the correct register class.
7064 if (DstRC == OpRC)
7065 return;
7066
7067 Register DstReg = MRI.createVirtualRegister(DstRC);
7068 auto Copy =
7069 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
7070 Op.setReg(DstReg);
7071
7072 MachineInstr *Def = MRI.getVRegDef(OpReg);
7073 if (!Def)
7074 return;
7075
7076 // Try to eliminate the copy if it is copying an immediate value.
7077 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7078 foldImmediate(*Copy, *Def, OpReg, &MRI);
7079
// Walk back through a chain of virtual-register copies to see whether the
// value ultimately comes from an IMPLICIT_DEF; stop at physical registers.
7080 bool ImpDef = Def->isImplicitDef();
7081 while (!ImpDef && Def && Def->isCopy()) {
7082 if (Def->getOperand(1).getReg().isPhysical())
7083 break;
7084 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7085 ImpDef = Def && Def->isImplicitDef();
7086 }
// A copy into a non-SGPR class implicitly depends on EXEC; record that
// unless it already reads EXEC or the source is undefined.
7087 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7088 !ImpDef)
7089 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7090}
7091
7092 // Emit the actual waterfall loop, executing the wrapped instruction for each
7093 // unique value of \p ScalarOps across all lanes. In the best case we execute 1
7094 // iteration, in the worst case we execute 64 (once per lane).
// Loop body: readfirstlane each scalar operand (dword-by-dword for wide
// operands), compare against the VGPR value to build a "lanes that match"
// mask, AND the per-operand masks together, then mask EXEC down to the
// matching lanes for one execution of the instruction; terminators in BodyBB
// subtract the handled lanes from EXEC and branch back while any remain.
// NOTE(review): the opening signature lines (originals #7095-7096) and
// several declaration lines (#7100, #7102, #7105 -- presumably ST, LMC and
// the iterator I) were dropped by the doc extraction -- confirm against
// upstream SIInstrInfo.cpp.
7097 MachineBasicBlock &BodyBB, const DebugLoc &DL,
7098 ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
7099 MachineFunction &MF = *LoopBB.getParent();
7101 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7103 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7106 Register CondReg;
7107 for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
7108 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7109 unsigned NumSubRegs = RegSize / 32;
7110 Register VScalarOp = ScalarOp->getReg();
7111
7112 if (NumSubRegs == 1) {
// 32-bit operand: a single readfirstlane + 32-bit compare suffices.
7113 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7114
7115 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7116 .addReg(VScalarOp);
7117
7118 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7119
7120 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7121 .addReg(CurReg)
7122 .addReg(VScalarOp);
7123
7124 // Combine the comparison results with AND.
7125 if (!CondReg) // First.
7126 CondReg = NewCondReg;
7127 else { // If not the first, we create an AND.
7128 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7129 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7130 .addReg(CondReg)
7131 .addReg(NewCondReg);
7132 CondReg = AndReg;
7133 }
7134
7135 // Update ScalarOp operand to use the SGPR ScalarOp.
7136 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7137 ScalarOp->setReg(CurReg);
7138 else {
7139 // Insert into the same block of use
7140 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7141 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7142 .addReg(CurReg);
7143 ScalarOp->setReg(PhySGPRs[Idx]);
7144 }
7145 ScalarOp->setIsKill();
7146 } else {
// Wide operand: readfirstlane two dwords at a time, compare 64 bits at a
// time, and reassemble the scalar value with a REG_SEQUENCE at the end.
7147 SmallVector<Register, 8> ReadlanePieces;
7148 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7149 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7150 "Unhandled register size");
7151
7152 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7153 Register CurRegLo =
7154 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7155 Register CurRegHi =
7156 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7157
7158 // Read the next variant <- also loop target.
7159 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7160 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7161
7162 // Read the next variant <- also loop target.
7163 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7164 .addReg(VScalarOp, VScalarOpUndef,
7165 TRI->getSubRegFromChannel(Idx + 1));
7166
7167 ReadlanePieces.push_back(CurRegLo);
7168 ReadlanePieces.push_back(CurRegHi);
7169
7170 // Comparison is to be done as 64-bit.
7171 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7172 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7173 .addReg(CurRegLo)
7174 .addImm(AMDGPU::sub0)
7175 .addReg(CurRegHi)
7176 .addImm(AMDGPU::sub1);
7177
7178 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7179 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7180 NewCondReg)
7181 .addReg(CurReg);
7182 if (NumSubRegs <= 2)
7183 Cmp.addReg(VScalarOp);
7184 else
7185 Cmp.addReg(VScalarOp, VScalarOpUndef,
7186 TRI->getSubRegFromChannel(Idx, 2));
7187
7188 // Combine the comparison results with AND.
7189 if (!CondReg) // First.
7190 CondReg = NewCondReg;
7191 else { // If not the first, we create an AND.
7192 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7193 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7194 .addReg(CondReg)
7195 .addReg(NewCondReg);
7196 CondReg = AndReg;
7197 }
7198 } // End for loop.
7199
7200 const auto *SScalarOpRC =
7201 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7202 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7203
7204 // Build scalar ScalarOp.
7205 auto Merge =
7206 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7207 unsigned Channel = 0;
7208 for (Register Piece : ReadlanePieces) {
7209 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7210 }
7211
7212 // Update ScalarOp operand to use the SGPR ScalarOp.
7213 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7214 ScalarOp->setReg(SScalarOp);
7215 else {
7216 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7217 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7218 .addReg(SScalarOp);
7219 ScalarOp->setReg(PhySGPRs[Idx]);
7220 }
7221 ScalarOp->setIsKill();
7222 }
7223 }
7224
7225 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7226 MRI.setSimpleHint(SaveExec, CondReg);
7227
7228 // Update EXEC to matching lanes, saving original to SaveExec.
7229 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7230 .addReg(CondReg, RegState::Kill);
7231
7232 // The original instruction is here; we insert the terminators after it.
7233 I = BodyBB.end();
7234
7235 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7236 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7237 .addReg(LMC.ExecReg)
7238 .addReg(SaveExec);
7239
7240 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7241}
7242
7243 // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7244 // with SGPRs by iterating over all unique values across all lanes.
7245 // Returns the loop basic block that now contains \p MI.
// Structure built: MBB -> LoopBB (readfirstlane/compare) -> BodyBB (the
// instruction(s) + loop terminators) -> RemainderBB (rest of MBB). SCC and
// EXEC are saved before the loop and restored in RemainderBB. The dominator
// tree is patched when MDT is provided.
// NOTE(review): the doc extraction dropped the signature lines (originals
// #7247-7249, presumably naming this helper and its SIInstrInfo/MI/ScalarOps/
// MDT parameters) and several declaration lines (#7257 ST, #7267 LMC,
// #7278 the LivenessQueryResult constant, #7293 AfterMI, #7302-7305 the
// LoopBB/BodyBB/MBBI declarations) -- confirm against upstream.
7250 MachineBasicBlock::iterator Begin = nullptr,
7251 MachineBasicBlock::iterator End = nullptr,
7252 ArrayRef<Register> PhySGPRs = {}) {
7253 assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
7254 "Physical SGPRs must be empty or match the number of scalar operands");
7255 MachineBasicBlock &MBB = *MI.getParent();
7256 MachineFunction &MF = *MBB.getParent();
7258 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7259 MachineRegisterInfo &MRI = MF.getRegInfo();
7260 if (!Begin.isValid())
7261 Begin = &MI;
7262 if (!End.isValid()) {
7263 End = &MI;
7264 ++End;
7265 }
7266 const DebugLoc &DL = MI.getDebugLoc();
7268 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7269
7270 // Save SCC. Waterfall Loop may overwrite SCC.
7271 Register SaveSCCReg;
7272
7273 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7274 // rather than unlimited scan everywhere
7275 bool SCCNotDead =
7276 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7277 std::numeric_limits<unsigned>::max()) !=
7279 if (SCCNotDead) {
// Materialize SCC into a register: S_CSELECT_B32 writes 1 if SCC is set,
// 0 otherwise.
7280 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7281 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7282 .addImm(1)
7283 .addImm(0);
7284 }
7285
7286 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7287
7288 // Save the EXEC mask
7289 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7290
7291 // Killed uses in the instruction we are waterfalling around will be
7292 // incorrect due to the added control-flow.
7294 ++AfterMI;
7295 for (auto I = Begin; I != AfterMI; I++) {
7296 for (auto &MO : I->all_uses())
7297 MRI.clearKillFlags(MO.getReg());
7298 }
7299
7300 // To insert the loop we need to split the block. Move everything after this
7301 // point to a new block, and insert a new empty block between the two.
7304 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7306 ++MBBI;
7307
7308 MF.insert(MBBI, LoopBB);
7309 MF.insert(MBBI, BodyBB);
7310 MF.insert(MBBI, RemainderBB);
7311
7312 LoopBB->addSuccessor(BodyBB);
7313 BodyBB->addSuccessor(LoopBB);
7314 BodyBB->addSuccessor(RemainderBB);
7315
7316 // Move Begin to MI to the BodyBB, and the remainder of the block to
7317 // RemainderBB.
7318 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7319 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7320 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7321
7322 MBB.addSuccessor(LoopBB);
7323
7324 // Update dominators. We know that MBB immediately dominates LoopBB, that
7325 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7326 // RemainderBB. RemainderBB immediately dominates all of the successors
7327 // transferred to it from MBB that MBB used to properly dominate.
7328 if (MDT) {
7329 MDT->addNewBlock(LoopBB, &MBB);
7330 MDT->addNewBlock(BodyBB, LoopBB);
7331 MDT->addNewBlock(RemainderBB, BodyBB);
7332 for (auto &Succ : RemainderBB->successors()) {
7333 if (MDT->properlyDominates(&MBB, Succ)) {
7334 MDT->changeImmediateDominator(Succ, RemainderBB);
7335 }
7336 }
7337 }
7338
7339 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
7340 PhySGPRs);
7341
7342 MachineBasicBlock::iterator First = RemainderBB->begin();
7343 // Restore SCC
7344 if (SCCNotDead) {
// S_CMP_LG_U32 SaveSCCReg, 0 sets SCC back to the saved value.
7345 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7346 .addReg(SaveSCCReg, RegState::Kill)
7347 .addImm(0);
7348 }
7349
7350 // Restore the EXEC mask
7351 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7352 .addReg(SaveExec);
7353 return BodyBB;
7354}
7355
7356 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
// Splits a 128-bit buffer resource descriptor: returns the 64-bit base
// pointer (sub0_sub1 of the original Rsrc) plus a new SGPR_128 descriptor
// whose base is zero and whose upper dwords hold the default data format,
// as {RsrcPtr, NewSRsrc}.
// NOTE(review): the signature line (original #7358, presumably
// "extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI,
// MachineOperand &Rsrc) {") was dropped by the doc extraction -- confirm
// against upstream SIInstrInfo.cpp.
7359 MachineBasicBlock &MBB = *MI.getParent();
7360 MachineFunction &MF = *MBB.getParent();
7361 MachineRegisterInfo &MRI = MF.getRegInfo();
7362
7363 // Extract the ptr from the resource descriptor.
7364 unsigned RsrcPtr =
7365 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7366 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7367
7368 // Create an empty resource descriptor
7369 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7370 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7371 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7372 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7373 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7374
7375 // Zero64 = 0
7376 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7377 .addImm(0);
7378
7379 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7380 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7381 .addImm(Lo_32(RsrcDataFormat));
7382
7383 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7384 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7385 .addImm(Hi_32(RsrcDataFormat));
7386
7387 // NewSRsrc = {Zero64, SRsrcFormat}
7388 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7389 .addReg(Zero64)
7390 .addImm(AMDGPU::sub0_sub1)
7391 .addReg(SRsrcFormatLo)
7392 .addImm(AMDGPU::sub2)
7393 .addReg(SRsrcFormatHi)
7394 .addImm(AMDGPU::sub3);
7395
7396 return std::tuple(RsrcPtr, NewSRsrc);
7397}
7398
7401 MachineDominatorTree *MDT) const {
7402 MachineFunction &MF = *MI.getMF();
7403 MachineRegisterInfo &MRI = MF.getRegInfo();
7404 MachineBasicBlock *CreatedBB = nullptr;
7405
7406 // Legalize VOP2
7407 if (isVOP2(MI) || isVOPC(MI)) {
7409 return CreatedBB;
7410 }
7411
7412 // Legalize VOP3
7413 if (isVOP3(MI)) {
7415 return CreatedBB;
7416 }
7417
7418 // Legalize SMRD
7419 if (isSMRD(MI)) {
7421 return CreatedBB;
7422 }
7423
7424 // Legalize FLAT
7425 if (isFLAT(MI)) {
7427 return CreatedBB;
7428 }
7429
7430 // Legalize PHI
7431 // The register class of the operands must be the same type as the register
7432 // class of the output.
7433 if (MI.getOpcode() == AMDGPU::PHI) {
7434 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7435 assert(!RI.isSGPRClass(VRC));
7436
7437 // Update all the operands so they have the same type.
7438 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7439 MachineOperand &Op = MI.getOperand(I);
7440 if (!Op.isReg() || !Op.getReg().isVirtual())
7441 continue;
7442
7443 // MI is a PHI instruction.
7444 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7446
7447 // Avoid creating no-op copies with the same src and dst reg class. These
7448 // confuse some of the machine passes.
7449 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7450 }
7451 }
7452
7453 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7454 // VGPR dest type and SGPR sources, insert copies so all operands are
7455 // VGPRs. This seems to help operand folding / the register coalescer.
7456 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7457 MachineBasicBlock *MBB = MI.getParent();
7458 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7459 if (RI.hasVGPRs(DstRC)) {
7460 // Update all the operands so they are VGPR register classes. These may
7461 // not be the same register class because REG_SEQUENCE supports mixing
7462 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7463 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7464 MachineOperand &Op = MI.getOperand(I);
7465 if (!Op.isReg() || !Op.getReg().isVirtual())
7466 continue;
7467
7468 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7469 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7470 if (VRC == OpRC)
7471 continue;
7472
7473 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7474 Op.setIsKill();
7475 }
7476 }
7477
7478 return CreatedBB;
7479 }
7480
7481 // Legalize INSERT_SUBREG
7482 // src0 must have the same register class as dst
7483 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7484 Register Dst = MI.getOperand(0).getReg();
7485 Register Src0 = MI.getOperand(1).getReg();
7486 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7487 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7488 if (DstRC != Src0RC) {
7489 MachineBasicBlock *MBB = MI.getParent();
7490 MachineOperand &Op = MI.getOperand(1);
7491 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7492 }
7493 return CreatedBB;
7494 }
7495
7496 // Legalize SI_INIT_M0
7497 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7498 MachineOperand &Src = MI.getOperand(0);
7499 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7500 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7501 return CreatedBB;
7502 }
7503
7504 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7505 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7506 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7507 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7508 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7509 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7510 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7511 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7512 MachineOperand &Src = MI.getOperand(1);
7513 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7514 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7515 return CreatedBB;
7516 }
7517
7518 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7519 //
7520 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7521 // scratch memory access. In both cases, the legalization never involves
7522 // conversion to the addr64 form.
7524 (isMUBUF(MI) || isMTBUF(MI)))) {
7525 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7526 ? AMDGPU::OpName::rsrc
7527 : AMDGPU::OpName::srsrc;
7528 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7529 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7530 CreatedBB = generateWaterFallLoop(*this, MI, {SRsrc}, MDT);
7531
7532 AMDGPU::OpName SampOpName =
7533 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7534 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7535 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7536 CreatedBB = generateWaterFallLoop(*this, MI, {SSamp}, MDT);
7537
7538 return CreatedBB;
7539 }
7540
7541 // Legalize SI_CALL
7542 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7543 MachineOperand *Dest = &MI.getOperand(0);
7544 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7545 createWaterFallForSiCall(&MI, MDT, {Dest});
7546 }
7547 }
7548
7549 // Legalize s_sleep_var.
7550 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7551 const DebugLoc &DL = MI.getDebugLoc();
7552 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7553 int Src0Idx =
7554 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7555 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7556 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7557 .add(Src0);
7558 Src0.ChangeToRegister(Reg, false);
7559 return nullptr;
7560 }
7561
7562 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7563 // operands are scalar.
7564 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7565 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7566 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7567 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7568 for (MachineOperand &Src : MI.explicit_operands()) {
7569 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7570 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7571 }
7572 return CreatedBB;
7573 }
7574
7575 // Legalize MUBUF instructions.
7576 bool isSoffsetLegal = true;
7577 int SoffsetIdx =
7578 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7579 if (SoffsetIdx != -1) {
7580 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7581 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7582 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7583 isSoffsetLegal = false;
7584 }
7585 }
7586
7587 bool isRsrcLegal = true;
7588 int RsrcIdx =
7589 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7590 if (RsrcIdx != -1) {
7591 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7592 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7593 isRsrcLegal = false;
7594 }
7595
7596 // The operands are legal.
7597 if (isRsrcLegal && isSoffsetLegal)
7598 return CreatedBB;
7599
7600 if (!isRsrcLegal) {
7601 // Legalize a VGPR Rsrc
7602 //
7603 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7604 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7605 // a zero-value SRsrc.
7606 //
7607 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7608 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7609 // above.
7610 //
7611 // Otherwise we are on non-ADDR64 hardware, and/or we have
7612 // idxen/offen/bothen and we fall back to a waterfall loop.
7613
7614 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7615 MachineBasicBlock &MBB = *MI.getParent();
7616
7617 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7618 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7619 // This is already an ADDR64 instruction so we need to add the pointer
7620 // extracted from the resource descriptor to the current value of VAddr.
7621 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7622 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7623 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7624
7625 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7626 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7627 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7628
7629 unsigned RsrcPtr, NewSRsrc;
7630 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7631
7632 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7633 const DebugLoc &DL = MI.getDebugLoc();
7634 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7635 .addDef(CondReg0)
7636 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7637 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7638 .addImm(0);
7639
7640 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7641 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7642 .addDef(CondReg1, RegState::Dead)
7643 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7644 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7645 .addReg(CondReg0, RegState::Kill)
7646 .addImm(0);
7647
7648 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7649 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7650 .addReg(NewVAddrLo)
7651 .addImm(AMDGPU::sub0)
7652 .addReg(NewVAddrHi)
7653 .addImm(AMDGPU::sub1);
7654
7655 VAddr->setReg(NewVAddr);
7656 Rsrc->setReg(NewSRsrc);
7657 } else if (!VAddr && ST.hasAddr64()) {
7658 // This instruction is the _OFFSET variant, so we need to convert it to
7659 // ADDR64.
7660 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7661 "FIXME: Need to emit flat atomics here");
7662
7663 unsigned RsrcPtr, NewSRsrc;
7664 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7665
7666 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7667 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7668 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7669 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7670 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7671
7672 // Atomics with return have an additional tied operand and are
7673 // missing some of the special bits.
7674 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7675 MachineInstr *Addr64;
7676
7677 if (!VDataIn) {
7678 // Regular buffer load / store.
7680 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7681 .add(*VData)
7682 .addReg(NewVAddr)
7683 .addReg(NewSRsrc)
7684 .add(*SOffset)
7685 .add(*Offset);
7686
7687 if (const MachineOperand *CPol =
7688 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7689 MIB.addImm(CPol->getImm());
7690 }
7691
7692 if (const MachineOperand *TFE =
7693 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7694 MIB.addImm(TFE->getImm());
7695 }
7696
7697 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7698
7699 MIB.cloneMemRefs(MI);
7700 Addr64 = MIB;
7701 } else {
7702 // Atomics with return.
7703 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7704 .add(*VData)
7705 .add(*VDataIn)
7706 .addReg(NewVAddr)
7707 .addReg(NewSRsrc)
7708 .add(*SOffset)
7709 .add(*Offset)
7710 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7711 .cloneMemRefs(MI);
7712 }
7713
7714 MI.removeFromParent();
7715
7716 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7717 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7718 NewVAddr)
7719 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7720 .addImm(AMDGPU::sub0)
7721 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7722 .addImm(AMDGPU::sub1);
7723 } else {
7724 // Legalize a VGPR Rsrc and soffset together.
7725 if (!isSoffsetLegal) {
7726 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7727 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc, Soffset}, MDT);
7728 return CreatedBB;
7729 }
7730 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc}, MDT);
7731 return CreatedBB;
7732 }
7733 }
7734
7735 // Legalize a VGPR soffset.
7736 if (!isSoffsetLegal) {
7737 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7738 CreatedBB = generateWaterFallLoop(*this, MI, {Soffset}, MDT);
7739 return CreatedBB;
7740 }
7741 return CreatedBB;
7742}
7743
7745 InstrList.insert(MI);
7746 // Add MBUF instructions to deferred list.
7747 int RsrcIdx =
7748 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7749 if (RsrcIdx != -1) {
7750 DeferredList.insert(MI);
7751 }
7752}
7753
7755 return DeferredList.contains(MI);
7756}
7757
7758// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7759// lowering (change sgpr to vgpr).
7760// This is mainly caused by 16-bit SALU and 16-bit VALU instructions using
7761// registers of different sizes. Operand sizes must be legalized during the
7762// vgpr lowering chain. This can be removed once sgpr16 is in place.
7764 MachineRegisterInfo &MRI) const {
7765 if (!ST.useRealTrue16Insts())
7766 return;
7767
7768 unsigned Opcode = MI.getOpcode();
7769 MachineBasicBlock *MBB = MI.getParent();
7770 // Legalize operands and check for size mismatch
7771 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7772 OpIdx >= get(Opcode).getNumOperands() ||
7773 get(Opcode).operands()[OpIdx].RegClass == -1)
7774 return;
7775
7776 MachineOperand &Op = MI.getOperand(OpIdx);
7777 if (!Op.isReg() || !Op.getReg().isVirtual())
7778 return;
7779
7780 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7781 if (!RI.isVGPRClass(CurrRC))
7782 return;
7783
7784 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7785 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7786 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7787 Op.setSubReg(AMDGPU::lo16);
7788 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7789 const DebugLoc &DL = MI.getDebugLoc();
7790 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7791 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7792 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7793 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7794 .addReg(Op.getReg())
7795 .addImm(AMDGPU::lo16)
7796 .addReg(Undef)
7797 .addImm(AMDGPU::hi16);
7798 Op.setReg(NewDstReg);
7799 }
7800}
7802 MachineRegisterInfo &MRI) const {
7803 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7805}
7806
7810 ArrayRef<Register> PhySGPRs) const {
7811 assert(MI->getOpcode() == AMDGPU::SI_CALL_ISEL &&
7812 "This only handle waterfall for SI_CALL_ISEL");
7813 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, together
7814 // with the following copies, into the waterfall loop block; copies from
7815 // and to physical registers also need to be moved into the loop block
7816 // along with the call.
7817 MachineBasicBlock &MBB = *MI->getParent();
7819 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
7820 --Start;
7822 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
7823 ++End;
7824
7825 // Also include following copies of the return value
7826 ++End;
7827 while (End != MBB.end() && End->isCopy() &&
7828 MI->definesRegister(End->getOperand(1).getReg(), &RI))
7829 ++End;
7830
7831 generateWaterFallLoop(*this, *MI, ScalarOps, MDT, Start, End, PhySGPRs);
7832}
7833
7835 MachineDominatorTree *MDT) const {
7837 DenseMap<MachineInstr *, bool> V2SPhyCopiesToErase;
7838 while (!Worklist.empty()) {
7839 MachineInstr &Inst = *Worklist.top();
7840 Worklist.erase_top();
7841 // Skip MachineInstr in the deferred list.
7842 if (Worklist.isDeferred(&Inst))
7843 continue;
7844 moveToVALUImpl(Worklist, MDT, Inst, WaterFalls, V2SPhyCopiesToErase);
7845 }
7846
7847 // Deferred list of instructions will be processed once
7848 // all the MachineInstr in the worklist are done.
7849 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7850 moveToVALUImpl(Worklist, MDT, *Inst, WaterFalls, V2SPhyCopiesToErase);
7851 assert(Worklist.empty() &&
7852 "Deferred MachineInstr are not supposed to re-populate worklist");
7853 }
7854
7855 for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : WaterFalls) {
7856 if (Entry.first->getOpcode() == AMDGPU::SI_CALL_ISEL)
7857 createWaterFallForSiCall(Entry.first, MDT, Entry.second.MOs,
7858 Entry.second.SGPRs);
7859 }
7860
7861 for (std::pair<MachineInstr *, bool> Entry : V2SPhyCopiesToErase)
7862 if (Entry.second)
7863 Entry.first->eraseFromParent();
7864}
7866 MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
7867 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7868 // hope for the best.
7869 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, DstReg);
7870 ArrayRef<int16_t> SubRegIndices = RI.getRegSplitParts(DstRC, 4);
7871 if (SubRegIndices.size() <= 1) {
7872 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7873 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7874 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7875 .add(Inst.getOperand(1));
7876 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7877 DstReg)
7878 .addReg(NewDst);
7879 } else {
7881 for (int16_t Indice : SubRegIndices) {
7882 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7883 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7884 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7885 .addReg(Inst.getOperand(1).getReg(), {}, Indice);
7886
7887 DstRegs.push_back(NewDst);
7888 }
7890 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7891 get(AMDGPU::REG_SEQUENCE), DstReg);
7892 for (unsigned i = 0; i < SubRegIndices.size(); ++i) {
7893 MIB.addReg(DstRegs[i]);
7894 MIB.addImm(RI.getSubRegFromChannel(i));
7895 }
7896 }
7897}
7898
7900 SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst,
7903 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7904 if (DstReg == AMDGPU::M0) {
7905 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7906 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7907 return;
7908 }
7909 Register SrcReg = Inst.getOperand(1).getReg();
7912 // Only search current block since phyreg's def & use cannot cross
7913 // blocks when MF.NoPhi = false.
7914 while (++I != E) {
7915 // For SI_CALL_ISEL users, replace the phys SGPR with the VGPR source
7916 // and record the operand for later waterfall loop generation.
7917 if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
7918 MachineInstr *UseMI = &*I;
7919 for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
7920 if (UseMI->getOperand(i).isReg() &&
7921 UseMI->getOperand(i).getReg() == DstReg) {
7922 MachineOperand *MO = &UseMI->getOperand(i);
7923 MO->setReg(SrcReg);
7924 V2PhysSCopyInfo &V2SCopyInfo = WaterFalls[UseMI];
7925 V2SCopyInfo.MOs.push_back(MO);
7926 V2SCopyInfo.SGPRs.push_back(DstReg);
7927 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7928 }
7929 }
7930 } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
7931 I->getOperand(0).isReg() &&
7932 I->getOperand(0).getReg() == DstReg) {
7933 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7934 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7935 } else if (I->readsRegister(DstReg, &RI)) {
7936 // COPY cannot be erased if another kind of instruction uses it.
7937 V2SPhyCopiesToErase[&Inst] = false;
7938 }
7939 if (I->findRegisterDefOperand(DstReg, &RI))
7940 break;
7941 }
7942}
7943
7945 SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst,
7947 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7948
7950 if (!MBB)
7951 return;
7952 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7953 unsigned Opcode = Inst.getOpcode();
7954 unsigned NewOpcode = getVALUOp(Inst);
7955 const DebugLoc &DL = Inst.getDebugLoc();
7956
7957 // Handle some special cases
7958 switch (Opcode) {
7959 default:
7960 break;
7961 case AMDGPU::S_ADD_I32:
7962 case AMDGPU::S_SUB_I32: {
7963 // FIXME: The u32 versions currently selected use the carry.
7964 bool Changed;
7965 MachineBasicBlock *CreatedBBTmp = nullptr;
7966 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7967 if (Changed)
7968 return;
7969
7970 // Default handling
7971 break;
7972 }
7973
7974 case AMDGPU::S_MUL_U64:
7975 if (ST.hasVectorMulU64()) {
7976 NewOpcode = AMDGPU::V_MUL_U64_e64;
7977 break;
7978 }
7979 // Split s_mul_u64 in 32-bit vector multiplications.
7980 splitScalarSMulU64(Worklist, Inst, MDT);
7981 Inst.eraseFromParent();
7982 return;
7983
7984 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7985 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7986 // This is a special case of s_mul_u64 where all the operands are either
7987 // zero extended or sign extended.
7988 splitScalarSMulPseudo(Worklist, Inst, MDT);
7989 Inst.eraseFromParent();
7990 return;
7991
7992 case AMDGPU::S_AND_B64:
7993 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7994 Inst.eraseFromParent();
7995 return;
7996
7997 case AMDGPU::S_OR_B64:
7998 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7999 Inst.eraseFromParent();
8000 return;
8001
8002 case AMDGPU::S_XOR_B64:
8003 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
8004 Inst.eraseFromParent();
8005 return;
8006
8007 case AMDGPU::S_NAND_B64:
8008 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
8009 Inst.eraseFromParent();
8010 return;
8011
8012 case AMDGPU::S_NOR_B64:
8013 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
8014 Inst.eraseFromParent();
8015 return;
8016
8017 case AMDGPU::S_XNOR_B64:
8018 if (ST.hasDLInsts())
8019 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
8020 else
8021 splitScalar64BitXnor(Worklist, Inst, MDT);
8022 Inst.eraseFromParent();
8023 return;
8024
8025 case AMDGPU::S_ANDN2_B64:
8026 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
8027 Inst.eraseFromParent();
8028 return;
8029
8030 case AMDGPU::S_ORN2_B64:
8031 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
8032 Inst.eraseFromParent();
8033 return;
8034
8035 case AMDGPU::S_BREV_B64:
8036 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
8037 Inst.eraseFromParent();
8038 return;
8039
8040 case AMDGPU::S_NOT_B64:
8041 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
8042 Inst.eraseFromParent();
8043 return;
8044
8045 case AMDGPU::S_BCNT1_I32_B64:
8046 splitScalar64BitBCNT(Worklist, Inst);
8047 Inst.eraseFromParent();
8048 return;
8049
8050 case AMDGPU::S_BFE_I64:
8051 splitScalar64BitBFE(Worklist, Inst);
8052 Inst.eraseFromParent();
8053 return;
8054
8055 case AMDGPU::S_FLBIT_I32_B64:
8056 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
8057 Inst.eraseFromParent();
8058 return;
8059 case AMDGPU::S_FF1_I32_B64:
8060 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
8061 Inst.eraseFromParent();
8062 return;
8063
8064 case AMDGPU::S_LSHL_B32:
8065 if (ST.hasOnlyRevVALUShifts()) {
8066 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
8067 swapOperands(Inst);
8068 }
8069 break;
8070 case AMDGPU::S_ASHR_I32:
8071 if (ST.hasOnlyRevVALUShifts()) {
8072 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
8073 swapOperands(Inst);
8074 }
8075 break;
8076 case AMDGPU::S_LSHR_B32:
8077 if (ST.hasOnlyRevVALUShifts()) {
8078 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
8079 swapOperands(Inst);
8080 }
8081 break;
8082 case AMDGPU::S_LSHL_B64:
8083 if (ST.hasOnlyRevVALUShifts()) {
8084 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
8085 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
8086 : AMDGPU::V_LSHLREV_B64_e64;
8087 swapOperands(Inst);
8088 }
8089 break;
8090 case AMDGPU::S_ASHR_I64:
8091 if (ST.hasOnlyRevVALUShifts()) {
8092 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
8093 swapOperands(Inst);
8094 }
8095 break;
8096 case AMDGPU::S_LSHR_B64:
8097 if (ST.hasOnlyRevVALUShifts()) {
8098 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
8099 swapOperands(Inst);
8100 }
8101 break;
8102
8103 case AMDGPU::S_ABS_I32:
8104 lowerScalarAbs(Worklist, Inst);
8105 Inst.eraseFromParent();
8106 return;
8107
8108 case AMDGPU::S_ABSDIFF_I32:
8109 lowerScalarAbsDiff(Worklist, Inst);
8110 Inst.eraseFromParent();
8111 return;
8112
8113 case AMDGPU::S_CBRANCH_SCC0:
8114 case AMDGPU::S_CBRANCH_SCC1: {
8115 // Clear unused bits of vcc
8116 Register CondReg = Inst.getOperand(1).getReg();
8117 bool IsSCC = CondReg == AMDGPU::SCC;
8119 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
8120 .addReg(LMC.ExecReg)
8121 .addReg(IsSCC ? LMC.VccReg : CondReg);
8122 Inst.removeOperand(1);
8123 } break;
8124
8125 case AMDGPU::S_BFE_U64:
8126 case AMDGPU::S_BFM_B64:
8127 llvm_unreachable("Moving this op to VALU not implemented");
8128
8129 case AMDGPU::S_PACK_LL_B32_B16:
8130 case AMDGPU::S_PACK_LH_B32_B16:
8131 case AMDGPU::S_PACK_HL_B32_B16:
8132 case AMDGPU::S_PACK_HH_B32_B16:
8133 movePackToVALU(Worklist, MRI, Inst);
8134 Inst.eraseFromParent();
8135 return;
8136
8137 case AMDGPU::S_XNOR_B32:
8138 lowerScalarXnor(Worklist, Inst);
8139 Inst.eraseFromParent();
8140 return;
8141
8142 case AMDGPU::S_NAND_B32:
8143 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
8144 Inst.eraseFromParent();
8145 return;
8146
8147 case AMDGPU::S_NOR_B32:
8148 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
8149 Inst.eraseFromParent();
8150 return;
8151
8152 case AMDGPU::S_ANDN2_B32:
8153 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
8154 Inst.eraseFromParent();
8155 return;
8156
8157 case AMDGPU::S_ORN2_B32:
8158 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
8159 Inst.eraseFromParent();
8160 return;
8161
8162 // TODO: remove as soon as everything is ready
8163 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8164 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8165 // can only be selected from the uniform SDNode.
8166 case AMDGPU::S_ADD_CO_PSEUDO:
8167 case AMDGPU::S_SUB_CO_PSEUDO: {
8168 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8169 ? AMDGPU::V_ADDC_U32_e64
8170 : AMDGPU::V_SUBB_U32_e64;
8171 const auto *CarryRC = RI.getWaveMaskRegClass();
8172
8173 Register CarryInReg = Inst.getOperand(4).getReg();
8174 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8175 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8176 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8177 .addReg(CarryInReg);
8178 }
8179
8180 Register CarryOutReg = Inst.getOperand(1).getReg();
8181
8182 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8183 MRI.getRegClass(Inst.getOperand(0).getReg())));
8184 MachineInstr *CarryOp =
8185 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8186 .addReg(CarryOutReg, RegState::Define)
8187 .add(Inst.getOperand(2))
8188 .add(Inst.getOperand(3))
8189 .addReg(CarryInReg)
8190 .addImm(0);
8191 legalizeOperands(*CarryOp);
8192 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8193 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8194 Inst.eraseFromParent();
8195 }
8196 return;
8197 case AMDGPU::S_UADDO_PSEUDO:
8198 case AMDGPU::S_USUBO_PSEUDO: {
8199 MachineOperand &Dest0 = Inst.getOperand(0);
8200 MachineOperand &Dest1 = Inst.getOperand(1);
8201 MachineOperand &Src0 = Inst.getOperand(2);
8202 MachineOperand &Src1 = Inst.getOperand(3);
8203
8204 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8205 ? AMDGPU::V_ADD_CO_U32_e64
8206 : AMDGPU::V_SUB_CO_U32_e64;
8207 const TargetRegisterClass *NewRC =
8208 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8209 Register DestReg = MRI.createVirtualRegister(NewRC);
8210 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8211 .addReg(Dest1.getReg(), RegState::Define)
8212 .add(Src0)
8213 .add(Src1)
8214 .addImm(0); // clamp bit
8215
8216 legalizeOperands(*NewInstr, MDT);
8217 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8218 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8219 Inst.eraseFromParent();
8220 }
8221 return;
8222 case AMDGPU::S_LSHL1_ADD_U32:
8223 case AMDGPU::S_LSHL2_ADD_U32:
8224 case AMDGPU::S_LSHL3_ADD_U32:
8225 case AMDGPU::S_LSHL4_ADD_U32: {
8226 MachineOperand &Dest = Inst.getOperand(0);
8227 MachineOperand &Src0 = Inst.getOperand(1);
8228 MachineOperand &Src1 = Inst.getOperand(2);
8229 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8230 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8231 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8232 : 4);
8233
8234 const TargetRegisterClass *NewRC =
8235 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8236 Register DestReg = MRI.createVirtualRegister(NewRC);
8237 MachineInstr *NewInstr =
8238 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8239 .add(Src0)
8240 .addImm(ShiftAmt)
8241 .add(Src1);
8242
8243 legalizeOperands(*NewInstr, MDT);
8244 MRI.replaceRegWith(Dest.getReg(), DestReg);
8245 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8246 Inst.eraseFromParent();
8247 }
8248 return;
8249 case AMDGPU::S_CSELECT_B32:
8250 case AMDGPU::S_CSELECT_B64:
8251 lowerSelect(Worklist, Inst, MDT);
8252 Inst.eraseFromParent();
8253 return;
8254 case AMDGPU::S_CMP_EQ_I32:
8255 case AMDGPU::S_CMP_LG_I32:
8256 case AMDGPU::S_CMP_GT_I32:
8257 case AMDGPU::S_CMP_GE_I32:
8258 case AMDGPU::S_CMP_LT_I32:
8259 case AMDGPU::S_CMP_LE_I32:
8260 case AMDGPU::S_CMP_EQ_U32:
8261 case AMDGPU::S_CMP_LG_U32:
8262 case AMDGPU::S_CMP_GT_U32:
8263 case AMDGPU::S_CMP_GE_U32:
8264 case AMDGPU::S_CMP_LT_U32:
8265 case AMDGPU::S_CMP_LE_U32:
8266 case AMDGPU::S_CMP_EQ_U64:
8267 case AMDGPU::S_CMP_LG_U64:
8268 case AMDGPU::S_CMP_LT_F32:
8269 case AMDGPU::S_CMP_EQ_F32:
8270 case AMDGPU::S_CMP_LE_F32:
8271 case AMDGPU::S_CMP_GT_F32:
8272 case AMDGPU::S_CMP_LG_F32:
8273 case AMDGPU::S_CMP_GE_F32:
8274 case AMDGPU::S_CMP_O_F32:
8275 case AMDGPU::S_CMP_U_F32:
8276 case AMDGPU::S_CMP_NGE_F32:
8277 case AMDGPU::S_CMP_NLG_F32:
8278 case AMDGPU::S_CMP_NGT_F32:
8279 case AMDGPU::S_CMP_NLE_F32:
8280 case AMDGPU::S_CMP_NEQ_F32:
8281 case AMDGPU::S_CMP_NLT_F32: {
8282 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8283 auto NewInstr =
8284 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8285 .setMIFlags(Inst.getFlags());
8286 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8287 0) {
8288 NewInstr
8289 .addImm(0) // src0_modifiers
8290 .add(Inst.getOperand(0)) // src0
8291 .addImm(0) // src1_modifiers
8292 .add(Inst.getOperand(1)) // src1
8293 .addImm(0); // clamp
8294 } else {
8295 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8296 }
8297 legalizeOperands(*NewInstr, MDT);
8298 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8299 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8300 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8301 Inst.eraseFromParent();
8302 return;
8303 }
8304 case AMDGPU::S_CMP_LT_F16:
8305 case AMDGPU::S_CMP_EQ_F16:
8306 case AMDGPU::S_CMP_LE_F16:
8307 case AMDGPU::S_CMP_GT_F16:
8308 case AMDGPU::S_CMP_LG_F16:
8309 case AMDGPU::S_CMP_GE_F16:
8310 case AMDGPU::S_CMP_O_F16:
8311 case AMDGPU::S_CMP_U_F16:
8312 case AMDGPU::S_CMP_NGE_F16:
8313 case AMDGPU::S_CMP_NLG_F16:
8314 case AMDGPU::S_CMP_NGT_F16:
8315 case AMDGPU::S_CMP_NLE_F16:
8316 case AMDGPU::S_CMP_NEQ_F16:
8317 case AMDGPU::S_CMP_NLT_F16: {
8318 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8319 auto NewInstr =
8320 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8321 .setMIFlags(Inst.getFlags());
8322 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8323 NewInstr
8324 .addImm(0) // src0_modifiers
8325 .add(Inst.getOperand(0)) // src0
8326 .addImm(0) // src1_modifiers
8327 .add(Inst.getOperand(1)) // src1
8328 .addImm(0); // clamp
8329 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8330 NewInstr.addImm(0); // op_sel0
8331 } else {
8332 NewInstr
8333 .add(Inst.getOperand(0))
8334 .add(Inst.getOperand(1));
8335 }
8336 legalizeOperandsVALUt16(*NewInstr, MRI);
8337 legalizeOperands(*NewInstr, MDT);
8338 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8339 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8340 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8341 Inst.eraseFromParent();
8342 return;
8343 }
8344 case AMDGPU::S_CVT_HI_F32_F16: {
8345 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8346 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8347 if (ST.useRealTrue16Insts()) {
8348 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8349 .add(Inst.getOperand(1));
8350 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8351 .addImm(0) // src0_modifiers
8352 .addReg(TmpReg, {}, AMDGPU::hi16)
8353 .addImm(0) // clamp
8354 .addImm(0) // omod
8355 .addImm(0); // op_sel0
8356 } else {
8357 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8358 .addImm(16)
8359 .add(Inst.getOperand(1));
8360 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8361 .addImm(0) // src0_modifiers
8362 .addReg(TmpReg)
8363 .addImm(0) // clamp
8364 .addImm(0); // omod
8365 }
8366
8367 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8368 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8369 Inst.eraseFromParent();
8370 return;
8371 }
8372 case AMDGPU::S_MINIMUM_F32:
8373 case AMDGPU::S_MAXIMUM_F32: {
8374 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8375 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8376 .addImm(0) // src0_modifiers
8377 .add(Inst.getOperand(1))
8378 .addImm(0) // src1_modifiers
8379 .add(Inst.getOperand(2))
8380 .addImm(0) // clamp
8381 .addImm(0); // omod
8382 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8383
8384 legalizeOperands(*NewInstr, MDT);
8385 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8386 Inst.eraseFromParent();
8387 return;
8388 }
8389 case AMDGPU::S_MINIMUM_F16:
8390 case AMDGPU::S_MAXIMUM_F16: {
8391 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8392 ? &AMDGPU::VGPR_16RegClass
8393 : &AMDGPU::VGPR_32RegClass);
8394 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8395 .addImm(0) // src0_modifiers
8396 .add(Inst.getOperand(1))
8397 .addImm(0) // src1_modifiers
8398 .add(Inst.getOperand(2))
8399 .addImm(0) // clamp
8400 .addImm(0) // omod
8401 .addImm(0); // opsel0
8402 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8403 legalizeOperandsVALUt16(*NewInstr, MRI);
8404 legalizeOperands(*NewInstr, MDT);
8405 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8406 Inst.eraseFromParent();
8407 return;
8408 }
8409 case AMDGPU::V_S_EXP_F16_e64:
8410 case AMDGPU::V_S_LOG_F16_e64:
8411 case AMDGPU::V_S_RCP_F16_e64:
8412 case AMDGPU::V_S_RSQ_F16_e64:
8413 case AMDGPU::V_S_SQRT_F16_e64: {
8414 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8415 ? &AMDGPU::VGPR_16RegClass
8416 : &AMDGPU::VGPR_32RegClass);
8417 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8418 .add(Inst.getOperand(1)) // src0_modifiers
8419 .add(Inst.getOperand(2))
8420 .add(Inst.getOperand(3)) // clamp
8421 .add(Inst.getOperand(4)) // omod
8422 .setMIFlags(Inst.getFlags());
8423 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8424 NewInstr.addImm(0); // opsel0
8425 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8426 legalizeOperandsVALUt16(*NewInstr, MRI);
8427 legalizeOperands(*NewInstr, MDT);
8428 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8429 Inst.eraseFromParent();
8430 return;
8431 }
8432 }
8433
8434 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8435 // We cannot move this instruction to the VALU, so we should try to
8436 // legalize its operands instead.
8437 legalizeOperands(Inst, MDT);
8438 return;
8439 }
8440 // Handle converting generic instructions like COPY-to-SGPR into
8441 // COPY-to-VGPR.
8442 if (NewOpcode == Opcode) {
8443 Register DstReg = Inst.getOperand(0).getReg();
8444 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8445
8446 if (Inst.isCopy() && DstReg.isPhysical() &&
8447 Inst.getOperand(1).getReg().isVirtual()) {
8448 handleCopyToPhysHelper(Worklist, DstReg, Inst, MRI, WaterFalls,
8449 V2SPhyCopiesToErase);
8450 return;
8451 }
8452
8453 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8454 Register NewDstReg = Inst.getOperand(1).getReg();
8455 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8456 if (const TargetRegisterClass *CommonRC =
8457 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8458 // Instead of creating a copy where src and dst are the same register
8459 // class, we just replace all uses of dst with src. These kinds of
8460 // copies interfere with the heuristics MachineSink uses to decide
8461 // whether or not to split a critical edge. Since the pass assumes
8462 // that copies will end up as machine instructions and not be
8463 // eliminated.
8464 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8465 MRI.replaceRegWith(DstReg, NewDstReg);
8466 MRI.clearKillFlags(NewDstReg);
8467 Inst.getOperand(0).setReg(DstReg);
8468
8469 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8470 llvm_unreachable("failed to constrain register");
8471
8472 Inst.eraseFromParent();
8473
8474 for (MachineOperand &UseMO :
8475 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8476 MachineInstr &UseMI = *UseMO.getParent();
8477
8478 // Legalize t16 operands since replaceReg is called after
8479 // addUsersToVALU.
8481
8482 unsigned OpIdx = UseMI.getOperandNo(&UseMO);
8483 if (const TargetRegisterClass *OpRC =
8484 getRegClass(UseMI.getDesc(), OpIdx))
8485 MRI.constrainRegClass(NewDstReg, OpRC);
8486 }
8487
8488 return;
8489 }
8490 }
8491
8492 // If this is a v2s copy between 16bit and 32bit reg,
8493 // replace vgpr copy to reg_sequence/extract_subreg
8494 // This can be remove after we have sgpr16 in place
8495 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8496 Inst.getOperand(1).getReg().isVirtual() &&
8497 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8498 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8499 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8500 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8501 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8502 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8503 get(AMDGPU::IMPLICIT_DEF), Undef);
8504 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8505 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8506 .addReg(Inst.getOperand(1).getReg())
8507 .addImm(AMDGPU::lo16)
8508 .addReg(Undef)
8509 .addImm(AMDGPU::hi16);
8510 Inst.eraseFromParent();
8511 MRI.replaceRegWith(DstReg, NewDstReg);
8512 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8513 return;
8514 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8515 AMDGPU::lo16)) {
8516 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8517 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8518 MRI.replaceRegWith(DstReg, NewDstReg);
8519 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8520 return;
8521 }
8522 }
8523
8524 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8525 MRI.replaceRegWith(DstReg, NewDstReg);
8526 legalizeOperands(Inst, MDT);
8527 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8528 return;
8529 }
8530
8531 // Use the new VALU Opcode.
8532 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8533 .setMIFlags(Inst.getFlags());
8534 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8535 // Intersperse VOP3 modifiers among the SALU operands.
8536 NewInstr->addOperand(Inst.getOperand(0));
8537 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8538 AMDGPU::OpName::src0_modifiers) >= 0)
8539 NewInstr.addImm(0);
8540 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8541 const MachineOperand &Src = Inst.getOperand(1);
8542 NewInstr->addOperand(Src);
8543 }
8544
8545 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8546 // We are converting these to a BFE, so we need to add the missing
8547 // operands for the size and offset.
8548 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8549 NewInstr.addImm(0);
8550 NewInstr.addImm(Size);
8551 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8552 // The VALU version adds the second operand to the result, so insert an
8553 // extra 0 operand.
8554 NewInstr.addImm(0);
8555 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8556 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8557 // If we need to move this to VGPRs, we need to unpack the second
8558 // operand back into the 2 separate ones for bit offset and width.
8559 assert(OffsetWidthOp.isImm() &&
8560 "Scalar BFE is only implemented for constant width and offset");
8561 uint32_t Imm = OffsetWidthOp.getImm();
8562
8563 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8564 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8565 NewInstr.addImm(Offset);
8566 NewInstr.addImm(BitWidth);
8567 } else {
8568 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8569 AMDGPU::OpName::src1_modifiers) >= 0)
8570 NewInstr.addImm(0);
8571 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8572 NewInstr->addOperand(Inst.getOperand(2));
8573 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8574 AMDGPU::OpName::src2_modifiers) >= 0)
8575 NewInstr.addImm(0);
8576 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8577 NewInstr->addOperand(Inst.getOperand(3));
8578 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8579 NewInstr.addImm(0);
8580 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8581 NewInstr.addImm(0);
8582 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8583 NewInstr.addImm(0);
8584 }
8585 } else {
8586 // Just copy the SALU operands.
8587 for (const MachineOperand &Op : Inst.explicit_operands())
8588 NewInstr->addOperand(Op);
8589 }
8590
8591 // Remove any references to SCC. Vector instructions can't read from it, and
8592 // We're just about to add the implicit use / defs of VCC, and we don't want
8593 // both.
8594 for (MachineOperand &Op : Inst.implicit_operands()) {
8595 if (Op.getReg() == AMDGPU::SCC) {
8596 // Only propagate through live-def of SCC.
8597 if (Op.isDef() && !Op.isDead())
8598 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8599 if (Op.isUse())
8600 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8601 }
8602 }
8603 Inst.eraseFromParent();
8604 Register NewDstReg;
8605 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8606 Register DstReg = NewInstr->getOperand(0).getReg();
8607 assert(DstReg.isVirtual());
8608 // Update the destination register class.
8609 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8610 assert(NewDstRC);
8611 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8612 MRI.replaceRegWith(DstReg, NewDstReg);
8613 }
8614 fixImplicitOperands(*NewInstr);
8615
8616 legalizeOperandsVALUt16(*NewInstr, MRI);
8617
8618 // Legalize the operands
8619 legalizeOperands(*NewInstr, MDT);
8620 if (NewDstReg)
8621 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8622}
8623
8624// Add/sub require special handling to deal with carry outs.
8625std::pair<bool, MachineBasicBlock *>
8626SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8627 MachineDominatorTree *MDT) const {
8628 if (ST.hasAddNoCarryInsts()) {
8629 // Assume there is no user of scc since we don't select this in that case.
8630 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8631 // is used.
8632
8633 MachineBasicBlock &MBB = *Inst.getParent();
8634 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8635
8636 Register OldDstReg = Inst.getOperand(0).getReg();
8637 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8638
8639 unsigned Opc = Inst.getOpcode();
8640 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8641
8642 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8643 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8644
8645 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8646 Inst.removeOperand(3);
8647
8648 Inst.setDesc(get(NewOpc));
8649 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8650 Inst.addImplicitDefUseOperands(*MBB.getParent());
8651 MRI.replaceRegWith(OldDstReg, ResultReg);
8652 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8653
8654 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8655 return std::pair(true, NewBB);
8656 }
8657
8658 return std::pair(false, nullptr);
8659}
8660
8661void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8662 MachineDominatorTree *MDT) const {
8663
8664 MachineBasicBlock &MBB = *Inst.getParent();
8665 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8666 MachineBasicBlock::iterator MII = Inst;
8667 const DebugLoc &DL = Inst.getDebugLoc();
8668
8669 MachineOperand &Dest = Inst.getOperand(0);
8670 MachineOperand &Src0 = Inst.getOperand(1);
8671 MachineOperand &Src1 = Inst.getOperand(2);
8672 MachineOperand &Cond = Inst.getOperand(3);
8673
8674 Register CondReg = Cond.getReg();
8675 bool IsSCC = (CondReg == AMDGPU::SCC);
8676
8677 // If this is a trivial select where the condition is effectively not SCC
8678 // (CondReg is a source of copy to SCC), then the select is semantically
8679 // equivalent to copying CondReg. Hence, there is no need to create
8680 // V_CNDMASK, we can just use that and bail out.
8681 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8682 (Src1.getImm() == 0)) {
8683 MRI.replaceRegWith(Dest.getReg(), CondReg);
8684 return;
8685 }
8686
8687 Register NewCondReg = CondReg;
8688 if (IsSCC) {
8689 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8690 NewCondReg = MRI.createVirtualRegister(TC);
8691
8692 // Now look for the closest SCC def if it is a copy
8693 // replacing the CondReg with the COPY source register
8694 bool CopyFound = false;
8695 for (MachineInstr &CandI :
8697 Inst.getParent()->rend())) {
8698 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8699 -1) {
8700 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8701 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8702 .addReg(CandI.getOperand(1).getReg());
8703 CopyFound = true;
8704 }
8705 break;
8706 }
8707 }
8708 if (!CopyFound) {
8709 // SCC def is not a copy
8710 // Insert a trivial select instead of creating a copy, because a copy from
8711 // SCC would semantically mean just copying a single bit, but we may need
8712 // the result to be a vector condition mask that needs preserving.
8713 unsigned Opcode =
8714 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8715 auto NewSelect =
8716 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8717 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8718 }
8719 }
8720
8721 Register NewDestReg = MRI.createVirtualRegister(
8722 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8723 MachineInstr *NewInst;
8724 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8725 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8726 .addImm(0)
8727 .add(Src1) // False
8728 .addImm(0)
8729 .add(Src0) // True
8730 .addReg(NewCondReg);
8731 } else {
8732 NewInst =
8733 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8734 .add(Src1) // False
8735 .add(Src0) // True
8736 .addReg(NewCondReg);
8737 }
8738 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8739 legalizeOperands(*NewInst, MDT);
8740 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8741}
8742
8743void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8744 MachineInstr &Inst) const {
8745 MachineBasicBlock &MBB = *Inst.getParent();
8746 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8747 MachineBasicBlock::iterator MII = Inst;
8748 const DebugLoc &DL = Inst.getDebugLoc();
8749
8750 MachineOperand &Dest = Inst.getOperand(0);
8751 MachineOperand &Src = Inst.getOperand(1);
8752 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8753 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8754
8755 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8756 : AMDGPU::V_SUB_CO_U32_e32;
8757
8758 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8759 .addImm(0)
8760 .addReg(Src.getReg());
8761
8762 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8763 .addReg(Src.getReg())
8764 .addReg(TmpReg);
8765
8766 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8767 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8768}
8769
8770void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8771 MachineInstr &Inst) const {
8772 MachineBasicBlock &MBB = *Inst.getParent();
8773 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8774 MachineBasicBlock::iterator MII = Inst;
8775 const DebugLoc &DL = Inst.getDebugLoc();
8776
8777 MachineOperand &Dest = Inst.getOperand(0);
8778 MachineOperand &Src1 = Inst.getOperand(1);
8779 MachineOperand &Src2 = Inst.getOperand(2);
8780 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8781 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8782 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8783
8784 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8785 : AMDGPU::V_SUB_CO_U32_e32;
8786
8787 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8788 .addReg(Src1.getReg())
8789 .addReg(Src2.getReg());
8790
8791 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8792
8793 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8794 .addReg(SubResultReg)
8795 .addReg(TmpReg);
8796
8797 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8798 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8799}
8800
// Lower a scalar XNOR. On subtargets with DL instructions there is a native
// V_XNOR_B32; otherwise expand it to a scalar NOT + XOR pair and queue those
// on the worklist so they are re-lowered to the VALU only if necessary.
void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
                                  MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  if (ST.hasDLInsts()) {
    // Direct lowering: force both sources into VGPRs and emit V_XNOR_B32.
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
      .add(Src0)
      .add(Src1);

    MRI.replaceRegWith(Dest.getReg(), NewDest);
    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  } else {
    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
    // invert either source and then perform the XOR. If either source is a
    // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
    bool Src0IsSGPR = Src0.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
    bool Src1IsSGPR = Src1.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
    MachineInstr *Xor;
    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Build a pair of scalar instructions and add them to the work list.
    // The next iteration over the work list will lower these to the vector
    // unit as necessary.
    if (Src0IsSGPR) {
      // NOT the SGPR source, XOR with the other operand.
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .addReg(Temp)
                .add(Src1);
    } else if (Src1IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .add(Src0)
                .addReg(Temp);
    } else {
      // Neither source is known to be an SGPR: XOR first, then NOT the
      // result. The NOT must also be queued since its operand is the XOR.
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
                .add(Src0)
                .add(Src1);
      MachineInstr *Not =
          BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
      Worklist.insert(Not);
    }

    MRI.replaceRegWith(Dest.getReg(), NewDest);

    Worklist.insert(Xor);

    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  }
}
8865
8866void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8867 MachineInstr &Inst,
8868 unsigned Opcode) const {
8869 MachineBasicBlock &MBB = *Inst.getParent();
8870 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8871 MachineBasicBlock::iterator MII = Inst;
8872 const DebugLoc &DL = Inst.getDebugLoc();
8873
8874 MachineOperand &Dest = Inst.getOperand(0);
8875 MachineOperand &Src0 = Inst.getOperand(1);
8876 MachineOperand &Src1 = Inst.getOperand(2);
8877
8878 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8879 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8880
8881 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8882 .add(Src0)
8883 .add(Src1);
8884
8885 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8886 .addReg(Interm);
8887
8888 Worklist.insert(&Op);
8889 Worklist.insert(&Not);
8890
8891 MRI.replaceRegWith(Dest.getReg(), NewDest);
8892 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8893}
8894
8895void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8896 MachineInstr &Inst,
8897 unsigned Opcode) const {
8898 MachineBasicBlock &MBB = *Inst.getParent();
8899 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8900 MachineBasicBlock::iterator MII = Inst;
8901 const DebugLoc &DL = Inst.getDebugLoc();
8902
8903 MachineOperand &Dest = Inst.getOperand(0);
8904 MachineOperand &Src0 = Inst.getOperand(1);
8905 MachineOperand &Src1 = Inst.getOperand(2);
8906
8907 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8908 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8909
8910 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8911 .add(Src1);
8912
8913 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8914 .add(Src0)
8915 .addReg(Interm);
8916
8917 Worklist.insert(&Not);
8918 Worklist.insert(&Op);
8919
8920 MRI.replaceRegWith(Dest.getReg(), NewDest);
8921 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8922}
8923
// Split a 64-bit scalar unary op into two 32-bit instructions of \p Opcode,
// one per 32-bit half, recombined with a REG_SEQUENCE. If \p Swap is set the
// two result halves are exchanged before recombining.
void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
                                          MachineInstr &Inst, unsigned Opcode,
                                          bool Swap) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; treat it as a 32-bit SGPR.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  // The destination moves to the equivalent VGPR class; each half uses the
  // corresponding 32-bit subregister class.
  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC =
      RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);

  if (Swap)
    std::swap(DestSub0, DestSub1);

  // Recombine the two halves into the full 64-bit result.
  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Queue the new halves so they are legalized/moved to the VALU as needed.
  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
8982
// There is not a vector equivalent of s_mul_u64. For this reason, we need to
// split the s_mul_u64 in 32-bit vector multiplications.
void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
                                     MachineInstr &Inst,
                                     MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  // The expansion below is pure VALU, so promote SGPR subregister classes to
  // their VGPR equivalents before extracting the halves.
  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src0SubRC))
    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src1SubRC))
    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);

  // First, we extract the low 32-bit and high 32-bit values from each of the
  // operands.
  MachineOperand Op0L =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
  MachineOperand Op1L =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
  MachineOperand Op0H =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
  MachineOperand Op1H =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

  // The multiplication is done as follows:
  //
  //                            Op1H  Op1L
  //                          * Op0H  Op0L
  //                       --------------------
  //                       Op1H*Op0L  Op1L*Op0L
  //          + Op1H*Op0H  Op1L*Op0H
  // -----------------------------------------
  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
  //
  //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit
  //  value and that would overflow.
  //  The low 32-bit value is Op1L*Op0L.
  //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).

  Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Op1L_Op0H =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
          .add(Op1L)
          .add(Op0H);

  Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Op1H_Op0L =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
          .add(Op1H)
          .add(Op0L);

  // The carry into the high half is the high 32 bits of Op1L*Op0L.
  Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Carry =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
          .add(Op1L)
          .add(Op0L);

  MachineInstr *LoHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
          .add(Op1L)
          .add(Op0L);

  Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
                          .addReg(Op1L_Op0H_Reg)
                          .addReg(Op1H_Op0L_Reg);

  MachineInstr *HiHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
          .addReg(AddReg)
          .addReg(CarryReg);

  // Recombine the two halves into the full 64-bit result.
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*Op1L_Op0H, MDT);
  legalizeOperands(*Op1H_Op0L, MDT);
  legalizeOperands(*Carry, MDT);
  legalizeOperands(*LoHalf, MDT);
  legalizeOperands(*Add, MDT);
  legalizeOperands(*HiHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
9091
// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
// multiplications.
void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
                                        MachineInstr &Inst,
                                        MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  // The expansion below is pure VALU, so promote SGPR subregister classes to
  // their VGPR equivalents before extracting the halves.
  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src0SubRC))
    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src1SubRC))
    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);

  // First, we extract the low 32-bit and high 32-bit values from each of the
  // operands.
  MachineOperand Op0L =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
  MachineOperand Op1L =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

  // The pseudos guarantee the operands fit in 32 bits, so the full product is
  // just lo*lo with the high half supplied directly by V_MUL_HI (unsigned or
  // signed variant depending on the pseudo).
  unsigned Opc = Inst.getOpcode();
  unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
                        ? AMDGPU::V_MUL_HI_U32_e64
                        : AMDGPU::V_MUL_HI_I32_e64;
  MachineInstr *HiHalf =
      BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);

  MachineInstr *LoHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
          .add(Op1L)
          .add(Op0L);

  // Recombine the two halves into the full 64-bit result.
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*HiHalf, MDT);
  legalizeOperands(*LoHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
9156
// Split a 64-bit scalar binary op into two 32-bit instructions of \p Opcode,
// one per 32-bit half of each source, recombined with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
                                           MachineInstr &Inst, unsigned Opcode,
                                           MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; treat it as a 32-bit SGPR.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);

  // Extract the low and high halves of both sources.
  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  // The destination moves to the equivalent VGPR class; each half uses the
  // corresponding 32-bit subregister class.
  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC =
      RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                              .add(SrcReg0Sub0)
                              .add(SrcReg1Sub0);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                              .add(SrcReg0Sub1)
                              .add(SrcReg1Sub1);

  // Recombine the two halves into the full 64-bit result.
  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Queue the new halves so they are legalized/moved to the VALU as needed.
  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
9223
9224void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9225 MachineInstr &Inst,
9226 MachineDominatorTree *MDT) const {
9227 MachineBasicBlock &MBB = *Inst.getParent();
9228 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9229
9230 MachineOperand &Dest = Inst.getOperand(0);
9231 MachineOperand &Src0 = Inst.getOperand(1);
9232 MachineOperand &Src1 = Inst.getOperand(2);
9233 const DebugLoc &DL = Inst.getDebugLoc();
9234
9235 MachineBasicBlock::iterator MII = Inst;
9236
9237 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9238
9239 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9240
9241 MachineOperand* Op0;
9242 MachineOperand* Op1;
9243
9244 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9245 Op0 = &Src0;
9246 Op1 = &Src1;
9247 } else {
9248 Op0 = &Src1;
9249 Op1 = &Src0;
9250 }
9251
9252 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9253 .add(*Op0);
9254
9255 Register NewDest = MRI.createVirtualRegister(DestRC);
9256
9257 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9258 .addReg(Interm)
9259 .add(*Op1);
9260
9261 MRI.replaceRegWith(Dest.getReg(), NewDest);
9262
9263 Worklist.insert(&Xor);
9264}
9265
// Expand a 64-bit scalar population count into two 32-bit V_BCNT_U32_B32
// instructions, using V_BCNT's second (accumulator) operand to sum the
// counts of the two halves.
void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
                                       MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  // An immediate source has no register class; treat it as a 32-bit SGPR.
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC =
      RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);

  // Extract the low and high halves of the 64-bit source.
  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  // MidReg = bcnt(lo) + 0; ResultReg = bcnt(hi) + MidReg.
  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);

  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
9303
// Expand a 64-bit scalar bit-field extract (S_BFE_I64) on the VALU. Only the
// sign-extend-in-register form (offset 0, width <= 32) is supported: extract
// from the low half, then replicate the sign bit into the high half.
void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
                                      MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  // The packed BFE immediate encodes the offset and width in separate fields.
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    // Narrow extract: V_BFE the low half, then shift arithmetically by 31 to
    // materialize the sign extension in the high half.
    Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
        .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
        .addImm(0)
        .addImm(BitWidth);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: the low half is unchanged; only the high half needs the
  // sign of the low half broadcast into it.
  MachineOperand &Src = Inst.getOperand(1);
  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), {}, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), {}, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
9364
/// Expand a scalar 64-bit bit-scan opcode into 32-bit VALU operations on the
/// two halves, following the patterns shown below. \p Opcode is the 32-bit
/// VALU scan opcode applied per half (V_FFBH_U32_e32 or V_FFBL_B32_e32).
/// \p MDT is unused here.
void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
                                          MachineInstr &Inst, unsigned Opcode,
                                          MachineDominatorTree *MDT) const {
  // (S_FLBIT_I32_B64 hi:lo) ->
  // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
  // (S_FF1_I32_B64 hi:lo) ->
  // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(Opcode);

  bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
  // Prefer the carry-less add when the subtarget has it.
  unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
                                               : AMDGPU::V_ADD_CO_U32_e32;

  // For an immediate source there is no register class to query; treat the
  // halves as SGPR_32 values.
  const TargetRegisterClass *SrcRC =
      Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SrcSubRC =
      RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 =
      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 =
      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);

  Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // Scan each 32-bit half: MidReg1 = scan(lo), MidReg2 = scan(hi).
  BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);

  BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);

  // uaddsat(count, 32): clamped add applied to the low half's count for
  // ctlz, and to the high half's count for cttz (see patterns above).
  BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
      .addReg(IsCtlz ? MidReg1 : MidReg2)
      .addImm(32)
      .addImm(1); // enable clamp

  // Combine the two candidates with an unsigned min.
  BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
      .addReg(MidReg3)
      .addReg(IsCtlz ? MidReg2 : MidReg1);

  MRI.replaceRegWith(Dest.getReg(), MidReg4);

  addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
}
9419
9420void SIInstrInfo::addUsersToMoveToVALUWorklist(
9421 Register DstReg, MachineRegisterInfo &MRI,
9422 SIInstrWorklist &Worklist) const {
9423 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9424 MachineInstr &UseMI = *MO.getParent();
9425
9426 unsigned OpNo = 0;
9427
9428 switch (UseMI.getOpcode()) {
9429 case AMDGPU::COPY:
9430 case AMDGPU::WQM:
9431 case AMDGPU::SOFT_WQM:
9432 case AMDGPU::STRICT_WWM:
9433 case AMDGPU::STRICT_WQM:
9434 case AMDGPU::REG_SEQUENCE:
9435 case AMDGPU::PHI:
9436 case AMDGPU::INSERT_SUBREG:
9437 break;
9438 default:
9439 OpNo = MO.getOperandNo();
9440 break;
9441 }
9442
9443 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9444 MRI.constrainRegClass(DstReg, OpRC);
9445
9446 if (!RI.hasVectorRegisters(OpRC))
9447 Worklist.insert(&UseMI);
9448 else
9449 // Legalization could change user list.
9450 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9451 }
9452}
9453
/// Lower the scalar S_PACK_* pseudos, which pack two 16-bit values into one
/// 32-bit register, into VALU code producing the result in a VGPR.
///
/// On subtargets with real true16 instructions the pack is built as a
/// REG_SEQUENCE over lo16/hi16 subregisters; otherwise it is expanded into
/// 32-bit mask/shift/or VALU instructions. In both paths all users of the
/// old result are queued for further VALU legalization.
void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &Inst) const {
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock *MBB = Inst.getParent();
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  if (ST.useRealTrue16Insts()) {
    // Get each source into a VGPR if it is not in one already: immediates
    // via V_MOV_B32, anything else via COPY.
    Register SrcReg0, SrcReg1;
    if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
      SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, Inst, DL,
              get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
          .add(Src0);
    } else {
      SrcReg0 = Src0.getReg();
    }

    if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
      SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, Inst, DL,
              get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
          .add(Src1);
    } else {
      SrcReg1 = Src1.getReg();
    }

    // If a source can be constrained to a 16-bit VGPR, the register itself
    // holds the 16-bit value and no subregister index is needed; otherwise
    // an explicit lo16 (or hi16, for "H" sources) subregister is used below.
    bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
    bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);

    // Assemble the 32-bit result from the selected 16-bit halves.
    auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
    switch (Inst.getOpcode()) {
    case AMDGPU::S_PACK_LL_B32_B16:
      NewMI
          .addReg(SrcReg0, {},
                  isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
          .addImm(AMDGPU::lo16)
          .addReg(SrcReg1, {},
                  isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
          .addImm(AMDGPU::hi16);
      break;
    case AMDGPU::S_PACK_LH_B32_B16:
      NewMI
          .addReg(SrcReg0, {},
                  isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
          .addImm(AMDGPU::lo16)
          .addReg(SrcReg1, {}, AMDGPU::hi16)
          .addImm(AMDGPU::hi16);
      break;
    case AMDGPU::S_PACK_HL_B32_B16:
      NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
          .addImm(AMDGPU::lo16)
          .addReg(SrcReg1, {},
                  isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
          .addImm(AMDGPU::hi16);
      break;
    case AMDGPU::S_PACK_HH_B32_B16:
      NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
          .addImm(AMDGPU::lo16)
          .addReg(SrcReg1, {}, AMDGPU::hi16)
          .addImm(AMDGPU::hi16);
      break;
    default:
      llvm_unreachable("unhandled s_pack_* instruction");
    }

    MachineOperand &Dest = Inst.getOperand(0);
    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // Non-true16 expansion: build the packed value with 32-bit ALU ops.
  switch (Inst.getOpcode()) {
  case AMDGPU::S_PACK_LL_B32_B16: {
    // result = (Src1 << 16) | (Src0 & 0xffff)
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    // 0.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
        .addImm(0xffff);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
        .addReg(ImmReg, RegState::Kill)
        .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
        .add(Src1)
        .addImm(16)
        .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_LH_B32_B16: {
    // Bitfield insert: low 16 bits from Src0, high 16 bits from Src1.
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
        .addImm(0xffff);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
        .addReg(ImmReg, RegState::Kill)
        .add(Src0)
        .add(Src1);
    break;
  }
  case AMDGPU::S_PACK_HL_B32_B16: {
    // result = (Src1 << 16) | (Src0 >> 16)
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
        .addImm(16)
        .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
        .add(Src1)
        .addImm(16)
        .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_HH_B32_B16: {
    // result = (Src1 & 0xffff0000) | (Src0 >> 16)
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
        .addImm(16)
        .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
        .addImm(0xffff0000);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
        .add(Src1)
        .addReg(ImmReg, RegState::Kill)
        .addReg(TmpReg, RegState::Kill);
    break;
  }
  default:
    llvm_unreachable("unhandled s_pack_* instruction");
  }

  MachineOperand &Dest = Inst.getOperand(0);
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
9591
/// After the SCC def \p Op in \p SCCDefInst is converted to the VALU, rewrite
/// the instructions that consume that SCC value. Plain copies out of SCC are
/// deleted and their destination replaced by \p NewCond; other SCC users are
/// retargeted at \p NewCond (when it is valid) and queued for VALU conversion
/// themselves. The scan stops at the next SCC def, where the old value dies.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
                                               MachineInstr &SCCDefInst,
                                               SIInstrWorklist &Worklist,
                                               Register NewCond) const {

  // Ensure that def inst defines SCC, which is still live.
  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
         !Op.isDead() && Op.getParent() == &SCCDefInst);
  SmallVector<MachineInstr *, 4> CopyToDelete;
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI : // Skip the def inst itself.
       make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
                  SCCDefInst.getParent()->end())) {
    // Check if SCC is used first.
    int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
    if (SCCIdx != -1) {
      if (MI.isCopy()) {
        // A COPY out of SCC becomes a rename: forward NewCond to the copy's
        // users and delete the copy. Deletion is deferred until after the
        // scan so the block iteration stays valid.
        MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
        Register DestReg = MI.getOperand(0).getReg();

        MRI.replaceRegWith(DestReg, NewCond);
        CopyToDelete.push_back(&MI);
      } else {

        if (NewCond.isValid())
          MI.getOperand(SCCIdx).setReg(NewCond);

        Worklist.insert(&MI);
      }
    }
    // Exit if we find another SCC def.
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
      break;
  }
  for (auto &Copy : CopyToDelete)
    Copy->eraseFromParent();
}
9630
9631// Instructions that use SCC may be converted to VALU instructions. When that
9632// happens, the SCC register is changed to VCC_LO. The instruction that defines
9633// SCC must be changed to an instruction that defines VCC. This function makes
9634// sure that the instruction that defines SCC is added to the moveToVALU
9635// worklist.
9636void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9637 SIInstrWorklist &Worklist) const {
9638 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9639 // then there is nothing to do because the defining instruction has been
9640 // converted to a VALU already. If SCC then that instruction needs to be
9641 // converted to a VALU.
9642 for (MachineInstr &MI :
9643 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9644 SCCUseInst->getParent()->rend())) {
9645 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9646 break;
9647 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9648 Worklist.insert(&MI);
9649 break;
9650 }
9651 }
9652}
9653
9654const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9655 const MachineInstr &Inst) const {
9656 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9657
9658 switch (Inst.getOpcode()) {
9659 // For target instructions, getOpRegClass just returns the virtual register
9660 // class associated with the operand, so we need to find an equivalent VGPR
9661 // register class in order to move the instruction to the VALU.
9662 case AMDGPU::COPY:
9663 case AMDGPU::PHI:
9664 case AMDGPU::REG_SEQUENCE:
9665 case AMDGPU::INSERT_SUBREG:
9666 case AMDGPU::WQM:
9667 case AMDGPU::SOFT_WQM:
9668 case AMDGPU::STRICT_WWM:
9669 case AMDGPU::STRICT_WQM: {
9670 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9671 if (RI.isAGPRClass(SrcRC)) {
9672 if (RI.isAGPRClass(NewDstRC))
9673 return nullptr;
9674
9675 switch (Inst.getOpcode()) {
9676 case AMDGPU::PHI:
9677 case AMDGPU::REG_SEQUENCE:
9678 case AMDGPU::INSERT_SUBREG:
9679 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9680 break;
9681 default:
9682 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9683 }
9684
9685 if (!NewDstRC)
9686 return nullptr;
9687 } else {
9688 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9689 return nullptr;
9690
9691 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9692 if (!NewDstRC)
9693 return nullptr;
9694 }
9695
9696 return NewDstRC;
9697 }
9698 default:
9699 return NewDstRC;
9700 }
9701}
9702
9703// Find the one SGPR operand we are allowed to use.
9704Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9705 int OpIndices[3]) const {
9706 const MCInstrDesc &Desc = MI.getDesc();
9707
9708 // Find the one SGPR operand we are allowed to use.
9709 //
9710 // First we need to consider the instruction's operand requirements before
9711 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9712 // of VCC, but we are still bound by the constant bus requirement to only use
9713 // one.
9714 //
9715 // If the operand's class is an SGPR, we can never move it.
9716
9717 Register SGPRReg = findImplicitSGPRRead(MI);
9718 if (SGPRReg)
9719 return SGPRReg;
9720
9721 Register UsedSGPRs[3] = {Register()};
9722 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9723
9724 for (unsigned i = 0; i < 3; ++i) {
9725 int Idx = OpIndices[i];
9726 if (Idx == -1)
9727 break;
9728
9729 const MachineOperand &MO = MI.getOperand(Idx);
9730 if (!MO.isReg())
9731 continue;
9732
9733 // Is this operand statically required to be an SGPR based on the operand
9734 // constraints?
9735 const TargetRegisterClass *OpRC =
9736 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9737 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9738 if (IsRequiredSGPR)
9739 return MO.getReg();
9740
9741 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9742 Register Reg = MO.getReg();
9743 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9744 if (RI.isSGPRClass(RegRC))
9745 UsedSGPRs[i] = Reg;
9746 }
9747
9748 // We don't have a required SGPR operand, so we have a bit more freedom in
9749 // selecting operands to move.
9750
9751 // Try to select the most used SGPR. If an SGPR is equal to one of the
9752 // others, we choose that.
9753 //
9754 // e.g.
9755 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9756 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9757
9758 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9759 // prefer those.
9760
9761 if (UsedSGPRs[0]) {
9762 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9763 SGPRReg = UsedSGPRs[0];
9764 }
9765
9766 if (!SGPRReg && UsedSGPRs[1]) {
9767 if (UsedSGPRs[1] == UsedSGPRs[2])
9768 SGPRReg = UsedSGPRs[1];
9769 }
9770
9771 return SGPRReg;
9772}
9773
9775 AMDGPU::OpName OperandName) const {
9776 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9777 return nullptr;
9778
9779 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9780 if (Idx == -1)
9781 return nullptr;
9782
9783 return &MI.getOperand(Idx);
9784}
9785
9787 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9788 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9791 return (Format << 44) |
9792 (1ULL << 56) | // RESOURCE_LEVEL = 1
9793 (3ULL << 60); // OOB_SELECT = 3
9794 }
9795
9796 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9797 if (ST.isAmdHsaOS()) {
9798 // Set ATC = 1. GFX9 doesn't have this bit.
9799 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9800 RsrcDataFormat |= (1ULL << 56);
9801
9802 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9803 // BTW, it disables TC L2 and therefore decreases performance.
9804 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9805 RsrcDataFormat |= (2ULL << 59);
9806 }
9807
9808 return RsrcDataFormat;
9809}
9810
9814 0xffffffff; // Size;
9815
9816 // GFX9 doesn't have ELEMENT_SIZE.
9817 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9818 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9819 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9820 }
9821
9822 // IndexStride = 64 / 32.
9823 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9824 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9825
9826 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9827 // Clear them unless we want a huge stride.
9828 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9829 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9830 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9831
9832 return Rsrc23;
9833}
9834
9836 unsigned Opc = MI.getOpcode();
9837
9838 return isSMRD(Opc);
9839}
9840
9842 return get(Opc).mayLoad() &&
9843 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9844}
9845
9847 TypeSize &MemBytes) const {
9848 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9849 if (!Addr || !Addr->isFI())
9850 return Register();
9851
9852 assert(!MI.memoperands_empty() &&
9853 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9854
9855 FrameIndex = Addr->getIndex();
9856
9857 int VDataIdx =
9858 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
9859 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), VDataIdx));
9860 return MI.getOperand(VDataIdx).getReg();
9861}
9862
9864 TypeSize &MemBytes) const {
9865 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9866 assert(Addr && Addr->isFI());
9867 FrameIndex = Addr->getIndex();
9868
9869 int DataIdx =
9870 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::data);
9871 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), DataIdx));
9872 return MI.getOperand(DataIdx).getReg();
9873}
9874
9876 int &FrameIndex,
9877 TypeSize &MemBytes) const {
9878 if (!MI.mayLoad())
9879 return Register();
9880
9881 if (isMUBUF(MI) || isVGPRSpill(MI))
9882 return isStackAccess(MI, FrameIndex, MemBytes);
9883
9884 if (isSGPRSpill(MI))
9885 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9886
9887 return Register();
9888}
9889
9891 int &FrameIndex,
9892 TypeSize &MemBytes) const {
9893 if (!MI.mayStore())
9894 return Register();
9895
9896 if (isMUBUF(MI) || isVGPRSpill(MI))
9897 return isStackAccess(MI, FrameIndex, MemBytes);
9898
9899 if (isSGPRSpill(MI))
9900 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9901
9902 return Register();
9903}
9904
9906 unsigned Size = 0;
9908 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9909 while (++I != E && I->isInsideBundle()) {
9910 assert(!I->isBundle() && "No nested bundle!");
9912 }
9913
9914 return Size;
9915}
9916
9918 unsigned Opc = MI.getOpcode();
9920 unsigned DescSize = Desc.getSize();
9921
9922 // If we have a definitive size, we can use it. Otherwise we need to inspect
9923 // the operands to know the size.
9924 if (isFixedSize(MI)) {
9925 unsigned Size = DescSize;
9926
9927 // If we hit the buggy offset, an extra nop will be inserted in MC so
9928 // estimate the worst case.
9929 if (MI.isBranch() && ST.hasOffset3fBug())
9930 Size += 4;
9931
9932 return Size;
9933 }
9934
9935 // Instructions may have a 32-bit literal encoded after them. Check
9936 // operands that could ever be literals.
9937 if (isVALU(MI) || isSALU(MI)) {
9938 if (isDPP(MI))
9939 return DescSize;
9940 bool HasLiteral = false;
9941 unsigned LiteralSize = 4;
9942 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9943 const MachineOperand &Op = MI.getOperand(I);
9944 const MCOperandInfo &OpInfo = Desc.operands()[I];
9945 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9946 HasLiteral = true;
9947 if (ST.has64BitLiterals()) {
9948 switch (OpInfo.OperandType) {
9949 default:
9950 break;
9952 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9953 LiteralSize = 8;
9954 break;
9956 // A 32-bit literal is only valid when the value fits in BOTH signed
9957 // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code
9958 // emitter's getLit64Encoding logic. This is because of the lack of
        // ability to tell signedness of the literal, therefore we need to
9960 // be conservative and assume values outside this range require a
9961 // 64-bit literal encoding (8 bytes).
9962 if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
9963 !isUInt<32>(Op.getImm()))
9964 LiteralSize = 8;
9965 break;
9966 }
9967 }
9968 break;
9969 }
9970 }
9971 return HasLiteral ? DescSize + LiteralSize : DescSize;
9972 }
9973
9974 // Check whether we have extra NSA words.
9975 if (isMIMG(MI)) {
9976 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9977 if (VAddr0Idx < 0)
9978 return 8;
9979
9980 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9981 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9982 }
9983
9984 switch (Opc) {
9985 case TargetOpcode::BUNDLE:
9986 return getInstBundleSize(MI);
9987 case TargetOpcode::INLINEASM:
9988 case TargetOpcode::INLINEASM_BR: {
9989 const MachineFunction *MF = MI.getMF();
9990 const char *AsmStr = MI.getOperand(0).getSymbolName();
9991 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9992 }
9993 default:
9994 if (MI.isMetaInstruction())
9995 return 0;
9996
9997 // If D16 Pseudo inst, get correct MC code size
9998 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9999 if (D16Info) {
10000 // Assume d16_lo/hi inst are always in same size
10001 unsigned LoInstOpcode = D16Info->LoOp;
10002 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
10003 DescSize = Desc.getSize();
10004 }
10005
10006 // If FMA Pseudo inst, get correct MC code size
10007 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
10008 // All potential lowerings are the same size; arbitrarily pick one.
10009 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
10010 DescSize = Desc.getSize();
10011 }
10012
10013 return DescSize;
10014 }
10015}
10016
10018 if (!isFLAT(MI))
10019 return false;
10020
10021 if (MI.memoperands_empty())
10022 return true;
10023
10024 for (const MachineMemOperand *MMO : MI.memoperands()) {
10025 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
10026 return true;
10027 }
10028 return false;
10029}
10030
10033 static const std::pair<int, const char *> TargetIndices[] = {
10034 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
10035 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
10036 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
10037 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
10038 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
10039 return ArrayRef(TargetIndices);
10040}
10041
10042/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
10043/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
10046 const ScheduleDAG *DAG) const {
10047 return new GCNHazardRecognizer(DAG->MF);
10048}
10049
10050/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
10051/// pass.
10054 MachineLoopInfo *MLI) const {
10055 return new GCNHazardRecognizer(MF, MLI);
10056}
10057
10058// Called during:
10059// - pre-RA scheduling and post-RA scheduling
10062 const ScheduleDAGMI *DAG) const {
10063 // Borrowed from Arm Target
10064 // We would like to restrict this hazard recognizer to only
10065 // post-RA scheduling; we can tell that we're post-RA because we don't
10066 // track VRegLiveness.
10067 if (!DAG->hasVRegLiveness())
10068 return new GCNHazardRecognizer(DAG->MF);
10070}
10071
10072std::pair<unsigned, unsigned>
10074 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
10075}
10076
10079 static const std::pair<unsigned, const char *> TargetFlags[] = {
10080 {MO_GOTPCREL, "amdgpu-gotprel"},
10081 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
10082 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
10083 {MO_GOTPCREL64, "amdgpu-gotprel64"},
10084 {MO_REL32_LO, "amdgpu-rel32-lo"},
10085 {MO_REL32_HI, "amdgpu-rel32-hi"},
10086 {MO_REL64, "amdgpu-rel64"},
10087 {MO_ABS32_LO, "amdgpu-abs32-lo"},
10088 {MO_ABS32_HI, "amdgpu-abs32-hi"},
10089 {MO_ABS64, "amdgpu-abs64"},
10090 };
10091
10092 return ArrayRef(TargetFlags);
10093}
10094
10097 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10098 {
10099 {MONoClobber, "amdgpu-noclobber"},
10100 {MOLastUse, "amdgpu-last-use"},
10101 {MOCooperative, "amdgpu-cooperative"},
10102 {MOThreadPrivate, "amdgpu-thread-private"},
10103 };
10104
10105 return ArrayRef(TargetFlags);
10106}
10107
10109 const MachineFunction &MF) const {
10111 assert(SrcReg.isVirtual());
10112 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
10113 return AMDGPU::WWM_COPY;
10114
10115 return AMDGPU::COPY;
10116}
10117
10119 uint32_t Opcode = MI.getOpcode();
10120 // Check if it is SGPR spill or wwm-register spill Opcode.
10121 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10122 return true;
10123
10124 const MachineFunction *MF = MI.getMF();
10125 const MachineRegisterInfo &MRI = MF->getRegInfo();
10127
10128 // See if this is Liverange split instruction inserted for SGPR or
10129 // wwm-register. The implicit def inserted for wwm-registers should also be
10130 // included as they can appear at the bb begin.
10131 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
10132 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10133 return false;
10134
10135 Register Reg = MI.getOperand(0).getReg();
10136 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
10137 return IsLRSplitInst;
10138
10139 return MFI->isWWMReg(Reg);
10140}
10141
10143 Register Reg) const {
10144 // We need to handle instructions which may be inserted during register
10145 // allocation to handle the prolog. The initial prolog instruction may have
10146 // been separated from the start of the block by spills and copies inserted
10147 // needed by the prolog. However, the insertions for scalar registers can
10148 // always be placed at the BB top as they are independent of the exec mask
10149 // value.
10150 bool IsNullOrVectorRegister = true;
10151 if (Reg) {
10152 const MachineFunction *MF = MI.getMF();
10153 const MachineRegisterInfo &MRI = MF->getRegInfo();
10154 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
10155 }
10156
10157 return IsNullOrVectorRegister &&
10158 (canAddToBBProlog(MI) ||
10159 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10160 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
10161}
10162
10166 const DebugLoc &DL,
10167 Register DestReg) const {
10168 if (ST.hasAddNoCarryInsts())
10169 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
10170
10171 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10172 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
10173 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
10174
10175 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10176 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10177}
10178
10181 const DebugLoc &DL,
10182 Register DestReg,
10183 RegScavenger &RS) const {
10184 if (ST.hasAddNoCarryInsts())
10185 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10186
10187 // If available, prefer to use vcc.
10188 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10189 ? Register(RI.getVCC())
10190 : RS.scavengeRegisterBackwards(
10191 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10192 0, /* AllowSpill */ false);
10193
10194 // TODO: Users need to deal with this.
10195 if (!UnusedCarry.isValid())
10196 return MachineInstrBuilder();
10197
10198 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10199 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10200}
10201
10202bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10203 switch (Opcode) {
10204 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10205 case AMDGPU::SI_KILL_I1_TERMINATOR:
10206 return true;
10207 default:
10208 return false;
10209 }
10210}
10211
10213 switch (Opcode) {
10214 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10215 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10216 case AMDGPU::SI_KILL_I1_PSEUDO:
10217 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10218 default:
10219 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10220 }
10221}
10222
/// Return true if \p Imm fits in the unsigned immediate-offset field of a
/// MUBUF instruction on the current subtarget (the field width depends on
/// the generation; see getMaxMUBUFImmOffset).
bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
  return Imm <= getMaxMUBUFImmOffset(ST);
}
10226
10228 // GFX12 field is non-negative 24-bit signed byte offset.
10229 const unsigned OffsetBits =
10230 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10231 return (1 << OffsetBits) - 1;
10232}
10233
10235 if (!ST.isWave32())
10236 return;
10237
10238 if (MI.isInlineAsm())
10239 return;
10240
10241 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10242 return;
10243
10244 for (auto &Op : MI.implicit_operands()) {
10245 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10246 Op.setReg(AMDGPU::VCC_LO);
10247 }
10248}
10249
10251 if (!isSMRD(MI))
10252 return false;
10253
10254 // Check that it is using a buffer resource.
10255 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10256 if (Idx == -1) // e.g. s_memtime
10257 return false;
10258
10259 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10260 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10261}
10262
10263// Given Imm, split it into the values to put into the SOffset and ImmOffset
10264// fields in an MUBUF instruction. Return false if it is not possible (due to a
10265// hardware bug needing a workaround).
10266//
10267// The required alignment ensures that individual address components remain
10268// aligned if they are aligned to begin with. It also ensures that additional
10269// offsets within the given alignment can be added to the resulting ImmOffset.
10271 uint32_t &ImmOffset, Align Alignment) const {
10272 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10273 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10274 uint32_t Overflow = 0;
10275
10276 if (Imm > MaxImm) {
10277 if (Imm <= MaxImm + 64) {
10278 // Use an SOffset inline constant for 4..64
10279 Overflow = Imm - MaxImm;
10280 Imm = MaxImm;
10281 } else {
10282 // Try to keep the same value in SOffset for adjacent loads, so that
10283 // the corresponding register contents can be re-used.
10284 //
10285 // Load values with all low-bits (except for alignment bits) set into
10286 // SOffset, so that a larger range of values can be covered using
10287 // s_movk_i32.
10288 //
10289 // Atomic operations fail to work correctly when individual address
10290 // components are unaligned, even if their sum is aligned.
10291 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10292 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10293 Imm = Low;
10294 Overflow = High - Alignment.value();
10295 }
10296 }
10297
10298 if (Overflow > 0) {
10299 // There is a hardware bug in SI and CI which prevents address clamping in
10300 // MUBUF instructions from working correctly with SOffsets. The immediate
10301 // offset is unaffected.
10302 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10303 return false;
10304
10305 // It is not possible to set immediate in SOffset field on some targets.
10306 if (ST.hasRestrictedSOffset())
10307 return false;
10308 }
10309
10310 ImmOffset = Imm;
10311 SOffset = Overflow;
10312 return true;
10313}
10314
10315// Depending on the used address space and instructions, some immediate offsets
10316// are allowed and some are not.
10317// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10318// scratch instruction offsets can also be negative. On GFX12, offsets can be
10319// negative for all variants.
10320//
10321// There are several bugs related to these offsets:
10322// On gfx10.1, flat instructions that go into the global address space cannot
10323// use an offset.
10324//
10325// For scratch instructions, the address can be either an SGPR or a VGPR.
10326// The following offsets can be used, depending on the architecture (x means
10327// cannot be used):
10328// +----------------------------+------+------+
10329// | Address-Mode | SGPR | VGPR |
10330// +----------------------------+------+------+
10331// | gfx9 | | |
10332// | negative, 4-aligned offset | x | ok |
10333// | negative, unaligned offset | x | ok |
10334// +----------------------------+------+------+
10335// | gfx10 | | |
10336// | negative, 4-aligned offset | ok | ok |
10337// | negative, unaligned offset | ok | x |
10338// +----------------------------+------+------+
10339// | gfx10.3 | | |
10340// | negative, 4-aligned offset | ok | ok |
10341// | negative, unaligned offset | ok | ok |
10342// +----------------------------+------+------+
10343//
10344// This function ignores the addressing mode, so if an offset cannot be used in
10345// one addressing mode, it is considered illegal.
10346bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10347 uint64_t FlatVariant) const {
10348 // TODO: Should 0 be special cased?
10349 if (!ST.hasFlatInstOffsets())
10350 return false;
10351
10352 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10353 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10354 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10355 return false;
10356
10357 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10358 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10359 (Offset % 4) != 0) {
10360 return false;
10361 }
10362
10363 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10364 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10365 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10366}
10367
10368// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10369std::pair<int64_t, int64_t>
10370SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10371 uint64_t FlatVariant) const {
10372 int64_t RemainderOffset = COffsetVal;
10373 int64_t ImmField = 0;
10374
10375 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10376 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10377
10378 if (AllowNegative) {
10379 // Use signed division by a power of two to truncate towards 0.
10380 int64_t D = 1LL << NumBits;
10381 RemainderOffset = (COffsetVal / D) * D;
10382 ImmField = COffsetVal - RemainderOffset;
10383
10384 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10385 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10386 (ImmField % 4) != 0) {
10387 // Make ImmField a multiple of 4
10388 RemainderOffset += ImmField % 4;
10389 ImmField -= ImmField % 4;
10390 }
10391 } else if (COffsetVal >= 0) {
10392 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10393 RemainderOffset = COffsetVal - ImmField;
10394 }
10395
10396 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10397 assert(RemainderOffset + ImmField == COffsetVal);
10398 return {ImmField, RemainderOffset};
10399}
10400
// Return whether this flat variant may encode a negative immediate offset:
// scratch is excluded on subtargets with the negative-scratch-offset bug, and
// plain FLAT only allows it on GFX12+.
// NOTE(review): the function's signature line (10401) is missing from this
// extraction — presumably SIInstrInfo::allowNegativeFlatOffset; confirm
// against upstream.
10402 if (ST.hasNegativeScratchOffsetBug() &&
10403 FlatVariant == SIInstrFlags::FlatScratch)
10404 return false;
10405
10406 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10407}
10408
// Map the subtarget's generation to the SIEncodingFamily used to look up the
// real (MC) opcode for a pseudo instruction.
// NOTE(review): several `case` labels (lines 10413-10428) are missing from
// this extraction, so the generation -> family mapping shown here is
// incomplete — confirm against upstream before relying on it.
10409static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10410 switch (ST.getGeneration()) {
10411 default:
10412 break;
10415 return SIEncodingFamily::SI;
10418 return SIEncodingFamily::VI;
10422 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10425 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10429 }
10430 llvm_unreachable("Unknown subtarget generation!");
10431}
10432
10433bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10434 switch(MCOp) {
10435 // These opcodes use indirect register addressing so
10436 // they need special handling by codegen (currently missing).
10437 // Therefore it is too risky to allow these opcodes
10438 // to be selected by dpp combiner or sdwa peepholer.
10439 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10440 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10441 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10442 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10443 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10444 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10445 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10446 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10447 return true;
10448 default:
10449 return false;
10450 }
10451}
10452
// Expand one logical opcode into all of its encoding flavors so the switch
// below covers every variant of a renamed instruction.
10453#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10454 case OPCODE##_dpp: \
10455 case OPCODE##_e32: \
10456 case OPCODE##_e64: \
10457 case OPCODE##_e64_dpp: \
10458 case OPCODE##_sdwa:
10459
// Return true for pseudo opcodes whose real instruction was renamed on GFX9
// and therefore needs the dedicated GFX9 encoding family.
10460static bool isRenamedInGFX9(int Opcode) {
10461 switch (Opcode) {
10462 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10463 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10464 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10465 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10466 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10467 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10468 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10469 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10470 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10471 // Individually renamed opcodes:
10472 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10473 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10474 case AMDGPU::V_FMA_F16_gfx9_e64:
10475 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10476 case AMDGPU::V_INTERP_P2_F16:
10477 case AMDGPU::V_MAD_F16_e64:
10478 case AMDGPU::V_MAD_U16_e64:
10479 case AMDGPU::V_MAD_I16_e64:
10480 return true;
10481 default:
10482 return false;
10483 }
10484}
10485
// Translate a pseudo opcode into the real MC opcode for this subtarget's
// encoding family, or return -1 if the pseudo has no encoding here.
// NOTE(review): multiple assignment/return lines (e.g. the bodies of the
// `if`s at 10493/10499, the SDWA switch cases, and the GFX90A/GFX940 remap
// lines) are missing from this extraction — confirm against upstream.
10486int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10487 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10488 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10489
10490 unsigned Gen = subtargetEncodingFamily(ST);
10491
10492 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10494
10495 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10496 // subtarget has UnpackedD16VMem feature.
10497 // TODO: remove this when we discard GFX80 encoding.
10498 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10500
10501 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10502 switch (ST.getGeneration()) {
10503 default:
10505 break;
10508 break;
10511 break;
10512 }
10513 }
10514
10515 if (isMAI(Opcode)) {
10516 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10517 if (MFMAOp != -1)
10518 Opcode = MFMAOp;
10519 }
10520
10521 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10522
10523 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10525
10526 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10528
10529 // -1 means that Opcode is already a native instruction.
10530 if (MCOp == -1)
10531 return Opcode;
10532
10533 if (ST.hasGFX90AInsts()) {
10534 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10535 if (ST.hasGFX940Insts())
10537 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10539 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10541 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10542 MCOp = NMCOp;
10543 }
10544
10545 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10546 // encoding in the given subtarget generation.
10547 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10548 return -1;
10549
10550 if (isAsmOnlyOpcode(MCOp))
10551 return -1;
10552
10553 return MCOp;
10554}
10555
// Wrap a register MachineOperand as a RegSubRegPair, mapping an undef operand
// to the default (empty) pair.
// NOTE(review): the declaration line (10557) is missing from this extraction
// — presumably `TargetInstrInfo::RegSubRegPair getRegOrUndef(const
// MachineOperand &RegOpnd)`; confirm against upstream.
10556static
10558 assert(RegOpnd.isReg());
10559 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10560 getRegSubRegPair(RegOpnd);
10561}
10562
// Search a REG_SEQUENCE's (reg, subreg-index) operand pairs for the requested
// subregister index and return the matching source as a RegSubRegPair.
// NOTE(review): the signature (lines 10563-10564) and the fall-through return
// (line 10571) are missing from this extraction — confirm against upstream.
10565 assert(MI.isRegSequence());
// Operands are: dst, then (src, subreg-index) pairs; iterate over the pairs.
10566 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10567 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10568 auto &RegOp = MI.getOperand(1 + 2 * I);
10569 return getRegOrUndef(RegOp);
10570 }
10572}
10573
10574// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10575// Following a subreg of reg:subreg isn't supported
// On success, RSR is updated in place to the source of the requested subreg
// and true is returned; false means the def could not be followed.
// NOTE(review): the signature lines (10576-10577) are missing from this
// extraction — confirm against upstream.
10578 if (!RSR.SubReg)
10579 return false;
10580 switch (MI.getOpcode()) {
10581 default: break;
10582 case AMDGPU::REG_SEQUENCE:
10583 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10584 return true;
10585 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
10586 case AMDGPU::INSERT_SUBREG:
10587 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10588 // inserted the subreg we're looking for
10589 RSR = getRegOrUndef(MI.getOperand(2));
10590 else { // the subreg in the rest of the reg
10591 auto R1 = getRegOrUndef(MI.getOperand(1));
10592 if (R1.SubReg) // subreg of subreg isn't supported
10593 return false;
10594 RSR.Reg = R1.Reg;
10595 }
10596 return true;
10597 }
10598 return false;
10599}
10600
// Walk backwards through COPY / V_MOV_B32_e32 / subreg-manipulation pseudos
// to find the instruction that ultimately defines the given reg:subreg pair.
// Returns nullptr for physical registers, undef sources, or when the chain
// cannot be followed. Requires SSA form.
// NOTE(review): the first signature line (10601) is missing from this
// extraction — confirm against upstream.
10602 const MachineRegisterInfo &MRI) {
10603 assert(MRI.isSSA());
10604 if (!P.Reg.isVirtual())
10605 return nullptr;
10606
10607 auto RSR = P;
10608 auto *DefInst = MRI.getVRegDef(RSR.Reg);
// Loop invariant: DefInst is the def of RSR.Reg; it is cleared each
// iteration and only re-set when the chain can be followed further.
10609 while (auto *MI = DefInst) {
10610 DefInst = nullptr;
10611 switch (MI->getOpcode()) {
10612 case AMDGPU::COPY:
10613 case AMDGPU::V_MOV_B32_e32: {
10614 auto &Op1 = MI->getOperand(1);
10615 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10616 if (Op1.isUndef())
10617 return nullptr;
10618 RSR = getRegSubRegPair(Op1);
10619 DefInst = MRI.getVRegDef(RSR.Reg);
10620 }
10621 break;
10622 }
10623 default:
10624 if (followSubRegDef(*MI, RSR)) {
10625 if (!RSR.Reg)
10626 return nullptr;
10627 DefInst = MRI.getVRegDef(RSR.Reg);
10628 }
10629 }
10630 if (!DefInst)
10631 return MI;
10632 }
10633 return nullptr;
10634}
10635
// Conservatively answer whether EXEC may be modified between DefMI and UseMI.
// Returns true (i.e. "may be modified") when the instructions are in
// different blocks, the scan limit is exceeded, or an EXEC write is found.
// Requires SSA form.
// NOTE(review): the first signature line (10636) is missing from this
// extraction — confirm against upstream.
10637 Register VReg,
10638 const MachineInstr &DefMI,
10639 const MachineInstr &UseMI) {
10640 assert(MRI.isSSA() && "Must be run on SSA");
10641
10642 auto *TRI = MRI.getTargetRegisterInfo();
10643 auto *DefBB = DefMI.getParent();
10644
10645 // Don't bother searching between blocks, although it is possible this block
10646 // doesn't modify exec.
10647 if (UseMI.getParent() != DefBB)
10648 return true;
10649
10650 const int MaxInstScan = 20;
10651 int NumInst = 0;
10652
10653 // Stop scan at the use.
10654 auto E = UseMI.getIterator();
10655 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10656 if (I->isDebugInstr())
10657 continue;
10658
10659 if (++NumInst > MaxInstScan)
10660 return true;
10661
10662 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10663 return true;
10664 }
10665
10666 return false;
10667}
10668
// Conservatively answer whether EXEC may be modified between DefMI and any
// use of VReg. Returns true when any use is in another block or is a PHI,
// when scan limits are exceeded, or when an EXEC write is found before the
// last use. Requires SSA form.
// NOTE(review): the first signature line (10669) is missing from this
// extraction — confirm against upstream.
10670 Register VReg,
10671 const MachineInstr &DefMI) {
10672 assert(MRI.isSSA() && "Must be run on SSA");
10673
10674 auto *TRI = MRI.getTargetRegisterInfo();
10675 auto *DefBB = DefMI.getParent();
10676
10677 const int MaxUseScan = 10;
10678 int NumUse = 0;
10679
// First pass: count non-debug uses and bail out on any use we cannot scan.
10680 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10681 auto &UseInst = *Use.getParent();
10682 // Don't bother searching between blocks, although it is possible this block
10683 // doesn't modify exec.
10684 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10685 return true;
10686
10687 if (++NumUse > MaxUseScan)
10688 return true;
10689 }
10690
10691 if (NumUse == 0)
10692 return false;
10693
10694 const int MaxInstScan = 20;
10695 int NumInst = 0;
10696
10697 // Stop scan when we have seen all the uses.
10698 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10699 assert(I != DefBB->end());
10700
10701 if (I->isDebugInstr())
10702 continue;
10703
10704 if (++NumInst > MaxInstScan)
10705 return true;
10706
10707 for (const MachineOperand &Op : I->operands()) {
10708 // We don't check reg masks here as they're used only on calls:
10709 // 1. EXEC is only considered const within one BB
10710 // 2. Call should be a terminator instruction if present in a BB
10711
10712 if (!Op.isReg())
10713 continue;
10714
10715 Register Reg = Op.getReg();
10716 if (Op.isUse()) {
10717 if (Reg == VReg && --NumUse == 0)
10718 return false;
10719 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10720 return true;
10721 }
10722 }
10723}
10724
// Place the PHI-destination COPY before the first non-PHI reader of Dst in
// this block (if one exists before LastPHIIt); otherwise defer to the
// TargetInstrInfo default placement.
// NOTE(review): the signature lines (10725-10726) are missing from this
// extraction — this is SIInstrInfo::createPHIDestinationCopy; confirm
// against upstream.
10727 const DebugLoc &DL, Register Src, Register Dst) const {
10728 auto Cur = MBB.begin();
10729 if (Cur != MBB.end())
10730 do {
10731 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10732 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10733 ++Cur;
10734 } while (Cur != MBB.end() && Cur != LastPHIIt);
10735
10736 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10737 Dst);
10738}
10739
// When the insertion point is an SI_IF / SI_ELSE / SI_IF_BREAK that defines
// Src, emit the copy after it as an exec-implicit terminator move so control
// flow lowering stays valid; otherwise defer to the TargetInstrInfo default.
// NOTE(review): the signature lines (10740-10741) are missing from this
// extraction — this is SIInstrInfo::createPHISourceCopy; confirm against
// upstream.
10742 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10743 if (InsPt != MBB.end() &&
10744 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10745 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10746 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10747 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10748 InsPt++;
10749 return BuildMI(MBB, InsPt, DL,
10750 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10751 .addReg(Src, {}, SrcSubReg)
10752 .addReg(AMDGPU::EXEC, RegState::Implicit)
10753 ;
10754 }
10755 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10756 Dst);
10757}
10757
10758bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10759
10762 MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr *&CopyMI,
10763 LiveIntervals *LIS, VirtRegMap *VRM) const {
10764 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10765 //
10766 // %0:sreg_32 = COPY $m0
10767 //
10768 // We explicitly chose SReg_32 for the virtual register so such a copy might
10769 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10770 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10771 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10772 // TargetInstrInfo::foldMemoryOperand() is going to try.
10773 // A similar issue also exists with spilling and reloading $exec registers.
10774 //
10775 // To prevent that, constrain the %0 register class here.
10776 if (isFullCopyInstr(MI)) {
10777 Register DstReg = MI.getOperand(0).getReg();
10778 Register SrcReg = MI.getOperand(1).getReg();
10779 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10780 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10781 MachineRegisterInfo &MRI = MF.getRegInfo();
10782 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10783 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10784 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10785 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10786 return nullptr;
10787 }
10788 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10789 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10790 return nullptr;
10791 }
10792 }
10793 }
10794
10795 return nullptr;
10796}
10797
// Compute the scheduling latency of MI. A bundle's latency is the maximum
// latency of its bundled instructions plus (count - 1) for issue slots; a
// plain instruction defers to the scheduling model.
// NOTE(review): the signature lines (10798 and part of 10802) are missing
// from this extraction — this is SIInstrInfo::getInstrLatency; confirm
// against upstream.
10799 const MachineInstr &MI,
10800 unsigned *PredCost) const {
10801 if (MI.isBundle()) {
10803 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10804 unsigned Lat = 0, Count = 0;
10805 for (++I; I != E && I->isBundledWithPred(); ++I) {
10806 ++Count;
10807 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10808 }
10809 return Lat + Count - 1;
10810 }
10811
10812 return SchedModel.computeInstrLatency(&MI);
10813}
10814
// Return the call-target operand of MI, preferring the named src0 operand
// when present.
// NOTE(review): the rest of the signature (line 10816) and the fallback
// return (line 10820) are missing from this extraction — confirm against
// upstream.
10815const MachineOperand &
10817 if (const MachineOperand *CallAddrOp =
10818 getNamedOperand(MI, AMDGPU::OpName::src0))
10819 return *CallAddrOp;
10821}
10822
// Classify the uniformity of a generic (pre-ISel) machine instruction:
// address-space casts from private to flat with globally addressable scratch,
// private/flat loads, and atomics are treated specially.
// NOTE(review): the signature lines (10823-10824) and several return lines
// (e.g. 10839-10840, 10851-10854, 10865, 10884-10886, 10893, 10895) are
// missing from this extraction — confirm against upstream.
10825 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10826 unsigned Opcode = MI.getOpcode();
10827
// Private -> flat casts are non-uniform when scratch is globally
// addressable, because each lane maps scratch differently.
10828 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10829 Register Dst = MI.getOperand(0).getReg();
10830 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10831 : MI.getOperand(1).getReg();
10832 LLT DstTy = MRI.getType(Dst);
10833 LLT SrcTy = MRI.getType(Src);
10834 unsigned DstAS = DstTy.getAddressSpace();
10835 unsigned SrcAS = SrcTy.getAddressSpace();
10836 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10837 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10838 ST.hasGloballyAddressableScratch()
10841 };
10842
10843 // If the target supports globally addressable scratch, the mapping from
10844 // scratch memory to the flat aperture changes therefore an address space cast
10845 // is no longer uniform.
10846 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10847 return HandleAddrSpaceCast(MI);
10848
10849 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10850 auto IID = GI->getIntrinsicID();
10855
10856 switch (IID) {
10857 case Intrinsic::amdgcn_addrspacecast_nonnull:
10858 return HandleAddrSpaceCast(MI);
10859 case Intrinsic::amdgcn_if:
10860 case Intrinsic::amdgcn_else:
10861 // FIXME: Uniform if second result
10862 break;
10863 }
10864
10866 }
10867
10868 // Loads from the private and flat address spaces are divergent, because
10869 // threads can execute the load instruction with the same inputs and get
10870 // different results.
10871 //
10872 // All other loads are not divergent, because if threads issue loads with the
10873 // same arguments, they will always get the same result.
10874 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10875 Opcode == AMDGPU::G_SEXTLOAD) {
10876 if (MI.memoperands_empty())
10877 return ValueUniformity::NeverUniform; // conservative assumption
10878
10879 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10880 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10881 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10882 })) {
10883 // At least one MMO in a non-global address space.
10885 }
10887 }
10888
// Generic atomic RMW / cmpxchg operations are handled below.
10889 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10890 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10891 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10892 AMDGPU::isGenericAtomic(Opcode)) {
10894 }
10896}
10897
// Lazily construct and return the AMDGPU MIR formatter (cached in Formatter).
// NOTE(review): the signature line (10898) is missing from this extraction —
// this is SIInstrInfo::getMIRFormatter; confirm against upstream.
10899 if (!Formatter)
10900 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10901 return Formatter.get();
10902}
10903
10905
10906 if (isNeverUniform(MI))
10908
10909 unsigned opcode = MI.getOpcode();
10910 if (opcode == AMDGPU::V_READLANE_B32 ||
10911 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10912 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10914
10915 if (isCopyInstr(MI)) {
10916 const MachineOperand &srcOp = MI.getOperand(1);
10917 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10918 const TargetRegisterClass *regClass =
10919 RI.getPhysRegBaseClass(srcOp.getReg());
10920 return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
10922 }
10924 }
10925
10926 // GMIR handling
10927 if (MI.isPreISelOpcode())
10929
10930 // Atomics are divergent because they are executed sequentially: when an
10931 // atomic operation refers to the same address in each thread, then each
10932 // thread after the first sees the value written by the previous thread as
10933 // original value.
10934
10935 if (isAtomic(MI))
10937
10938 // Loads from the private and flat address spaces are divergent, because
10939 // threads can execute the load instruction with the same inputs and get
10940 // different results.
10941 if (isFLAT(MI) && MI.mayLoad()) {
10942 if (MI.memoperands_empty())
10943 return ValueUniformity::NeverUniform; // conservative assumption
10944
10945 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10946 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10947 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10948 })) {
10949 // At least one MMO in a non-global address space.
10951 }
10952
10954 }
10955
10956 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10957 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10958
10959 // FIXME: It's conceptually broken to report this for an instruction, and not
10960 // a specific def operand. For inline asm in particular, there could be mixed
10961 // uniform and divergent results.
10962 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10963 const MachineOperand &SrcOp = MI.getOperand(I);
10964 if (!SrcOp.isReg())
10965 continue;
10966
10967 Register Reg = SrcOp.getReg();
10968 if (!Reg || !SrcOp.readsReg())
10969 continue;
10970
10971 // If RegBank is null, this is unassigned or an unallocatable special
10972 // register, which are all scalars.
10973 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10974 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10976 }
10977
10978 // TODO: Uniformity check condtions above can be rearranged for more
10979 // redability
10980
10981 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10982 // currently turned into no-op COPYs by SelectionDAG ISel and are
10983 // therefore no longer recognizable.
10984
10986}
10987
// Map the function's calling convention to the shader-type value used by
// ds_ordered_count; unsupported conventions are diagnosed and compute-like
// conventions map to 0.
// NOTE(review): the signature line (10988) and several `case` labels (e.g.
// 10990, 10992, 10994, 10996-10998, 11004-11005) are missing from this
// extraction — confirm against upstream.
10989 switch (MF.getFunction().getCallingConv()) {
10991 return 1;
10993 return 2;
10995 return 3;
10999 const Function &F = MF.getFunction();
11000 F.getContext().diagnose(DiagnosticInfoUnsupported(
11001 F, "ds_ordered_count unsupported for this calling conv"));
11002 [[fallthrough]];
11003 }
11006 case CallingConv::C:
11007 case CallingConv::Fast:
11008 default:
11009 // Assume other calling conventions are various compute callable functions
11010 return 0;
11011 }
11012}
11013
// Decompose a scalar compare into (SrcReg, SrcReg2, CmpMask, CmpValue) for
// the peephole optimizer: S_CMP_* take a register and a register-or-imm
// second operand; S_CMPK_* always take an immediate. Returns false for
// anything else or for subreg operands.
// NOTE(review): the first signature line (11014) is missing from this
// extraction — this is SIInstrInfo::analyzeCompare; confirm against
// upstream.
11015 Register &SrcReg2, int64_t &CmpMask,
11016 int64_t &CmpValue) const {
11017 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
11018 return false;
11019
11020 switch (MI.getOpcode()) {
11021 default:
11022 break;
11023 case AMDGPU::S_CMP_EQ_U32:
11024 case AMDGPU::S_CMP_EQ_I32:
11025 case AMDGPU::S_CMP_LG_U32:
11026 case AMDGPU::S_CMP_LG_I32:
11027 case AMDGPU::S_CMP_LT_U32:
11028 case AMDGPU::S_CMP_LT_I32:
11029 case AMDGPU::S_CMP_GT_U32:
11030 case AMDGPU::S_CMP_GT_I32:
11031 case AMDGPU::S_CMP_LE_U32:
11032 case AMDGPU::S_CMP_LE_I32:
11033 case AMDGPU::S_CMP_GE_U32:
11034 case AMDGPU::S_CMP_GE_I32:
11035 case AMDGPU::S_CMP_EQ_U64:
11036 case AMDGPU::S_CMP_LG_U64:
11037 SrcReg = MI.getOperand(0).getReg();
11038 if (MI.getOperand(1).isReg()) {
11039 if (MI.getOperand(1).getSubReg())
11040 return false;
11041 SrcReg2 = MI.getOperand(1).getReg();
11042 CmpValue = 0;
11043 } else if (MI.getOperand(1).isImm()) {
11044 SrcReg2 = Register();
11045 CmpValue = MI.getOperand(1).getImm();
11046 } else {
11047 return false;
11048 }
11049 CmpMask = ~0;
11050 return true;
11051 case AMDGPU::S_CMPK_EQ_U32:
11052 case AMDGPU::S_CMPK_EQ_I32:
11053 case AMDGPU::S_CMPK_LG_U32:
11054 case AMDGPU::S_CMPK_LG_I32:
11055 case AMDGPU::S_CMPK_LT_U32:
11056 case AMDGPU::S_CMPK_LT_I32:
11057 case AMDGPU::S_CMPK_GT_U32:
11058 case AMDGPU::S_CMPK_GT_I32:
11059 case AMDGPU::S_CMPK_LE_U32:
11060 case AMDGPU::S_CMPK_LE_I32:
11061 case AMDGPU::S_CMPK_GE_U32:
11062 case AMDGPU::S_CMPK_GE_I32:
11063 SrcReg = MI.getOperand(0).getReg();
11064 SrcReg2 = Register();
11065 CmpValue = MI.getOperand(1).getImm();
11066 CmpMask = ~0;
11067 return true;
11068 }
11069
11070 return false;
11071}
11072
// Return true if SCC is not live into any successor of MBB, i.e. it is dead
// at the end of the block.
// NOTE(review): the signature line (11073) is missing from this extraction —
// confirm against upstream.
11074 for (MachineBasicBlock *S : MBB->successors()) {
11075 if (S->isLiveIn(AMDGPU::SCC))
11076 return false;
11077 }
11078 return true;
11079}
11080
11081// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
11082// (incoming SCC) = !(SCC defined by SCCDef).
11083// Return true if all uses can be re-written, false otherwise.
11084bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
11085 MachineBasicBlock *MBB = SCCDef->getParent();
11086 SmallVector<MachineInstr *> InvertInstr;
11087 bool SCCIsDead = false;
11088
11089 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
11090 constexpr unsigned ScanLimit = 12;
11091 unsigned Count = 0;
11092 for (MachineInstr &MI :
11093 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
11094 if (++Count > ScanLimit)
11095 return false;
11096 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
11097 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
11098 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
11099 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11100 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
11101 InvertInstr.push_back(&MI);
11102 else
11103 return false;
11104 }
11105 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
11106 SCCIsDead = true;
11107 break;
11108 }
11109 }
11110 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11111 SCCIsDead = true;
11112
11113 // SCC may have more uses. Can't invert all of them.
11114 if (!SCCIsDead)
11115 return false;
11116
11117 // Invert uses
11118 for (MachineInstr *MI : InvertInstr) {
11119 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11120 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11121 swapOperands(*MI);
11122 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11123 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11124 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11125 ? AMDGPU::S_CBRANCH_SCC1
11126 : AMDGPU::S_CBRANCH_SCC0));
11127 } else {
11128 llvm_unreachable("SCC used but no inversion handling");
11129 }
11130 }
11131 return true;
11132}
11133
11134// SCC is already valid after SCCValid.
11135// SCCRedefine will redefine SCC to the same value already available after
11136// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11137// update kill/dead flags if necessary.
11138bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11139 bool NeedInversion) const {
11140 MachineInstr *KillsSCC = nullptr;
11141 if (SCCValid->getParent() != SCCRedefine->getParent())
11142 return false;
11143 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
11144 SCCRedefine->getIterator())) {
11145 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11146 return false;
11147 if (MI.killsRegister(AMDGPU::SCC, &RI))
11148 KillsSCC = &MI;
11149 }
11150 if (NeedInversion && !invertSCCUse(SCCRedefine))
11151 return false;
11152 if (MachineOperand *SccDef =
11153 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11154 SccDef->setIsDead(false);
11155 if (KillsSCC)
11156 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11157 SCCRedefine->eraseFromParent();
11158 return true;
11159}
11160
11161static bool foldableSelect(const MachineInstr &Def) {
11162 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11163 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11164 return false;
11165 bool Op1IsNonZeroImm =
11166 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11167 bool Op2IsZeroImm =
11168 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11169 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11170 return false;
11171 return true;
11172}
11173
11174static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11175 unsigned &NewDefOpc) {
11176 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11177 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11178 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11179 Def.getOpcode() != AMDGPU::S_ADD_U32)
11180 return false;
11181 const MachineOperand &AddSrc1 = Def.getOperand(1);
11182 const MachineOperand &AddSrc2 = Def.getOperand(2);
11183 int64_t addend;
11184
11185 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11186 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11187 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11188 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11189 return false;
11190
11191 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11192 const MachineOperand *SccDef =
11193 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11194 if (!SccDef->isDead())
11195 return false;
11196 NewDefOpc = AMDGPU::S_ADD_U32;
11197 }
11198 NeedInversion = !NeedInversion;
11199 return true;
11200}
11201
11203 Register SrcReg2, int64_t CmpMask,
11204 int64_t CmpValue,
11205 const MachineRegisterInfo *MRI) const {
11206 if (!SrcReg || SrcReg.isPhysical())
11207 return false;
11208
11209 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11210 return false;
11211
11212 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11213 this](bool NeedInversion) -> bool {
11214 if (CmpValue != 0)
11215 return false;
11216
11217 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11218 if (!Def)
11219 return false;
11220
11221 // For S_OP that set SCC = DST!=0, do the transformation
11222 //
11223 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11224 //
11225 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11226 // do the transformation:
11227 //
11228 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11229 //
11230 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11231 // for S_CSELECT* already has the same value that will be calculated by
11232 // s_cmp_lg_*
11233 //
11234 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11235 // (non-zero imm), 0)
11236
11237 unsigned NewDefOpc = Def->getOpcode();
11238 if (!setsSCCIfResultIsNonZero(*Def) &&
11239 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11240 !foldableSelect(*Def))
11241 return false;
11242
11243 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11244 return false;
11245
11246 if (NewDefOpc != Def->getOpcode())
11247 Def->setDesc(get(NewDefOpc));
11248
11249 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11250 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11251 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11252 // sX = s_cselect_b64 (non-zero imm), 0
11253 // sLo = copy sX.sub0
11254 // sHi = copy sX.sub1
11255 // sY = s_or_b32 sLo, sHi
11256 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11257 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11258 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11259 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11260 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11261 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11262 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11263 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11264 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11265 Def2->getOperand(1).isReg() &&
11266 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11267 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11268 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11269 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11270 if (Select && foldableSelect(*Select))
11271 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11272 }
11273 }
11274 }
11275 return true;
11276 };
11277
11278 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11279 this](int64_t ExpectedValue, unsigned SrcSize,
11280 bool IsReversible, bool IsSigned) -> bool {
11281 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11282 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11283 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11284 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11285 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11286 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11287 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11288 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11289 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11290 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11291 //
11292 // Signed ge/gt are not used for the sign bit.
11293 //
11294 // If result of the AND is unused except in the compare:
11295 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11296 //
11297 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11298 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11299 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11300 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11301 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11302 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11303
11304 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11305 if (!Def)
11306 return false;
11307
11308 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11309 Def->getOpcode() != AMDGPU::S_AND_B64)
11310 return false;
11311
11312 int64_t Mask;
11313 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11314 if (MO->isImm())
11315 Mask = MO->getImm();
11316 else if (!getFoldableImm(MO, Mask))
11317 return false;
11318 Mask &= maxUIntN(SrcSize);
11319 return isPowerOf2_64(Mask);
11320 };
11321
11322 MachineOperand *SrcOp = &Def->getOperand(1);
11323 if (isMask(SrcOp))
11324 SrcOp = &Def->getOperand(2);
11325 else if (isMask(&Def->getOperand(2)))
11326 SrcOp = &Def->getOperand(1);
11327 else
11328 return false;
11329
11330 // A valid Mask is required to have a single bit set, hence a non-zero and
11331 // power-of-two value. This verifies that we will not do 64-bit shift below.
11332 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11333 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11334 if (IsSigned && BitNo == SrcSize - 1)
11335 return false;
11336
11337 ExpectedValue <<= BitNo;
11338
11339 bool IsReversedCC = false;
11340 if (CmpValue != ExpectedValue) {
11341 if (!IsReversible)
11342 return false;
11343 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11344 if (!IsReversedCC)
11345 return false;
11346 }
11347
11348 Register DefReg = Def->getOperand(0).getReg();
11349 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11350 return false;
11351
11352 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11353 return false;
11354
11355 if (!MRI->use_nodbg_empty(DefReg)) {
11356 assert(!IsReversedCC);
11357 return true;
11358 }
11359
11360 // Replace AND with unused result with a S_BITCMP.
11361 MachineBasicBlock *MBB = Def->getParent();
11362
11363 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11364 : AMDGPU::S_BITCMP1_B32
11365 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11366 : AMDGPU::S_BITCMP1_B64;
11367
11368 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11369 .add(*SrcOp)
11370 .addImm(BitNo);
11371 Def->eraseFromParent();
11372
11373 return true;
11374 };
11375
11376 switch (CmpInstr.getOpcode()) {
11377 default:
11378 break;
11379 case AMDGPU::S_CMP_EQ_U32:
11380 case AMDGPU::S_CMP_EQ_I32:
11381 case AMDGPU::S_CMPK_EQ_U32:
11382 case AMDGPU::S_CMPK_EQ_I32:
11383 return optimizeCmpAnd(1, 32, true, false) ||
11384 optimizeCmpSelect(/*NeedInversion=*/true);
11385 case AMDGPU::S_CMP_GE_U32:
11386 case AMDGPU::S_CMPK_GE_U32:
11387 return optimizeCmpAnd(1, 32, false, false);
11388 case AMDGPU::S_CMP_GE_I32:
11389 case AMDGPU::S_CMPK_GE_I32:
11390 return optimizeCmpAnd(1, 32, false, true);
11391 case AMDGPU::S_CMP_EQ_U64:
11392 return optimizeCmpAnd(1, 64, true, false);
11393 case AMDGPU::S_CMP_LG_U32:
11394 case AMDGPU::S_CMP_LG_I32:
11395 case AMDGPU::S_CMPK_LG_U32:
11396 case AMDGPU::S_CMPK_LG_I32:
11397 return optimizeCmpAnd(0, 32, true, false) ||
11398 optimizeCmpSelect(/*NeedInversion=*/false);
11399 case AMDGPU::S_CMP_GT_U32:
11400 case AMDGPU::S_CMPK_GT_U32:
11401 return optimizeCmpAnd(0, 32, false, false);
11402 case AMDGPU::S_CMP_GT_I32:
11403 case AMDGPU::S_CMPK_GT_I32:
11404 return optimizeCmpAnd(0, 32, false, true);
11405 case AMDGPU::S_CMP_LG_U64:
11406 return optimizeCmpAnd(0, 64, true, false) ||
11407 optimizeCmpSelect(/*NeedInversion=*/false);
11408 }
11409
11410 return false;
11411}
11412
11414 AMDGPU::OpName OpName) const {
11415 if (!ST.needsAlignedVGPRs())
11416 return;
11417
11418 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11419 if (OpNo < 0)
11420 return;
11421 MachineOperand &Op = MI.getOperand(OpNo);
11422 if (getOpSize(MI, OpNo) > 4)
11423 return;
11424
11425 // Add implicit aligned super-reg to force alignment on the data operand.
11426 const DebugLoc &DL = MI.getDebugLoc();
11427 MachineBasicBlock *BB = MI.getParent();
11429 Register DataReg = Op.getReg();
11430 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11432 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11433 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11434 Register NewVR =
11435 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11436 : &AMDGPU::VReg_64_Align2RegClass);
11437 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11438 .addReg(DataReg, {}, Op.getSubReg())
11439 .addImm(AMDGPU::sub0)
11440 .addReg(Undef)
11441 .addImm(AMDGPU::sub1);
11442 Op.setReg(NewVR);
11443 Op.setSubReg(AMDGPU::sub0);
11444 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11445}
11446
11448 if (isIGLP(*MI))
11449 return false;
11450
11452}
11453
11455 if (!isWMMA(MI) && !isSWMMAC(MI))
11456 return false;
11457
11458 if (ST.hasGFX1250Insts())
11459 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11460
11461 return true;
11462}
11463
11465 unsigned Opcode = MI.getOpcode();
11466
11467 if (AMDGPU::isGFX12Plus(ST))
11468 return isDOT(MI) || isXDLWMMA(MI);
11469
11470 if (!isMAI(MI) || isDGEMM(Opcode) ||
11471 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11472 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11473 return false;
11474
11475 if (!ST.hasGFX940Insts())
11476 return true;
11477
11478 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11479}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={})
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static MachineBasicBlock * generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr, ArrayRef< Register > PhySGPRs={})
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:145
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormater - Interface to format MIR operand based on target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
Register isStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
void handleCopyToPhysHelper(SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst, MachineRegisterInfo &MRI, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of a s_trap 2 instructions for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void createWaterFallForSiCall(MachineInstr *MI, MachineDominatorTree *MDT, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={}) const
Wrapper function for generating waterfall for instruction MI. This function takes into consideration ...
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
ValueUniformity getGenericValueUniformity(const MachineInstr &MI) const
static bool isMAI(const MCInstrDesc &Desc)
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
static bool usesLGKM_CNT(const MachineInstr &MI)
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminate with divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
void createReadFirstLaneFromCopyToPhysReg(MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
ValueUniformity getValueUniformity(const MachineInstr &MI) const final
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:204
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:227
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:203
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:209
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:212
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:219
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:214
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:240
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:215
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:251
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:206
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:226
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:245
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:216
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:241
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:223
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:205
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:231
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:606
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:608
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:605
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:607
@ TI_CONSTDATA_START
Definition AMDGPU.h:604
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if an signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
constexpr RegState getUndefRegState(bool B)
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
constexpr bool all() const
Definition LaneBitmask.h:54
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:67
MachineInstr * top() const
Definition SIInstrInfo.h:72
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:91
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.