LLVM 23.0.0git
SIInstrInfo.cpp
Go to the documentation of this file.
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
// NOTE(review): the declaration head of this option (presumably
// `static cl::opt<unsigned>`, original line 55) is missing from this view of
// the file — confirm against the upstream source.
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
// Option controlling the 16-bit physreg copy fixup used by copyPhysReg below.
// NOTE(review): the declaration head (original line 59, presumably
// `static cl::opt<bool> Fix16BitCopies(`) and the closing line 63 are missing
// from this view of the file.
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
64
// Constructor member-initializer list and body. It forwards the call-frame
// pseudo opcodes to the generated base class, initializes the register info
// and subtarget references, and seeds the scheduling model from the subtarget.
// NOTE(review): the constructor signature line (original line 65) is missing
// from this view of the file.
66 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
// NOTE(review): the signature head of this helper (original line 85,
// presumably `static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,`)
// is missing from this view of the file.
86 AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
// Neither node has the operand: vacuously equal.
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
// Exactly one node has the operand: definitely not equal.
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
110
// Return true if \p MI is a candidate for rematerialization: certain opcode
// classes unconditionally, and SMRDs only when every memory operand is an
// invariant load (so re-executing the load yields the same value).
111static bool canRemat(const MachineInstr &MI) {
112
// NOTE(review): original lines 113-115 (the opcode-class checks guarding this
// `return true`) are missing from this view of the file.
116 return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
// Require at least one memory operand; an empty list gives no invariance
// guarantee, so it is conservatively rejected.
119 return !MI.memoperands_empty() &&
120 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
// NOTE(review): the signature head of this member function (original line 128)
// is missing from this view of the file; from the body it takes a
// MachineInstr and decides whether it may be rematerialized.
129 const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
132 // Normally VALU use of exec would block the rematerialization, but that
133 // is OK in this case to have an implicit exec read as all VALU do.
134 // We really want all of the generic logic for this except for this.
135
136 // Another potential implicit use is mode register. The core logic of
137 // the RA will not attempt rematerialization if mode is set anywhere
138 // in the function, otherwise it is safe since mode is not changed.
139
140 // There is difference to generic method which does not allow
141 // rematerialization if there are virtual register uses. We allow this,
142 // therefore this method includes SOP instructions as well.
// Accept only instructions with no implicit defs, no extra implicit operands
// beyond those declared in the instruction description, and no possibility
// of raising an FP exception.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
// NOTE(review): the fallback return statement (original line 149, presumably
// deferring to the base-class implementation) is missing from this view.
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 // If it is not convergent it does not depend on EXEC.
184 if (!MI.isConvergent())
185 return false;
186
187 switch (MI.getOpcode()) {
188 default:
189 break;
190 case AMDGPU::V_READFIRSTLANE_B32:
191 return true;
192 }
193
194 return false;
195}
196
// NOTE(review): the signature head of this member function (original line 197)
// is missing from this view of the file; from the body it takes a
// MachineOperand \p MO and reports whether the use can be ignored.
198 // Any implicit use of exec by VALU is not a real register read.
199 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
200 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
201}
202
// NOTE(review): the signature head of this member function (original line 203,
// taking the MachineInstr \p MI being sunk) is missing from this view of the
// file. It decides whether sinking MI to \p SuccToSinkTo is safe with respect
// to temporal divergence, using the cycle info \p CI.
204 MachineBasicBlock *SuccToSinkTo,
205 MachineCycleInfo *CI) const {
206 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
207 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
208 return true;
209
210 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
211 // Check if sinking of MI would create temporal divergent use.
212 for (auto Op : MI.uses()) {
// Only SGPR-class virtual register uses can become temporally divergent.
213 if (Op.isReg() && Op.getReg().isVirtual() &&
214 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
215 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
216
217 // SgprDef defined inside cycle
218 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
219 if (FromCycle == nullptr)
220 continue;
221
222 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
223 // Check if there is a FromCycle that contains SgprDef's basic block but
224 // does not contain SuccToSinkTo and also has divergent exit condition.
225 while (FromCycle && !FromCycle->contains(ToCycle)) {
// NOTE(review): original line 226 (the declaration of `ExitingBlocks`,
// presumably a SmallVector of MachineBasicBlock*) is missing from this view.
227 FromCycle->getExitingBlocks(ExitingBlocks);
228
229 // FromCycle has divergent exit condition.
230 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
231 if (hasDivergentBranch(ExitingBlock))
232 return false;
233 }
234
// Walk outward: an enclosing cycle may still separate SgprDef from the sink
// destination.
235 FromCycle = FromCycle->getParentCycle();
236 }
237 }
238 }
239
240 return true;
241}
242
// NOTE(review): the signature head of this member function (original line 243,
// presumably taking `SDNode *Load0, SDNode *Load1,`) is missing from this view
// of the file. On success it reports the two loads share a base pointer and
// writes their immediate offsets into \p Offset0 / \p Offset1.
244 int64_t &Offset0,
245 int64_t &Offset1) const {
246 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
247 return false;
248
249 unsigned Opc0 = Load0->getMachineOpcode();
250 unsigned Opc1 = Load1->getMachineOpcode();
251
252 // Make sure both are actually loads.
253 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
254 return false;
255
256 // A mayLoad instruction without a def is not a load. Likely a prefetch.
257 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
258 return false;
259
// Case 1: both are DS (LDS/GDS) accesses.
260 if (isDS(Opc0) && isDS(Opc1)) {
261
262 // FIXME: Handle this case:
263 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
264 return false;
265
266 // Check base reg.
267 if (Load0->getOperand(0) != Load1->getOperand(0))
268 return false;
269
270 // Skip read2 / write2 variants for simplicity.
271 // TODO: We should report true if the used offsets are adjacent (excluded
272 // st64 versions).
273 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
274 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
275 if (Offset0Idx == -1 || Offset1Idx == -1)
276 return false;
277
278 // XXX - be careful of dataless loads
279 // getNamedOperandIdx returns the index for MachineInstrs. Since they
280 // include the output in the operand list, but SDNodes don't, we need to
281 // subtract the index by one.
282 Offset0Idx -= get(Opc0).NumDefs;
283 Offset1Idx -= get(Opc1).NumDefs;
284 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
285 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
286 return true;
287 }
288
// Case 2: both are scalar memory (SMRD) accesses.
289 if (isSMRD(Opc0) && isSMRD(Opc1)) {
290 // Skip time and cache invalidation instructions.
291 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
292 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
293 return false;
294
295 unsigned NumOps = getNumOperandsNoGlue(Load0);
296 if (NumOps != getNumOperandsNoGlue(Load1))
297 return false;
298
299 // Check base reg.
300 if (Load0->getOperand(0) != Load1->getOperand(0))
301 return false;
302
303 // Match register offsets, if both register and immediate offsets present.
304 assert(NumOps == 4 || NumOps == 5);
305 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
306 return false;
307
// NOTE(review): the initializers of Load0Offset / Load1Offset (original lines
// 309 and 311, presumably dyn_cast<ConstantSDNode> of the offset operands)
// are missing from this view of the file.
308 const ConstantSDNode *Load0Offset =
310 const ConstantSDNode *Load1Offset =
312
313 if (!Load0Offset || !Load1Offset)
314 return false;
315
316 Offset0 = Load0Offset->getZExtValue();
317 Offset1 = Load1Offset->getZExtValue();
318 return true;
319 }
320
321 // MUBUF and MTBUF can access the same addresses.
322 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
323
324 // MUBUF and MTBUF have vaddr at different indices.
325 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
326 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
327 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
328 return false;
329
330 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
331 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
332
333 if (OffIdx0 == -1 || OffIdx1 == -1)
334 return false;
335
336 // getNamedOperandIdx returns the index for MachineInstrs. Since they
337 // include the output in the operand list, but SDNodes don't, we need to
338 // subtract the index by one.
339 OffIdx0 -= get(Opc0).NumDefs;
340 OffIdx1 -= get(Opc1).NumDefs;
341
342 SDValue Off0 = Load0->getOperand(OffIdx0);
343 SDValue Off1 = Load1->getOperand(OffIdx1);
344
345 // The offset might be a FrameIndexSDNode.
346 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
347 return false;
348
349 Offset0 = Off0->getAsZExtVal();
350 Offset1 = Off1->getAsZExtVal();
351 return true;
352 }
353
// Any other mix of memory instruction classes is not comparable.
354 return false;
355}
356
357static bool isStride64(unsigned Opc) {
358 switch (Opc) {
359 case AMDGPU::DS_READ2ST64_B32:
360 case AMDGPU::DS_READ2ST64_B64:
361 case AMDGPU::DS_WRITE2ST64_B32:
362 case AMDGPU::DS_WRITE2ST64_B64:
363 return true;
364 default:
365 return false;
366 }
367}
368
// NOTE(review): the signature head of this member function (original lines
// 369-370, presumably `bool SIInstrInfo::getMemOperandsWithOffsetWidth(const
// MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,`) is
// missing from this view of the file. It decomposes a memory instruction into
// base operands, an immediate offset, and an access width, per address class.
371 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
372 const TargetRegisterInfo *TRI) const {
373 if (!LdSt.mayLoadOrStore())
374 return false;
375
376 unsigned Opc = LdSt.getOpcode();
// Offsets reported by this target are never scalable.
377 OffsetIsScalable = false;
378 const MachineOperand *BaseOp, *OffsetOp;
379 int DataOpIdx;
380
// DS (LDS/GDS) instructions: either a single offset, or offset0/offset1 pairs.
381 if (isDS(LdSt)) {
382 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
383 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
384 if (OffsetOp) {
385 // Normal, single offset LDS instruction.
386 if (!BaseOp) {
387 // DS_CONSUME/DS_APPEND use M0 for the base address.
388 // TODO: find the implicit use operand for M0 and use that as BaseOp?
389 return false;
390 }
391 BaseOps.push_back(BaseOp);
392 Offset = OffsetOp->getImm();
393 // Get appropriate operand, and compute width accordingly.
394 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
395 if (DataOpIdx == -1)
396 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
397 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
398 Width = LocationSize::precise(64);
399 else
400 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
401 } else {
402 // The 2 offset instructions use offset0 and offset1 instead. We can treat
403 // these as a load with a single offset if the 2 offsets are consecutive.
404 // We will use this for some partially aligned loads.
405 const MachineOperand *Offset0Op =
406 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
407 const MachineOperand *Offset1Op =
408 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
409
410 unsigned Offset0 = Offset0Op->getImm() & 0xff;
411 unsigned Offset1 = Offset1Op->getImm() & 0xff;
412 if (Offset0 + 1 != Offset1)
413 return false;
414
415 // Each of these offsets is in element sized units, so we need to convert
416 // to bytes of the individual reads.
417
418 unsigned EltSize;
419 if (LdSt.mayLoad())
// NOTE(review): loads divide the two-element destination size by 16 while
// stores divide the single data operand size by 8 — presumably equivalent
// per-element byte sizes; confirm against the upstream source.
420 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
421 else {
422 assert(LdSt.mayStore());
423 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
424 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
425 }
426
// ST64 variants space their two accesses 64 elements apart.
427 if (isStride64(Opc))
428 EltSize *= 64;
429
430 BaseOps.push_back(BaseOp);
431 Offset = EltSize * Offset0;
432 // Get appropriate operand(s), and compute width accordingly.
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
434 if (DataOpIdx == -1) {
// No vdst: a write2-style store; width is the sum of both data operands.
435 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
436 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
437 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
438 Width = LocationSize::precise(
439 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
440 } else {
441 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
442 }
443 }
444 return true;
445 }
446
// Buffer (MUBUF/MTBUF) instructions: srsrc + optional vaddr + soffset.
447 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
448 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
449 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
450 return false;
451 BaseOps.push_back(RSrc);
452 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
453 if (BaseOp && !BaseOp->isFI())
454 BaseOps.push_back(BaseOp);
455 const MachineOperand *OffsetImm =
456 getNamedOperand(LdSt, AMDGPU::OpName::offset);
457 Offset = OffsetImm->getImm();
458 const MachineOperand *SOffset =
459 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
460 if (SOffset) {
// A register soffset is part of the base; an immediate soffset folds into
// the reported offset.
461 if (SOffset->isReg())
462 BaseOps.push_back(SOffset);
463 else
464 Offset += SOffset->getImm();
465 }
466 // Get appropriate operand, and compute width accordingly.
467 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
468 if (DataOpIdx == -1)
469 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
470 if (DataOpIdx == -1) // LDS DMA
471 return false;
472 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
473 return true;
474 }
475
// Image (MIMG/VIMAGE/VSAMPLE) instructions: rsrc plus address registers.
476 if (isImage(LdSt)) {
477 auto RsrcOpName =
478 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
479 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
480 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
481 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
482 if (VAddr0Idx >= 0) {
483 // GFX10 possible NSA encoding.
484 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
485 BaseOps.push_back(&LdSt.getOperand(I));
486 } else {
487 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
488 }
// Image addressing has no meaningful scalar byte offset.
489 Offset = 0;
490 // Get appropriate operand, and compute width accordingly.
491 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
492 if (DataOpIdx == -1)
493 return false; // no return sampler
494 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
495 return true;
496 }
497
// Scalar memory (SMRD) instructions: sbase + optional immediate offset.
498 if (isSMRD(LdSt)) {
499 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
500 if (!BaseOp) // e.g. S_MEMTIME
501 return false;
502 BaseOps.push_back(BaseOp);
503 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
504 Offset = OffsetOp ? OffsetOp->getImm() : 0;
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
507 if (DataOpIdx == -1)
508 return false;
509 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
510 return true;
511 }
512
// FLAT (flat/global/scratch) instructions.
513 if (isFLAT(LdSt)) {
514 // Instructions have either vaddr or saddr or both or none.
515 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
516 if (BaseOp)
517 BaseOps.push_back(BaseOp);
518 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
519 if (BaseOp)
520 BaseOps.push_back(BaseOp);
521 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
522 // Get appropriate operand, and compute width accordingly.
523 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
524 if (DataOpIdx == -1)
525 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
526 if (DataOpIdx == -1) // LDS DMA
527 return false;
528 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
529 return true;
530 }
531
// Unrecognized memory instruction class.
532 return false;
533}
534
// Return true if the two memory instructions appear to address memory relative
// to the same base pointer, judged first by their reported base operands and
// then by the IR-level underlying objects of their memory operands.
// NOTE(review): the parameter lines for BaseOps1/BaseOps2 (original lines 536
// and 538, presumably `ArrayRef<const MachineOperand *>`) are missing from
// this view of the file.
535static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
537 const MachineInstr &MI2,
539 // Only examine the first "base" operand of each instruction, on the
540 // assumption that it represents the real base address of the memory access.
541 // Other operands are typically offsets or indices from this base address.
542 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
543 return true;
544
// Fall back to comparing IR values; only possible with exactly one memory
// operand on each side.
545 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
546 return false;
547
548 auto *MO1 = *MI1.memoperands_begin();
549 auto *MO2 = *MI2.memoperands_begin();
550 if (MO1->getAddrSpace() != MO2->getAddrSpace())
551 return false;
552
553 const auto *Base1 = MO1->getValue();
554 const auto *Base2 = MO2->getValue();
555 if (!Base1 || !Base2)
556 return false;
557 Base1 = getUnderlyingObject(Base1);
558 Base2 = getUnderlyingObject(Base2);
559
// An undef base tells us nothing about aliasing; be conservative.
560 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
561 return false;
562
563 return Base1 == Base2;
564}
565
// NOTE(review): the signature head of this member function (original lines
// 566 and 568, presumably `bool SIInstrInfo::shouldClusterMemOps(` with the
// BaseOps1/BaseOps2 ArrayRef parameters) is missing from this view of the
// file. It implements the scheduler's clustering heuristic for memory ops.
567 int64_t Offset1, bool OffsetIsScalable1,
569 int64_t Offset2, bool OffsetIsScalable2,
570 unsigned ClusterSize,
571 unsigned NumBytes) const {
572 // If the mem ops (to be clustered) do not have the same base ptr, then they
573 // should not be clustered
574 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
575 if (!BaseOps1.empty() && !BaseOps2.empty()) {
576 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
577 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
578 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
579 return false;
580
// The per-function limit may override the default cluster budget.
581 const SIMachineFunctionInfo *MFI =
582 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
583 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
584 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
585 // If only one base op is empty, they do not have the same base ptr
586 return false;
587 }
588
589 // In order to avoid register pressure, on an average, the number of DWORDS
590 // loaded together by all clustered mem ops should not exceed
591 // MaxMemoryClusterDWords. This is an empirical value based on certain
592 // observations and performance related experiments.
593 // The good thing about this heuristic is - it avoids clustering of too many
594 // sub-word loads, and also avoids clustering of wide loads. Below is the
595 // brief summary of how the heuristic behaves for various `LoadSize` when
596 // MaxMemoryClusterDWords is 8.
597 //
598 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
599 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
600 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
601 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
602 // (5) LoadSize >= 17: do not cluster
603 const unsigned LoadSize = NumBytes / ClusterSize;
// Round each op's size up to whole dwords before applying the budget.
604 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
605 return NumDWords <= MaxMemoryClusterDWords;
606}
607
608// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
609// the first 16 loads will be interleaved with the stores, and the next 16 will
610// be clustered as expected. It should really split into 2 16 store batches.
611//
612// Loads are clustered until this returns false, rather than trying to schedule
613// groups of stores. This also means we have to deal with saying different
614// address space loads should be clustered, and ones which might cause bank
615// conflicts.
616//
617// This might be deprecated so it might not be worth that much effort to fix.
// NOTE(review): the signature head of this member function (original line
// 618, presumably taking the two load SDNodes) is missing from this view of
// the file. It tells the DAG scheduler whether two loads with the given
// offsets should be scheduled adjacently.
619 int64_t Offset0, int64_t Offset1,
620 unsigned NumLoads) const {
621 assert(Offset1 > Offset0 &&
622 "Second offset should be larger than first offset!");
623 // If we have less than 16 loads in a row, and the offsets are within 64
624 // bytes, then schedule together.
625
626 // A cacheline is 64 bytes (for global memory).
627 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
628}
629
// Diagnose an unsupported register copy: emit a DiagnosticInfoUnsupported
// error and lower the copy to the SI_ILLEGAL_COPY pseudo so compilation can
// continue past the error.
// NOTE(review): the function head (original lines 630-631, presumably
// `static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock
// &MBB, MachineBasicBlock::iterator MI,`) is missing from this view of the
// file.
632 const DebugLoc &DL, MCRegister DestReg,
633 MCRegister SrcReg, bool KillSrc,
634 const char *Msg = "illegal VGPR to SGPR copy") {
635 MachineFunction *MF = MBB.getParent();
636
// NOTE(review): original line 637 (the declaration of `C`, presumably the
// function's LLVMContext) is missing from this view of the file.
638 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
639
640 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
641 .addReg(SrcReg, getKillRegState(KillSrc));
642}
643
644/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
645/// possible to have a direct copy in these cases on GFX908, so an intermediate
646/// VGPR copy is required.
// NOTE(review): the function head (original lines 647-649, presumably
// `static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock
// &MBB, MachineBasicBlock::iterator MI,`) is missing from this view of the
// file.
650 const DebugLoc &DL, MCRegister DestReg,
651 MCRegister SrcReg, bool KillSrc,
652 RegScavenger &RS, bool RegsOverlap,
653 Register ImpDefSuperReg = Register(),
654 Register ImpUseSuperReg = Register()) {
655 assert((TII.getSubtarget().hasMAIInsts() &&
656 !TII.getSubtarget().hasGFX90AInsts()) &&
657 "Expected GFX908 subtarget.");
658
659 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
660 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
661 "Source register of the copy should be either an SGPR or an AGPR.");
662
663 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
664 "Destination register of the copy should be an AGPR.");
665
666 const SIRegisterInfo &RI = TII.getRegisterInfo();
667
668 // First try to find defining accvgpr_write to avoid temporary registers.
669 // In the case of copies of overlapping AGPRs, we conservatively do not
670 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
671 // an accvgpr_write used for this same copy due to implicit-defs
672 if (!RegsOverlap) {
// Walk backwards from the copy to the start of the block, looking for the
// instruction that last wrote SrcReg.
673 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
674 --Def;
675
676 if (!Def->modifiesRegister(SrcReg, &RI))
677 continue;
678
679 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
680 Def->getOperand(0).getReg() != SrcReg)
681 break;
682
683 MachineOperand &DefOp = Def->getOperand(1);
684 assert(DefOp.isReg() || DefOp.isImm());
685
686 if (DefOp.isReg()) {
687 bool SafeToPropagate = true;
688 // Check that register source operand is not clobbered before MI.
689 // Immediate operands are always safe to propagate.
690 for (auto I = Def; I != MI && SafeToPropagate; ++I)
691 if (I->modifiesRegister(DefOp.getReg(), &RI))
692 SafeToPropagate = false;
693
694 if (!SafeToPropagate)
695 break;
696
// Propagation extends the live range, so drop stale kill flags in between.
697 for (auto I = Def; I != MI; ++I)
698 I->clearRegisterKills(DefOp.getReg(), &RI);
699 }
700
// Re-issue the accvgpr_write with the propagated source directly into
// DestReg; no temporary VGPR is needed on this path.
701 MachineInstrBuilder Builder =
702 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
703 .add(DefOp);
704 if (ImpDefSuperReg)
705 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
706
707 if (ImpUseSuperReg) {
// NOTE(review): original line 709 (the RegState flags argument of this
// addReg call) is missing from this view of the file.
708 Builder.addReg(ImpUseSuperReg,
710 }
711
712 return;
713 }
714 }
715
716 RS.enterBasicBlockEnd(MBB);
717 RS.backward(std::next(MI));
718
719 // Ideally we want to have three registers for a long reg_sequence copy
720 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
721 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
722 *MBB.getParent());
723
724 // Registers in the sequence are allocated contiguously so we can just
725 // use register number to pick one of three round-robin temps.
726 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
727 Register Tmp =
728 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
729 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
730 "VGPR used for an intermediate copy should have been reserved.");
731
732 // Only loop through if there are any free registers left. We don't want to
733 // spill.
734 while (RegNo--) {
735 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
736 /* RestoreAfter */ false, 0,
737 /* AllowSpill */ false);
738 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
739 break;
740 Tmp = Tmp2;
741 RS.setRegUsed(Tmp);
742 }
743
744 // Insert copy to temporary VGPR.
745 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
746 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
747 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
748 } else {
749 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
750 }
751
752 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
753 .addReg(SrcReg, getKillRegState(KillSrc));
754 if (ImpUseSuperReg) {
// NOTE(review): original line 756 (the RegState flags argument of this
// addReg call) is missing from this view of the file.
755 UseBuilder.addReg(ImpUseSuperReg,
757 }
758
759 MachineInstrBuilder DefBuilder
760 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
761 .addReg(Tmp, RegState::Kill);
762
763 if (ImpDefSuperReg)
764 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
765}
766
// Expand a wide SGPR-to-SGPR copy into a sequence of S_MOV_B32/S_MOV_B64
// sub-register moves, pairing 64-bit-aligned subregs into single B64 moves.
// NOTE(review): the function head (original lines 766-768, presumably
// `static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
// MachineBasicBlock::iterator I, const DebugLoc &DL,`) is missing from this
// view of the file.
769 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
770 const TargetRegisterClass *RC, bool Forward) {
771 const SIRegisterInfo &RI = TII.getRegisterInfo();
772 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
// NOTE(review): original line 773 (presumably the DebugLoc/insertion-point
// setup between these two statements) is missing from this view of the file.
774 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
775
776 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
777 int16_t SubIdx = BaseIndices[Idx];
778 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
779 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
780 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
781 unsigned Opcode = AMDGPU::S_MOV_B32;
782
783 // Is SGPR aligned? If so try to combine with next.
784 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
785 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
786 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
787 // Can use SGPR64 copy
788 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
789 SubIdx = RI.getSubRegFromChannel(Channel, 2);
790 DestSubReg = RI.getSubReg(DestReg, SubIdx);
791 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
792 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
793 Opcode = AMDGPU::S_MOV_B64;
// Consumed two 32-bit parts in one 64-bit move; skip the next index.
794 Idx++;
795 }
796
// Implicit use of the full SrcReg keeps the super-register live across the
// piecewise copy.
797 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
798 .addReg(SrcSubReg)
799 .addReg(SrcReg, RegState::Implicit)
800
801 if (!FirstMI)
802 FirstMI = LastMI;
803
// When copying backward, keep inserting before the previously emitted move.
804 if (!Forward)
805 I--;
806 }
807
808 assert(FirstMI && LastMI);
809 if (!Forward)
810 std::swap(FirstMI, LastMI);
811
// Mark the whole DestReg as implicitly defined on the first emitted move so
// liveness sees a full definition of the super-register.
812 FirstMI->addOperand(
813 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
814
815 if (KillSrc)
816 LastMI->addRegisterKilled(SrcReg, &RI);
817}
818
821 const DebugLoc &DL, Register DestReg,
822 Register SrcReg, bool KillSrc, bool RenamableDest,
823 bool RenamableSrc) const {
824 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
825 unsigned Size = RI.getRegSizeInBits(*RC);
826 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
827 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
828
829 // The rest of copyPhysReg assumes Src and Dst size are the same size.
830 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
831 // we remove Fix16BitCopies and this code block?
832 if (Fix16BitCopies) {
833 if (((Size == 16) != (SrcSize == 16))) {
834 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
835 assert(ST.useRealTrue16Insts());
836 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
837 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
838 RegToFix = SubReg;
839
840 if (DestReg == SrcReg) {
841 // Identity copy. Insert empty bundle since ExpandPostRA expects an
842 // instruction here.
843 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
844 return;
845 }
846 RC = RI.getPhysRegBaseClass(DestReg);
847 Size = RI.getRegSizeInBits(*RC);
848 SrcRC = RI.getPhysRegBaseClass(SrcReg);
849 SrcSize = RI.getRegSizeInBits(*SrcRC);
850 }
851 }
852
853 if (RC == &AMDGPU::VGPR_32RegClass) {
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
855 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
856 AMDGPU::AGPR_32RegClass.contains(SrcReg));
857 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
858 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
859 BuildMI(MBB, MI, DL, get(Opc), DestReg)
860 .addReg(SrcReg, getKillRegState(KillSrc));
861 return;
862 }
863
864 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
865 RC == &AMDGPU::SReg_32RegClass) {
866 if (SrcReg == AMDGPU::SCC) {
867 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
868 .addImm(1)
869 .addImm(0);
870 return;
871 }
872
873 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
874 if (DestReg == AMDGPU::VCC_LO) {
875 // FIXME: Hack until VReg_1 removed.
876 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
877 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
878 .addImm(0)
879 .addReg(SrcReg, getKillRegState(KillSrc));
880 return;
881 }
882
883 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
884 return;
885 }
886
887 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
888 .addReg(SrcReg, getKillRegState(KillSrc));
889 return;
890 }
891
892 if (RC == &AMDGPU::SReg_64RegClass) {
893 if (SrcReg == AMDGPU::SCC) {
894 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
895 .addImm(1)
896 .addImm(0);
897 return;
898 }
899
900 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
901 if (DestReg == AMDGPU::VCC) {
902 // FIXME: Hack until VReg_1 removed.
903 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
904 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
905 .addImm(0)
906 .addReg(SrcReg, getKillRegState(KillSrc));
907 return;
908 }
909
910 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
911 return;
912 }
913
914 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
915 .addReg(SrcReg, getKillRegState(KillSrc));
916 return;
917 }
918
919 if (DestReg == AMDGPU::SCC) {
920 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
921 // but SelectionDAG emits such copies for i1 sources.
922 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
923 // This copy can only be produced by patterns
924 // with explicit SCC, which are known to be enabled
925 // only for subtargets with S_CMP_LG_U64 present.
926 assert(ST.hasScalarCompareEq64());
927 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
928 .addReg(SrcReg, getKillRegState(KillSrc))
929 .addImm(0);
930 } else {
931 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
932 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
933 .addReg(SrcReg, getKillRegState(KillSrc))
934 .addImm(0);
935 }
936
937 return;
938 }
939
940 if (RC == &AMDGPU::AGPR_32RegClass) {
941 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
942 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
943 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
944 .addReg(SrcReg, getKillRegState(KillSrc));
945 return;
946 }
947
948 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
949 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
950 .addReg(SrcReg, getKillRegState(KillSrc));
951 return;
952 }
953
954 // FIXME: Pass should maintain scavenger to avoid scan through the block on
955 // every AGPR spill.
956 RegScavenger RS;
957 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
958 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
959 return;
960 }
961
962 if (Size == 16) {
963 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
964 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
965 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
966
967 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
968 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
969 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
970 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
971 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
972 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
973 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
974 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
975
976 if (IsSGPRDst) {
977 if (!IsSGPRSrc) {
978 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
979 return;
980 }
981
982 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
983 .addReg(NewSrcReg, getKillRegState(KillSrc));
984 return;
985 }
986
987 if (IsAGPRDst || IsAGPRSrc) {
988 if (!DstLow || !SrcLow) {
989 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
990 "Cannot use hi16 subreg with an AGPR!");
991 }
992
993 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
994 return;
995 }
996
997 if (ST.useRealTrue16Insts()) {
998 if (IsSGPRSrc) {
999 assert(SrcLow);
1000 SrcReg = NewSrcReg;
1001 }
1002 // Use the smaller instruction encoding if possible.
1003 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1004 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1005 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1006 .addReg(SrcReg);
1007 } else {
1008 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1009 .addImm(0) // src0_modifiers
1010 .addReg(SrcReg)
1011 .addImm(0); // op_sel
1012 }
1013 return;
1014 }
1015
1016 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1017 if (!DstLow || !SrcLow) {
1018 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1019 "Cannot use hi16 subreg on VI!");
1020 }
1021
1022 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1023 .addReg(NewSrcReg, getKillRegState(KillSrc));
1024 return;
1025 }
1026
1027 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1028 .addImm(0) // src0_modifiers
1029 .addReg(NewSrcReg)
1030 .addImm(0) // clamp
1037 // First implicit operand is $exec.
1038 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1039 return;
1040 }
1041
1042 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1043 if (ST.hasMovB64()) {
1044 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1045 .addReg(SrcReg, getKillRegState(KillSrc));
1046 return;
1047 }
1048 if (ST.hasPkMovB32()) {
1049 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1051 .addReg(SrcReg)
1053 .addReg(SrcReg)
1054 .addImm(0) // op_sel_lo
1055 .addImm(0) // op_sel_hi
1056 .addImm(0) // neg_lo
1057 .addImm(0) // neg_hi
1058 .addImm(0) // clamp
1059 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1060 return;
1061 }
1062 }
1063
1064 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1065 if (RI.isSGPRClass(RC)) {
1066 if (!RI.isSGPRClass(SrcRC)) {
1067 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1068 return;
1069 }
1070 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1071 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1072 Forward);
1073 return;
1074 }
1075
1076 unsigned EltSize = 4;
1077 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1078 if (RI.isAGPRClass(RC)) {
1079 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1080 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1081 else if (RI.hasVGPRs(SrcRC) ||
1082 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1083 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1084 else
1085 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1086 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1087 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1088 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1089 (RI.isProperlyAlignedRC(*RC) &&
1090 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1091 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1092 if (ST.hasMovB64()) {
1093 Opcode = AMDGPU::V_MOV_B64_e32;
1094 EltSize = 8;
1095 } else if (ST.hasPkMovB32()) {
1096 Opcode = AMDGPU::V_PK_MOV_B32;
1097 EltSize = 8;
1098 }
1099 }
1100
1101 // For the cases where we need an intermediate instruction/temporary register
1102 // (destination is an AGPR), we need a scavenger.
1103 //
1104 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1105 // whole block for every handled copy.
1106 std::unique_ptr<RegScavenger> RS;
1107 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1108 RS = std::make_unique<RegScavenger>();
1109
1110 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1111
1112 // If there is an overlap, we can't kill the super-register on the last
1113 // instruction, since it will also kill the components made live by this def.
1114 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1115 const bool CanKillSuperReg = KillSrc && !Overlap;
1116
1117 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1118 unsigned SubIdx;
1119 if (Forward)
1120 SubIdx = SubIndices[Idx];
1121 else
1122 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1123 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1124 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1125 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1126
1127 bool IsFirstSubreg = Idx == 0;
1128 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1129
1130 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1131 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1132 Register ImpUseSuper = SrcReg;
1133 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1134 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1135 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1137 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1139 .addReg(SrcSubReg)
1141 .addReg(SrcSubReg)
1142 .addImm(0) // op_sel_lo
1143 .addImm(0) // op_sel_hi
1144 .addImm(0) // neg_lo
1145 .addImm(0) // neg_hi
1146 .addImm(0) // clamp
1147 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1148 if (IsFirstSubreg)
1150 } else {
1151 MachineInstrBuilder Builder =
1152 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1153 if (IsFirstSubreg)
1154 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1155
1156 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1157 }
1158 }
1159}
1160
1161int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1162 int32_t NewOpc;
1163
1164 // Try to map original to commuted opcode
1165 NewOpc = AMDGPU::getCommuteRev(Opcode);
1166 if (NewOpc != -1)
1167 // Check if the commuted (REV) opcode exists on the target.
1168 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1169
1170 // Try to map commuted to original opcode
1171 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1172 if (NewOpc != -1)
1173 // Check if the original (non-REV) opcode exists on the target.
1174 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1175
1176 return Opcode;
1177}
1178
// Preferred register class for select results: always VGPR_32.
// NOTE(review): the declarator line appears truncated in this view
// (presumably SIInstrInfo::getPreferredSelectRegClass) — confirm against
// the original file.
const TargetRegisterClass *
  return &AMDGPU::VGPR_32RegClass;
}
1183
                               const DebugLoc &DL, Register DstReg,
                               Register TrueReg,
                               Register FalseReg) const {
  // Emit a 32-bit VGPR select of TrueReg/FalseReg into DstReg controlled by
  // Cond. Cond is either a single lane-mask operand, or an (immediate
  // predicate, register) pair as produced by analyzeBranch.
  // NOTE(review): the leading declarator lines (including the
  // ArrayRef<MachineOperand> Cond parameter) and the LMC initialization
  // appear truncated in this view — confirm against the original file.
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    // Cond is already a lane mask: copy it into a wave-mask register and
    // feed it to V_CNDMASK (src2 selects src1 per active lane bit).
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
      .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .addReg(FalseReg)
      .addImm(0)
      .addReg(TrueReg)
      .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      // Materialize SCC as a full lane mask (-1 when SCC set, 0 otherwise).
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      // Same as SCC_TRUE with the cselect inputs inverted.
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      // Cond[1] is the VCC-like register; strip its implicit flag so the
      // COPY can use it as an explicit operand.
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      // Inverted predicate: swap True/False operands instead of the mask.
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(TrueReg)
        .addImm(0)
        .addReg(FalseReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      // Test EXEC via or_saveexec (sets SCC), then select a lane mask.
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      // EXECZ selects are not expected to reach here; the unreachable below
      // is intentional and fires after the (dead) expansion is emitted.
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
      BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      llvm_unreachable("Unhandled branch predicate EXECZ");
      break;
    }
    default:
      llvm_unreachable("invalid branch predicate");
    }
  } else {
    llvm_unreachable("Can only handle Cond size 1 or 2");
  }
}
1293
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  // Emit a V_CMP_EQ_I32 of SrcReg against the immediate Value, returning the
  // fresh lane-mask (bool) register holding the per-lane comparison result.
  // NOTE(review): the leading declarator lines appear truncated in this view.
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}
1306
                               const DebugLoc &DL,
                               Register SrcReg, int Value) const {
  // Emit a V_CMP_NE_I32 of SrcReg against the immediate Value, returning the
  // fresh lane-mask (bool) register holding the per-lane comparison result.
  // NOTE(review): the leading declarator lines appear truncated in this view.
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
    .addImm(Value)
    .addReg(SrcReg);

  return Reg;
}
1319
                                          const Register Reg,
                                          int64_t &ImmVal) const {
  // If MI defines Reg with a compile-time constant, store that constant in
  // ImmVal and return true. Handles plain moves plus instructions whose
  // result is a pure function of an immediate source (bit-reverse, not).
  // NOTE(review): the first declarator line appears truncated in this view.
  switch (MI.getOpcode()) {
  // Simple moves: the immediate (operand 1) is the defined value.
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOVK_I32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B16_t16_e32: {
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = Src0.getImm();
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  case AMDGPU::V_MOV_B16_t16_e64: {
    // e64 true16 form: operand 1 is src0_modifiers, operand 2 the source.
    // Only a constant when no source modifiers are applied.
    const MachineOperand &Src0 = MI.getOperand(2);
    if (Src0.isImm() && !MI.getOperand(1).getImm()) {
      ImmVal = Src0.getImm();
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  case AMDGPU::S_BREV_B32:
  case AMDGPU::V_BFREV_B32_e32:
  case AMDGPU::V_BFREV_B32_e64: {
    // Bit-reverse of an immediate: fold the reversal at compile time.
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  case AMDGPU::S_NOT_B32:
  case AMDGPU::V_NOT_B32_e32:
  case AMDGPU::V_NOT_B32_e64: {
    // Bitwise-not of an immediate: fold the complement at compile time.
    const MachineOperand &Src0 = MI.getOperand(1);
    if (Src0.isImm()) {
      ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
      return MI.getOperand(0).getReg() == Reg;
    }

    return false;
  }
  default:
    return false;
  }
}
1378
// Return the immediate behind Op: either Op itself is an immediate, or Op is
// a virtual register whose single def is a move-immediate (in which case the
// subreg-adjusted value is returned). std::nullopt when neither applies.
// NOTE(review): the declarator line appears truncated in this view.
std::optional<int64_t>
  if (Op.isImm())
    return Op.getImm();

  // Only virtual registers have a usable single-def chain here.
  if (!Op.isReg() || !Op.getReg().isVirtual())
    return std::nullopt;
  MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
  const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    const MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      // Narrow the materialized value to the subregister Op reads.
      return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
  }

  return std::nullopt;
}
1396

  // Pick a move opcode capable of materializing a value into a register of
  // class DstRC, keyed on register class kind and bit width.
  // NOTE(review): the function declarator line appears truncated in this
  // view (presumably SIInstrInfo::getMovOpcode).
  if (RI.isAGPRClass(DstRC))
    return AMDGPU::COPY;
  if (RI.getRegSizeInBits(*DstRC) == 16) {
    // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
    // before RA.
    return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
  }
  if (RI.getRegSizeInBits(*DstRC) == 32)
    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
    return AMDGPU::S_MOV_B64;
  if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
    return AMDGPU::V_MOV_B64_PSEUDO;
  // Any other width falls back to a generic COPY.
  return AMDGPU::COPY;
}
1414
// Select the V_INDIRECT_REG_{READ,WRITE}_GPR_IDX_B32 pseudo sized for a
// vector of VecSize bits, rounding up to the next supported register width.
// NOTE(review): a declarator line appears truncated in this view.
const MCInstrDesc &
                                     bool IsIndirectSrc) const {
  if (IsIndirectSrc) {
    if (VecSize <= 32) // 4 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
    if (VecSize <= 64) // 8 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
    if (VecSize <= 96) // 12 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
    if (VecSize <= 128) // 16 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
    if (VecSize <= 160) // 20 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
    if (VecSize <= 192) // 24 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
    if (VecSize <= 224) // 28 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
    if (VecSize <= 256) // 32 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
    if (VecSize <= 288) // 36 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
    if (VecSize <= 320) // 40 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
    if (VecSize <= 352) // 44 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
    if (VecSize <= 384) // 48 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
    if (VecSize <= 512) // 64 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
    if (VecSize <= 1024) // 128 bytes
      return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);

    llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
  }

  if (VecSize <= 32) // 4 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
  if (VecSize <= 64) // 8 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
  if (VecSize <= 96) // 12 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
  if (VecSize <= 128) // 16 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
  if (VecSize <= 160) // 20 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
  if (VecSize <= 192) // 24 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
  if (VecSize <= 224) // 28 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
  if (VecSize <= 256) // 32 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
  if (VecSize <= 288) // 36 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
  if (VecSize <= 320) // 40 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
  if (VecSize <= 352) // 44 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
  if (VecSize <= 384) // 48 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
  if (VecSize <= 512) // 64 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
  if (VecSize <= 1024) // 128 bytes
    return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);

  llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
}
1482
1483static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1484 if (VecSize <= 32) // 4 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1486 if (VecSize <= 64) // 8 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1488 if (VecSize <= 96) // 12 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1490 if (VecSize <= 128) // 16 bytes
1491 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1492 if (VecSize <= 160) // 20 bytes
1493 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1494 if (VecSize <= 192) // 24 bytes
1495 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1496 if (VecSize <= 224) // 28 bytes
1497 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1498 if (VecSize <= 256) // 32 bytes
1499 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1500 if (VecSize <= 288) // 36 bytes
1501 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1502 if (VecSize <= 320) // 40 bytes
1503 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1504 if (VecSize <= 352) // 44 bytes
1505 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1506 if (VecSize <= 384) // 48 bytes
1507 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1508 if (VecSize <= 512) // 64 bytes
1509 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1510 if (VecSize <= 1024) // 128 bytes
1511 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1512
1513 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1514}
1515
1516static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1517 if (VecSize <= 32) // 4 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1519 if (VecSize <= 64) // 8 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1521 if (VecSize <= 96) // 12 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1523 if (VecSize <= 128) // 16 bytes
1524 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1525 if (VecSize <= 160) // 20 bytes
1526 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1527 if (VecSize <= 192) // 24 bytes
1528 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1529 if (VecSize <= 224) // 28 bytes
1530 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1531 if (VecSize <= 256) // 32 bytes
1532 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1533 if (VecSize <= 288) // 36 bytes
1534 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1535 if (VecSize <= 320) // 40 bytes
1536 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1537 if (VecSize <= 352) // 44 bytes
1538 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1539 if (VecSize <= 384) // 48 bytes
1540 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1541 if (VecSize <= 512) // 64 bytes
1542 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1543 if (VecSize <= 1024) // 128 bytes
1544 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1545
1546 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1547}
1548
1549static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1550 if (VecSize <= 64) // 8 bytes
1551 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1552 if (VecSize <= 128) // 16 bytes
1553 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1554 if (VecSize <= 256) // 32 bytes
1555 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1556 if (VecSize <= 512) // 64 bytes
1557 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1558 if (VecSize <= 1024) // 128 bytes
1559 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1560
1561 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1562}
1563
// Select the MOVREL-based indirect register-write pseudo for the given
// vector size, element size, and register bank (SGPR vs VGPR).
// NOTE(review): the final VGPR return (following the assert) appears
// truncated in this view; verify against the original file.
const MCInstrDesc &
SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
                                             bool IsSGPR) const {
  if (IsSGPR) {
    switch (EltSize) {
    case 32:
      return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
    case 64:
      return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
    default:
      llvm_unreachable("invalid reg indexing elt size");
    }
  }

  // VGPR indexing only supports 32-bit elements.
  assert(EltSize == 32 && "invalid reg indexing elt size");
}
1581
1582static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1583 switch (Size) {
1584 case 4:
1585 return AMDGPU::SI_SPILL_S32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_S64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_S96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_S128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_S160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_S192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_S224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_S256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_S288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_S320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_S352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_S384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_S512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_S1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1614 }
1615}
1616
1617static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 2:
1620 return AMDGPU::SI_SPILL_V16_SAVE;
1621 case 4:
1622 return AMDGPU::SI_SPILL_V32_SAVE;
1623 case 8:
1624 return AMDGPU::SI_SPILL_V64_SAVE;
1625 case 12:
1626 return AMDGPU::SI_SPILL_V96_SAVE;
1627 case 16:
1628 return AMDGPU::SI_SPILL_V128_SAVE;
1629 case 20:
1630 return AMDGPU::SI_SPILL_V160_SAVE;
1631 case 24:
1632 return AMDGPU::SI_SPILL_V192_SAVE;
1633 case 28:
1634 return AMDGPU::SI_SPILL_V224_SAVE;
1635 case 32:
1636 return AMDGPU::SI_SPILL_V256_SAVE;
1637 case 36:
1638 return AMDGPU::SI_SPILL_V288_SAVE;
1639 case 40:
1640 return AMDGPU::SI_SPILL_V320_SAVE;
1641 case 44:
1642 return AMDGPU::SI_SPILL_V352_SAVE;
1643 case 48:
1644 return AMDGPU::SI_SPILL_V384_SAVE;
1645 case 64:
1646 return AMDGPU::SI_SPILL_V512_SAVE;
1647 case 128:
1648 return AMDGPU::SI_SPILL_V1024_SAVE;
1649 default:
1650 llvm_unreachable("unknown register size");
1651 }
1652}
1653
1654static unsigned getAVSpillSaveOpcode(unsigned Size) {
1655 switch (Size) {
1656 case 4:
1657 return AMDGPU::SI_SPILL_AV32_SAVE;
1658 case 8:
1659 return AMDGPU::SI_SPILL_AV64_SAVE;
1660 case 12:
1661 return AMDGPU::SI_SPILL_AV96_SAVE;
1662 case 16:
1663 return AMDGPU::SI_SPILL_AV128_SAVE;
1664 case 20:
1665 return AMDGPU::SI_SPILL_AV160_SAVE;
1666 case 24:
1667 return AMDGPU::SI_SPILL_AV192_SAVE;
1668 case 28:
1669 return AMDGPU::SI_SPILL_AV224_SAVE;
1670 case 32:
1671 return AMDGPU::SI_SPILL_AV256_SAVE;
1672 case 36:
1673 return AMDGPU::SI_SPILL_AV288_SAVE;
1674 case 40:
1675 return AMDGPU::SI_SPILL_AV320_SAVE;
1676 case 44:
1677 return AMDGPU::SI_SPILL_AV352_SAVE;
1678 case 48:
1679 return AMDGPU::SI_SPILL_AV384_SAVE;
1680 case 64:
1681 return AMDGPU::SI_SPILL_AV512_SAVE;
1682 case 128:
1683 return AMDGPU::SI_SPILL_AV1024_SAVE;
1684 default:
1685 llvm_unreachable("unknown register size");
1686 }
1687}
1688
1689static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1690 bool IsVectorSuperClass) {
1691 // Currently, there is only 32-bit WWM register spills needed.
1692 if (Size != 4)
1693 llvm_unreachable("unknown wwm register spill size");
1694
1695 if (IsVectorSuperClass)
1696 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1697
1698 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1699}
1700
    Register Reg, const TargetRegisterClass *RC, unsigned Size,
    const SIMachineFunctionInfo &MFI) const {
  // Select the spill-save pseudo for a vector register: WWM registers use
  // the WWM pseudos; subtargets with MAI instructions use the AV pseudos.
  // NOTE(review): the declarator line, the WWM-flag check guarding the first
  // return, and the final VGPR fallback return appear truncated in this
  // view; verify against the original file.
  bool IsVectorSuperClass = RI.isVectorSuperClass(RC);

  // Choose the right opcode if spilling a WWM register.
    return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);

  // TODO: Check if AGPRs are available
  if (ST.hasMAIInsts())
    return getAVSpillSaveOpcode(Size);

}
1716
    bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
    MachineInstr::MIFlag Flags) const {
  // Spill SrcReg to the stack slot FrameIndex. SGPRs use dedicated spill
  // pseudos (possibly lowered to VGPR lanes later); vector registers go
  // through the scratch-offset save pseudos.
  // NOTE(review): the declarator lines, the SIMachineFunctionInfo (MFI)
  // initialization, the MachineMemOperand (MMO) creation call, and a
  // trailing .setMIFlag(Flags) appear truncated in this view; verify
  // against the original file.
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));
  unsigned SpillSize = RI.getSpillSize(*RC);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
    assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
           SrcReg != AMDGPU::EXEC && "exec should not be spilled");

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on number sgprs, so we need
    // to make sure we are using the correct register class.
    if (SrcReg.isVirtual() && SpillSize == 4) {
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    BuildMI(MBB, MI, DL, OpDesc)
        .addReg(SrcReg, getKillRegState(isKill)) // data
        .addFrameIndex(FrameIndex)               // addr
        .addMemOperand(MMO)

    // SGPR spills lowered to VGPR lanes live in the SGPRSpill stack area.
    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    return;
  }

  unsigned Opcode =
      getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
  MFI->setHasSpilledVGPRs();

  BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
      .addImm(0)                               // offset
      .addMemOperand(MMO);
}
1772
1773static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1774 switch (Size) {
1775 case 4:
1776 return AMDGPU::SI_SPILL_S32_RESTORE;
1777 case 8:
1778 return AMDGPU::SI_SPILL_S64_RESTORE;
1779 case 12:
1780 return AMDGPU::SI_SPILL_S96_RESTORE;
1781 case 16:
1782 return AMDGPU::SI_SPILL_S128_RESTORE;
1783 case 20:
1784 return AMDGPU::SI_SPILL_S160_RESTORE;
1785 case 24:
1786 return AMDGPU::SI_SPILL_S192_RESTORE;
1787 case 28:
1788 return AMDGPU::SI_SPILL_S224_RESTORE;
1789 case 32:
1790 return AMDGPU::SI_SPILL_S256_RESTORE;
1791 case 36:
1792 return AMDGPU::SI_SPILL_S288_RESTORE;
1793 case 40:
1794 return AMDGPU::SI_SPILL_S320_RESTORE;
1795 case 44:
1796 return AMDGPU::SI_SPILL_S352_RESTORE;
1797 case 48:
1798 return AMDGPU::SI_SPILL_S384_RESTORE;
1799 case 64:
1800 return AMDGPU::SI_SPILL_S512_RESTORE;
1801 case 128:
1802 return AMDGPU::SI_SPILL_S1024_RESTORE;
1803 default:
1804 llvm_unreachable("unknown register size");
1805 }
1806}
1807
1808static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1809 switch (Size) {
1810 case 2:
1811 return AMDGPU::SI_SPILL_V16_RESTORE;
1812 case 4:
1813 return AMDGPU::SI_SPILL_V32_RESTORE;
1814 case 8:
1815 return AMDGPU::SI_SPILL_V64_RESTORE;
1816 case 12:
1817 return AMDGPU::SI_SPILL_V96_RESTORE;
1818 case 16:
1819 return AMDGPU::SI_SPILL_V128_RESTORE;
1820 case 20:
1821 return AMDGPU::SI_SPILL_V160_RESTORE;
1822 case 24:
1823 return AMDGPU::SI_SPILL_V192_RESTORE;
1824 case 28:
1825 return AMDGPU::SI_SPILL_V224_RESTORE;
1826 case 32:
1827 return AMDGPU::SI_SPILL_V256_RESTORE;
1828 case 36:
1829 return AMDGPU::SI_SPILL_V288_RESTORE;
1830 case 40:
1831 return AMDGPU::SI_SPILL_V320_RESTORE;
1832 case 44:
1833 return AMDGPU::SI_SPILL_V352_RESTORE;
1834 case 48:
1835 return AMDGPU::SI_SPILL_V384_RESTORE;
1836 case 64:
1837 return AMDGPU::SI_SPILL_V512_RESTORE;
1838 case 128:
1839 return AMDGPU::SI_SPILL_V1024_RESTORE;
1840 default:
1841 llvm_unreachable("unknown register size");
1842 }
1843}
1844
1845static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1846 switch (Size) {
1847 case 4:
1848 return AMDGPU::SI_SPILL_AV32_RESTORE;
1849 case 8:
1850 return AMDGPU::SI_SPILL_AV64_RESTORE;
1851 case 12:
1852 return AMDGPU::SI_SPILL_AV96_RESTORE;
1853 case 16:
1854 return AMDGPU::SI_SPILL_AV128_RESTORE;
1855 case 20:
1856 return AMDGPU::SI_SPILL_AV160_RESTORE;
1857 case 24:
1858 return AMDGPU::SI_SPILL_AV192_RESTORE;
1859 case 28:
1860 return AMDGPU::SI_SPILL_AV224_RESTORE;
1861 case 32:
1862 return AMDGPU::SI_SPILL_AV256_RESTORE;
1863 case 36:
1864 return AMDGPU::SI_SPILL_AV288_RESTORE;
1865 case 40:
1866 return AMDGPU::SI_SPILL_AV320_RESTORE;
1867 case 44:
1868 return AMDGPU::SI_SPILL_AV352_RESTORE;
1869 case 48:
1870 return AMDGPU::SI_SPILL_AV384_RESTORE;
1871 case 64:
1872 return AMDGPU::SI_SPILL_AV512_RESTORE;
1873 case 128:
1874 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1875 default:
1876 llvm_unreachable("unknown register size");
1877 }
1878}
1879
1880static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1881 bool IsVectorSuperClass) {
1882 // Currently, there is only 32-bit WWM register spills needed.
1883 if (Size != 4)
1884 llvm_unreachable("unknown wwm register spill size");
1885
1886 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1887 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1888
1889 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1890}
1891
    Register Reg, const TargetRegisterClass *RC, unsigned Size,
    const SIMachineFunctionInfo &MFI) const {
  // Select the spill-restore pseudo for a vector register: WWM registers
  // use the WWM pseudos; subtargets with MAI instructions use the AV
  // pseudos.
  // NOTE(review): the declarator line, the WWM-flag check guarding the first
  // return, the AV return under hasMAIInsts(), and the final VGPR fallback
  // return appear truncated in this view; verify against the original file.
  bool IsVectorSuperClass = RI.isVectorSuperClass(RC);

  // Choose the right opcode if restoring a WWM register.
    return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);

  // TODO: Check if AGPRs are available
  if (ST.hasMAIInsts())

  assert(!RI.isAGPRClass(RC));
}
1908
                                       Register DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       Register VReg, unsigned SubReg,
                                       MachineInstr::MIFlag Flags) const {
  // Reload DestReg from the stack slot FrameIndex. Mirrors
  // storeRegToStackSlot: SGPRs use dedicated restore pseudos; vector
  // registers go through the scratch-offset restore pseudos.
  // NOTE(review): the declarator lines, the SIMachineFunctionInfo (MFI)
  // initialization, the MachineMemOperand (MMO) creation call, and a
  // trailing .setMIFlag(Flags) appear truncated in this view; verify
  // against the original file.
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned SpillSize = RI.getSpillSize(*RC);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
    assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
           DestReg != AMDGPU::EXEC && "exec should not be spilled");

    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    // 32-bit SGPR restores only work on numbered SGPRs; constrain the class.
    if (DestReg.isVirtual() && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    BuildMI(MBB, MI, DL, OpDesc, DestReg)
        .addFrameIndex(FrameIndex) // addr
        .addMemOperand(MMO)

    return;
  }

  unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
                                                   SpillSize, *MFI);
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex)           // vaddr
      .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
      .addImm(0)                           // offset
      .addMemOperand(MMO);
}
1960
1965
// Emits `Quantity` wait states as a chain of S_NOP instructions. Each S_NOP
// immediate encodes (count - 1), and a single S_NOP can cover at most
// (1 << ST.getSNopBits()) wait states, so larger requests are split.
// NOTE(review): the declarator lines (1966-1967) are missing from this
// listing; presumably SIInstrInfo::insertNoops — confirm against the header.
1968                              unsigned Quantity) const {
1969  DebugLoc DL = MBB.findDebugLoc(MI);
1970  unsigned MaxSNopCount = 1u << ST.getSNopBits();
1971  while (Quantity > 0) {
1972    unsigned Arg = std::min(Quantity, MaxSNopCount);
1973    Quantity -= Arg;
1974    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1975  }
1976}
1977
// Appends a return to a terminator-less, successor-less block of an entry
// function: S_ENDPGM for void returns, SI_RETURN_TO_EPILOG otherwise.
// NOTE(review): the declarator line (1978) is missing from this listing.
1979  auto *MF = MBB.getParent();
1980  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1981
  // Only entry functions end in S_ENDPGM / RETURN_TO_EPILOG.
1982  assert(Info->isEntryFunction());
1983
1984  if (MBB.succ_empty()) {
1985    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1986    if (HasNoTerminator) {
1987      if (Info->returnsVoid()) {
1988        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1989      } else {
1990        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1991      }
1992    }
1993  }
1994}
1995
// Emits a "simulated trap" sequence: s_trap, then signals the queue via a
// sendmsg carrying the doorbell ID with the wave-abort bit set, and finally
// parks the wave in a self-looping s_sethalt block. Returns the block that
// follows MBB so the caller can resume expansion there.
// NOTE(review): the declarator lines (1996-1998) are missing from this
// listing; presumably SIInstrInfo::insertSimulatedTrap — confirm.
1999                                                   const DebugLoc &DL) const {
2000  MachineFunction *MF = MBB.getParent();
2001  constexpr unsigned DoorbellIDMask = 0x3ff;
2002  constexpr unsigned ECQueueWaveAbort = 0x400;
2003
2004  MachineBasicBlock *TrapBB = &MBB;
2005  MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2006
  // If the trap is not already at the end of its block, split and branch to a
  // fresh trap block so only trapping lanes execute the sequence.
2007  if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2008    MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2009    TrapBB = MF->CreateMachineBasicBlock();
2010    BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2011    MF->push_back(TrapBB);
2012    MBB.addSuccessor(TrapBB);
2013  }
2014  // Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
2015  // will be a nop.
2016  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2017      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2018  Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2019  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2020          DoorbellReg)
  // NOTE(review): line 2021 (the sendmsg-id immediate operand of the builder
  // chain above) is missing from this listing.
  // M0 is saved to TTMP2 so it can be restored after the sendmsg below.
2022  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2023      .addUse(AMDGPU::M0);
2024  Register DoorbellRegMasked =
2025      MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2026  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2027      .addUse(DoorbellReg)
2028      .addImm(DoorbellIDMask);
2029  Register SetWaveAbortBit =
2030      MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2031  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2032      .addUse(DoorbellRegMasked)
2033      .addImm(ECQueueWaveAbort);
2034  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2035      .addUse(SetWaveAbortBit);
2036  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
  // NOTE(review): line 2037 (the sendmsg immediate operand) is missing from
  // this listing.
2038  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2039      .addUse(AMDGPU::TTMP2)
2040  BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2041  TrapBB->addSuccessor(HaltLoopBB);
2042
  // Park the wave: s_sethalt 5, looping back to itself forever.
2043  BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2044  BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2045      .addMBB(HaltLoopBB);
2046  MF->push_back(HaltLoopBB);
2047  HaltLoopBB->addSuccessor(HaltLoopBB);
2048
2049  return MBB.getNextNode();
2050}
2051
// Returns the number of wait states an instruction provides: 0 for meta
// instructions, imm+1 for S_NOP, and 1 for everything else.
// NOTE(review): the declarator line (2052) is missing from this listing.
2053  switch (MI.getOpcode()) {
2054  default:
2055    if (MI.isMetaInstruction())
2056      return 0;
2057    return 1; // FIXME: Do wait states equal cycles?
2058
2059  case AMDGPU::S_NOP:
    // S_NOP's immediate encodes (wait states - 1); see insertNoops.
2060    return MI.getOperand(0).getImm() + 1;
2061  // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2062  // hazard, even if one exists, won't really be visible. Should we handle it?
2063  }
2064}
2065
// Expands target pseudo instructions after register allocation. Returns true
// if the pseudo was handled (possibly by erasing/rewriting MI in place);
// unknown opcodes are delegated to TargetInstrInfo::expandPostRAPseudo.
// NOTE(review): the declarator line (2066) is missing from this listing;
// presumably `bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {`.
2067  MachineBasicBlock &MBB = *MI.getParent();
2068  DebugLoc DL = MBB.findDebugLoc(MI);
  // NOTE(review): line 2069 (presumably the lane-mask-constants `LMC` local
  // used by the WWM/WQM cases below) is missing from this listing.
2070  switch (MI.getOpcode()) {
2071  default: return TargetInstrInfo::expandPostRAPseudo(MI);
2072  case AMDGPU::S_MOV_B64_term:
2073    // This is only a terminator to get the correct spill code placement during
2074    // register allocation.
2075    MI.setDesc(get(AMDGPU::S_MOV_B64));
2076    break;
2077
2078  case AMDGPU::S_MOV_B32_term:
2079    // This is only a terminator to get the correct spill code placement during
2080    // register allocation.
2081    MI.setDesc(get(AMDGPU::S_MOV_B32));
2082    break;
2083
2084  case AMDGPU::S_XOR_B64_term:
2085    // This is only a terminator to get the correct spill code placement during
2086    // register allocation.
2087    MI.setDesc(get(AMDGPU::S_XOR_B64));
2088    break;
2089
2090  case AMDGPU::S_XOR_B32_term:
2091    // This is only a terminator to get the correct spill code placement during
2092    // register allocation.
2093    MI.setDesc(get(AMDGPU::S_XOR_B32));
2094    break;
2095  case AMDGPU::S_OR_B64_term:
2096    // This is only a terminator to get the correct spill code placement during
2097    // register allocation.
2098    MI.setDesc(get(AMDGPU::S_OR_B64));
2099    break;
2100  case AMDGPU::S_OR_B32_term:
2101    // This is only a terminator to get the correct spill code placement during
2102    // register allocation.
2103    MI.setDesc(get(AMDGPU::S_OR_B32));
2104    break;
2105
2106  case AMDGPU::S_ANDN2_B64_term:
2107    // This is only a terminator to get the correct spill code placement during
2108    // register allocation.
2109    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2110    break;
2111
2112  case AMDGPU::S_ANDN2_B32_term:
2113    // This is only a terminator to get the correct spill code placement during
2114    // register allocation.
2115    MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2116    break;
2117
2118  case AMDGPU::S_AND_B64_term:
2119    // This is only a terminator to get the correct spill code placement during
2120    // register allocation.
2121    MI.setDesc(get(AMDGPU::S_AND_B64));
2122    break;
2123
2124  case AMDGPU::S_AND_B32_term:
2125    // This is only a terminator to get the correct spill code placement during
2126    // register allocation.
2127    MI.setDesc(get(AMDGPU::S_AND_B32));
2128    break;
2129
2130  case AMDGPU::S_AND_SAVEEXEC_B64_term:
2131    // This is only a terminator to get the correct spill code placement during
2132    // register allocation.
2133    MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2134    break;
2135
2136  case AMDGPU::S_AND_SAVEEXEC_B32_term:
2137    // This is only a terminator to get the correct spill code placement during
2138    // register allocation.
2139    MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2140    break;
2141
2142  case AMDGPU::SI_SPILL_S32_TO_VGPR:
2143    MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2144    break;
2145
2146  case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2147    MI.setDesc(get(AMDGPU::V_READLANE_B32));
2148    break;
2149  case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
    // Pick the AGPR or VGPR 32-bit move depending on the allocated dest class.
2150    Register Dst = MI.getOperand(0).getReg();
2151    bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2152    MI.setDesc(
2153        get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2154    break;
2155  }
2156  case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2157    Register Dst = MI.getOperand(0).getReg();
2158    if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
      // AGPR dest: split the 64-bit immediate into two 32-bit accvgpr writes.
2159      int64_t Imm = MI.getOperand(1).getImm();
2160
2161      Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2162      Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2163      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
      // NOTE(review): lines 2164-2165 (the low-half immediate operand and a
      // trailing operand of this builder chain) are missing from this listing.
2166      BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2167          .addImm(SignExtend64<32>(Imm >> 32))
      // NOTE(review): line 2168 (a trailing operand of this builder chain) is
      // missing from this listing.
2169      MI.eraseFromParent();
2170      break;
2171    }
2172
    // VGPR dest: handle like V_MOV_B64_PSEUDO below.
2173    [[fallthrough]];
2174  }
2175  case AMDGPU::V_MOV_B64_PSEUDO: {
2176    Register Dst = MI.getOperand(0).getReg();
2177    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2178    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2179
2180    const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2181    const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2182
2183    const MachineOperand &SrcOp = MI.getOperand(1);
2184    // FIXME: Will this work for 64-bit floating point immediates?
2185    assert(!SrcOp.isFPImm());
    // Prefer a single v_mov_b64 when the subtarget and operand allow it.
2186    if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
2187      MI.setDesc(Mov64Desc);
2188      if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2189          isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2190        break;
2191    }
2192    if (SrcOp.isImm()) {
2193      APInt Imm(64, SrcOp.getImm());
2194      APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2195      APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2196      const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2197      const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2198
      // Equal inline-constant halves can be materialized with one v_pk_mov_b32.
2199      if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2200          PkMovRC->contains(Dst)) {
2201        BuildMI(MBB, MI, DL, PkMovDesc, Dst)
        // NOTE(review): line 2202 (presumably the src0_modifiers operand) is
        // missing from this listing.
2203            .addImm(Lo.getSExtValue())
        // NOTE(review): line 2204 (presumably the src1_modifiers operand) is
        // missing from this listing.
2205            .addImm(Lo.getSExtValue())
2206            .addImm(0) // op_sel_lo
2207            .addImm(0) // op_sel_hi
2208            .addImm(0) // neg_lo
2209            .addImm(0) // neg_hi
2210            .addImm(0); // clamp
2211      } else {
        // Otherwise write the two halves with separate v_mov_b32s.
2212        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2213            .addImm(Lo.getSExtValue())
        // NOTE(review): line 2214 (a trailing operand of this builder chain)
        // is missing from this listing.
2215        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2216            .addImm(Hi.getSExtValue())
        // NOTE(review): line 2217 (a trailing operand of this builder chain)
        // is missing from this listing.
2218      }
2219    } else {
2220      assert(SrcOp.isReg());
2221      if (ST.hasPkMovB32() &&
2222          !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2223        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2224            .addImm(SISrcMods::OP_SEL_1) // src0_mod
2225            .addReg(SrcOp.getReg())
        // NOTE(review): line 2226 (presumably the src1_modifiers operand) is
        // missing from this listing.
2227            .addReg(SrcOp.getReg())
2228            .addImm(0) // op_sel_lo
2229            .addImm(0) // op_sel_hi
2230            .addImm(0) // neg_lo
2231            .addImm(0) // neg_hi
2232            .addImm(0); // clamp
2233      } else {
2234        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2235            .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        // NOTE(review): line 2236 (a trailing operand of this builder chain)
        // is missing from this listing.
2237        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2238            .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        // NOTE(review): line 2239 (a trailing operand of this builder chain)
        // is missing from this listing.
2240      }
2241    }
2242    MI.eraseFromParent();
2243    break;
2244  }
2245  case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
    // NOTE(review): line 2246 (presumably the call to expandMovDPP64(MI)) is
    // missing from this listing.
2247    break;
2248  }
2249  case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2250    const MachineOperand &SrcOp = MI.getOperand(1);
2251    assert(!SrcOp.isFPImm());
2252
2253    if (ST.has64BitLiterals()) {
2254      MI.setDesc(get(AMDGPU::S_MOV_B64));
2255      break;
2256    }
2257
2258    APInt Imm(64, SrcOp.getImm());
    // Values encodable as a 32-bit literal or inline constant fit one s_mov_b64.
2259    if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2260      MI.setDesc(get(AMDGPU::S_MOV_B64));
2261      break;
2262    }
2263
2264    Register Dst = MI.getOperand(0).getReg();
2265    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2266    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2267
2268    APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2269    APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2270    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2271        .addImm(Lo.getSExtValue())
    // NOTE(review): line 2272 (a trailing operand of this builder chain) is
    // missing from this listing.
2273    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2274        .addImm(Hi.getSExtValue())
    // NOTE(review): line 2275 (a trailing operand of this builder chain) is
    // missing from this listing.
2276    MI.eraseFromParent();
2277    break;
2278  }
2279  case AMDGPU::V_SET_INACTIVE_B32: {
2280    // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2281    Register DstReg = MI.getOperand(0).getReg();
2282    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2283        .add(MI.getOperand(3))
2284        .add(MI.getOperand(4))
2285        .add(MI.getOperand(1))
2286        .add(MI.getOperand(2))
2287        .add(MI.getOperand(5));
2288    MI.eraseFromParent();
2289    break;
2290  }
2291  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2292  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2293  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2294  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2295  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2296  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2297  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2298  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2299  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2300  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2301  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2302  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2303  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2304  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2305  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2306  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2307  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2308  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2309  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2310  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2311  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2312  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2313  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2314  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2315  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2316  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2317  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2318  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2319  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2320  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2321  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2322  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2323  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
    // Indirect writes using MOVREL: pick V/S movreld by element class/size.
2324    const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2325
2326    unsigned Opc;
2327    if (RI.hasVGPRs(EltRC)) {
2328      Opc = AMDGPU::V_MOVRELD_B32_e32;
2329    } else {
2330      Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2331                                              : AMDGPU::S_MOVRELD_B32;
2332    }
2333
2334    const MCInstrDesc &OpDesc = get(Opc);
2335    Register VecReg = MI.getOperand(0).getReg();
2336    bool IsUndef = MI.getOperand(1).isUndef();
2337    unsigned SubReg = MI.getOperand(3).getImm();
2338    assert(VecReg == MI.getOperand(1).getReg());
2339
    // NOTE(review): line 2340 (presumably `MachineInstrBuilder MIB =`) is
    // missing from this listing.
2341        BuildMI(MBB, MI, DL, OpDesc)
2342            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2343            .add(MI.getOperand(2))
    // NOTE(review): line 2344 (presumably the implicit-def of VecReg) is
    // missing from this listing.
2345            .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2346
2347    const int ImpDefIdx =
2348        OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2349    const int ImpUseIdx = ImpDefIdx + 1;
    // Tie the implicit vector def to its implicit use so liveness is correct.
2350    MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2351    MI.eraseFromParent();
2352    break;
2353  }
2354  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2355  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2356  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2357  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2358  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2359  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2360  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2361  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2362  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2363  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2364  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2365  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2366  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2367  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
    // Indirect writes using VGPR index mode: bundle SET_GPR_IDX_ON, the
    // indirect write, and SET_GPR_IDX_OFF so they are not scheduled apart.
2368    assert(ST.useVGPRIndexMode());
2369    Register VecReg = MI.getOperand(0).getReg();
2370    bool IsUndef = MI.getOperand(1).isUndef();
2371    MachineOperand &Idx = MI.getOperand(3);
2372    Register SubReg = MI.getOperand(4).getImm();
2373
2374    MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2375                              .add(Idx)
    // NOTE(review): line 2376 (presumably the gpr-idx mode immediate) is
    // missing from this listing.
2377    SetOn->getOperand(3).setIsUndef();
2378
2379    const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
    // NOTE(review): line 2380 (presumably `MachineInstrBuilder MIB =`) is
    // missing from this listing.
2381        BuildMI(MBB, MI, DL, OpDesc)
2382            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2383            .add(MI.getOperand(2))
    // NOTE(review): line 2384 (presumably the implicit-def of VecReg) is
    // missing from this listing.
2385            .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2386
2387    const int ImpDefIdx =
2388        OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2389    const int ImpUseIdx = ImpDefIdx + 1;
2390    MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2391
2392    MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2393
2394    finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2395
2396    MI.eraseFromParent();
2397    break;
2398  }
2399  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2400  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2401  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2402  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2403  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2404  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2405  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2406  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2407  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2408  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2409  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2410  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2411  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2412  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
    // Indirect reads using VGPR index mode, bundled like the write case above.
2413    assert(ST.useVGPRIndexMode());
2414    Register Dst = MI.getOperand(0).getReg();
2415    Register VecReg = MI.getOperand(1).getReg();
2416    bool IsUndef = MI.getOperand(1).isUndef();
2417    Register SubReg = MI.getOperand(3).getImm();
2418
2419    MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2420                              .add(MI.getOperand(2))
    // NOTE(review): line 2421 (presumably the gpr-idx mode immediate) is
    // missing from this listing.
2422    SetOn->getOperand(3).setIsUndef();
2423
2424    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2425        .addDef(Dst)
2426        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2427        .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2428
2429    MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2430
2431    finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2432
2433    MI.eraseFromParent();
2434    break;
2435  }
2436  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2437    MachineFunction &MF = *MBB.getParent();
2438    Register Reg = MI.getOperand(0).getReg();
2439    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2440    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2441    MachineOperand OpLo = MI.getOperand(1);
2442    MachineOperand OpHi = MI.getOperand(2);
2443
2444    // Create a bundle so these instructions won't be re-ordered by the
2445    // post-RA scheduler.
2446    MIBundleBuilder Bundler(MBB, MI);
2447    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2448
2449    // What we want here is an offset from the value returned by s_getpc (which
2450    // is the address of the s_add_u32 instruction) to the global variable, but
2451    // since the encoding of $symbol starts 4 bytes after the start of the
2452    // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2453    // small. This requires us to add 4 to the global variable offset in order
2454    // to compute the correct address. Similarly for the s_addc_u32 instruction,
2455    // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2456    // instruction.
2457
2458    int64_t Adjust = 0;
2459    if (ST.hasGetPCZeroExtension()) {
2460      // Fix up hardware that does not sign-extend the 48-bit PC value by
2461      // inserting: s_sext_i32_i16 reghi, reghi
2462      Bundler.append(
2463          BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2464      Adjust += 4;
2465    }
2466
2467    if (OpLo.isGlobal())
2468      OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2469    Bundler.append(
2470        BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2471
2472    if (OpHi.isGlobal())
2473      OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2474    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2475                       .addReg(RegHi)
2476                       .add(OpHi));
2477
2478    finalizeBundle(MBB, Bundler.begin());
2479
2480    MI.eraseFromParent();
2481    break;
2482  }
2483  case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2484    MachineFunction &MF = *MBB.getParent();
2485    Register Reg = MI.getOperand(0).getReg();
2486    MachineOperand Op = MI.getOperand(1);
2487
2488    // Create a bundle so these instructions won't be re-ordered by the
2489    // post-RA scheduler.
2490    MIBundleBuilder Bundler(MBB, MI);
2491    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
    // The $symbol encoding starts 4 bytes into the s_add_u64; compensate.
2492    if (Op.isGlobal())
2493      Op.setOffset(Op.getOffset() + 4);
2494    Bundler.append(
2495        BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2496
2497    finalizeBundle(MBB, Bundler.begin());
2498
2499    MI.eraseFromParent();
2500    break;
2501  }
2502  case AMDGPU::ENTER_STRICT_WWM: {
2503    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2504    // Whole Wave Mode is entered.
2505    MI.setDesc(get(LMC.OrSaveExecOpc));
2506    break;
2507  }
2508  case AMDGPU::ENTER_STRICT_WQM: {
2509    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2510    // STRICT_WQM is entered.
2511    BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2512        .addReg(LMC.ExecReg);
2513    BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2514
2515    MI.eraseFromParent();
2516    break;
2517  }
2518  case AMDGPU::EXIT_STRICT_WWM:
2519  case AMDGPU::EXIT_STRICT_WQM: {
2520    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2521    // WWM/STRICT_WQM is exited.
2522    MI.setDesc(get(LMC.MovOpc));
2523    break;
2524  }
2525  case AMDGPU::SI_RETURN: {
2526    const MachineFunction *MF = MBB.getParent();
2527    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2528    const SIRegisterInfo *TRI = ST.getRegisterInfo();
2529    // Hiding the return address use with SI_RETURN may lead to extra kills in
2530    // the function and missing live-ins. We are fine in practice because callee
2531    // saved register handling ensures the register value is restored before
2532    // RET, but we need the undef flag here to appease the MachineVerifier
2533    // liveness checks.
    // NOTE(review): line 2534 (presumably `MachineInstrBuilder MIB =`) is
    // missing from this listing.
2535        BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2536            .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2537
2538    MIB.copyImplicitOps(MI);
2539    MI.eraseFromParent();
2540    break;
2541  }
2542
2543  case AMDGPU::S_MUL_U64_U32_PSEUDO:
2544  case AMDGPU::S_MUL_I64_I32_PSEUDO:
2545    MI.setDesc(get(AMDGPU::S_MUL_U64));
2546    break;
2547
2548  case AMDGPU::S_GETPC_B64_pseudo:
2549    MI.setDesc(get(AMDGPU::S_GETPC_B64));
2550    if (ST.hasGetPCZeroExtension()) {
2551      Register Dst = MI.getOperand(0).getReg();
2552      Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2553      // Fix up hardware that does not sign-extend the 48-bit PC value by
2554      // inserting: s_sext_i32_i16 dsthi, dsthi
2555      BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2556              DstHi)
2557          .addReg(DstHi);
2558    }
2559    break;
2560
2561  case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
    // Lower to packed bf16 max, forcing op_sel high halves off via OP_SEL_1
    // on both source modifier operands.
2562    assert(ST.hasBF16PackedInsts());
2563    MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2564    MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2565    MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2566    MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2567    auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2568    Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2569    auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2570    Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2571    break;
2572  }
2573
2574  case AMDGPU::GET_STACK_BASE:
2575    // The stack starts at offset 0 unless we need to reserve some space at the
2576    // bottom.
2577    if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2578      // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2579      // some of the VGPRs. The size of the required scratch space has already
2580      // been computed by prolog epilog insertion.
2581      const SIMachineFunctionInfo *MFI =
2582          MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2583      unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2584      Register DestReg = MI.getOperand(0).getReg();
2585      BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
      // NOTE(review): lines 2586-2587 (the hwreg immediate operand of this
      // builder chain) are missing from this listing.
2588      // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2589      // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2590      // SCC, so we need to check for 0 manually.
2591      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2592      // Change the implicit-def of SCC to an explicit use (but first remove
2593      // the dead flag if present).
2594      MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2595      MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2596      MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2597      MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2598    } else {
2599      MI.setDesc(get(AMDGPU::S_MOV_B32));
2600      MI.addOperand(MachineOperand::CreateImm(0));
2601      MI.removeOperand(
2602          MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2603    }
2604    break;
2605  }
2606
2607  return true;
2608}
2609
// Rematerializes Orig at the insertion point I. For wide scalar loads
// (S_LOAD_DWORDX8/X16) whose only use at I reads a 256/128-bit subrange,
// remats a narrower S_LOAD with an adjusted offset instead of the full load;
// otherwise defers to TargetInstrInfo::reMaterialize.
// NOTE(review): the declarator lines (2610-2611) are missing from this
// listing; presumably SIInstrInfo::reMaterialize — confirm against the header.
2612                                 unsigned SubIdx,
2613                                 const MachineInstr &Orig) const {
2614
2615  // Try shrinking the instruction to remat only the part needed for current
2616  // context.
2617  // TODO: Handle more cases.
2618  unsigned Opcode = Orig.getOpcode();
2619  switch (Opcode) {
2620  case AMDGPU::S_LOAD_DWORDX16_IMM:
2621  case AMDGPU::S_LOAD_DWORDX8_IMM: {
2622    if (SubIdx != 0)
2623      break;
2624
2625    if (I == MBB.end())
2626      break;
2627
2628    if (I->isBundled())
2629      break;
2630
2631    // Look for a single use of the register that is also a subreg.
2632    Register RegToFind = Orig.getOperand(0).getReg();
2633    MachineOperand *UseMO = nullptr;
2634    for (auto &CandMO : I->operands()) {
2635      if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2636        continue;
2637      if (UseMO) {
        // More than one use — bail out of the shrinking optimization.
2638        UseMO = nullptr;
2639        break;
2640      }
2641      UseMO = &CandMO;
2642    }
2643    if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2644      break;
2645
2646    unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2647    unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2648
2649    MachineFunction *MF = MBB.getParent();
2650    MachineRegisterInfo &MRI = MF->getRegInfo();
2651    assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2652
2653    unsigned NewOpcode = -1;
2654    if (SubregSize == 256)
2655      NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2656    else if (SubregSize == 128)
2657      NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2658    else
2659      break;
2660
    // Constrain DestReg to the narrower load's result class and rewrite the
    // use to read the full (narrower) register.
2661    const MCInstrDesc &TID = get(NewOpcode);
2662    const TargetRegisterClass *NewRC =
2663        RI.getAllocatableClass(getRegClass(TID, 0));
2664    MRI.setRegClass(DestReg, NewRC);
2665
2666    UseMO->setReg(DestReg);
2667    UseMO->setSubReg(AMDGPU::NoSubRegister);
2668
2669    // Use a smaller load with the desired size, possibly with updated offset.
2670    MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2671    MI->setDesc(TID);
2672    MI->getOperand(0).setReg(DestReg);
2673    MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2674    if (Offset) {
2675      MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
      // Subreg offset is in bits; the load offset operand is in bytes.
2676      int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2677      OffsetMO->setImm(FinalOffset);
2678    }
    // NOTE(review): line 2679 (the declaration of the NewMMOs container used
    // below) is missing from this listing.
2680    for (const MachineMemOperand *MemOp : Orig.memoperands())
2681      NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2682                                                 SubregSize / 8));
2683    MI->setMemRefs(*MF, NewMMOs);
2684
2685    MBB.insert(I, MI);
2686    return;
2687  }
2688
2689  default:
2690    break;
2691  }
2692
2693  TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
2694}
2695
// Expands V_MOV_B64_DPP_PSEUDO: uses a single V_MOV_B64_dpp when the
// subtarget supports 64-bit DPP with this dpp_ctrl, otherwise splits the move
// into two V_MOV_B32_dpp halves (sub0/sub1) and returns both halves.
// NOTE(review): the name line of this member (line 2697, presumably
// SIInstrInfo::expandMovDPP64) is missing from this listing.
2696std::pair<MachineInstr*, MachineInstr*>
2698  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2699
2700  if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
    // NOTE(review): line 2701 (the start of this dpp_ctrl validity check,
    // presumably AMDGPU::isLegalDPALU_DPPControl) is missing from this listing.
2702          ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2703    MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2704    return std::pair(&MI, nullptr);
2705  }
2706
2707  MachineBasicBlock &MBB = *MI.getParent();
2708  DebugLoc DL = MBB.findDebugLoc(MI);
2709  MachineFunction *MF = MBB.getParent();
2710  MachineRegisterInfo &MRI = MF->getRegInfo();
2711  Register Dst = MI.getOperand(0).getReg();
2712  unsigned Part = 0;
2713  MachineInstr *Split[2];
2714
2715  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2716    auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2717    if (Dst.isPhysical()) {
2718      MovDPP.addDef(RI.getSubReg(Dst, Sub));
2719    } else {
2720      assert(MRI.isSSA());
2721      auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2722      MovDPP.addDef(Tmp);
2723    }
2724
2725    for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2726      const MachineOperand &SrcOp = MI.getOperand(I);
2727      assert(!SrcOp.isFPImm());
2728      if (SrcOp.isImm()) {
        // Take the 32-bit half of the immediate corresponding to this part.
2729        APInt Imm(64, SrcOp.getImm());
2730        Imm.ashrInPlace(Part * 32);
2731        MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2732      } else {
2733        assert(SrcOp.isReg());
2734        Register Src = SrcOp.getReg();
2735        if (Src.isPhysical())
2736          MovDPP.addReg(RI.getSubReg(Src, Sub));
2737        else
2738          MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2739      }
2740    }
2741
    // Copy the remaining DPP control immediates unchanged to both halves.
2742    for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2743      MovDPP.addImm(MO.getImm());
2744
2745    Split[Part] = MovDPP;
2746    ++Part;
2747  }
2748
2749  if (Dst.isVirtual())
2750    BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2751      .addReg(Split[0]->getOperand(0).getReg())
2752      .addImm(AMDGPU::sub0)
2753      .addReg(Split[1]->getOperand(0).getReg())
2754      .addImm(AMDGPU::sub1);
2755
2756  MI.eraseFromParent();
2757  return std::pair(Split[0], Split[1]);
2758}
2759
// Reports WWM_COPY as a copy-like instruction (dest = operand 0,
// source = operand 1); all other opcodes are not treated as copies here.
// NOTE(review): the name line of this member (line 2761, presumably
// SIInstrInfo::isCopyInstrImpl) is missing from this listing.
2760std::optional<DestSourcePair>
2762  if (MI.getOpcode() == AMDGPU::WWM_COPY)
2763    return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2764
2765  return std::nullopt;
2766}
2767
// Swaps the src0/src1 modifier immediates of MI as part of commuting its
// source operands. Returns false (doing nothing) if the instruction carries
// no src0 modifiers operand.
// NOTE(review): the declarator line (2768) is missing from this listing;
// presumably `bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,`.
2769                                       AMDGPU::OpName Src0OpName,
2770                                       MachineOperand &Src1,
2771                                       AMDGPU::OpName Src1OpName) const {
2772  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2773  if (!Src0Mods)
2774    return false;
2775
2776  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2777  assert(Src1Mods &&
2778         "All commutable instructions have both src0 and src1 modifiers");
2779
2780  int Src0ModsVal = Src0Mods->getImm();
2781  int Src1ModsVal = Src1Mods->getImm();
2782
2783  Src1Mods->setImm(Src0ModsVal);
2784  Src0Mods->setImm(Src1ModsVal);
2785  return true;
2786}
2787
// Exchanges a register operand with a non-register operand (imm, frame index,
// or global address) in place, preserving the register flags and subreg on
// the moved register. Returns &MI on success, nullptr if the non-register
// operand kind is unsupported.
// NOTE(review): the declarator line (2788) is missing from this listing.
2789                                                  MachineOperand &RegOp,
2790                                                  MachineOperand &NonRegOp) {
2791  Register Reg = RegOp.getReg();
2792  unsigned SubReg = RegOp.getSubReg();
2793  bool IsKill = RegOp.isKill();
2794  bool IsDead = RegOp.isDead();
2795  bool IsUndef = RegOp.isUndef();
2796  bool IsDebug = RegOp.isDebug();
2797
2798  if (NonRegOp.isImm())
2799    RegOp.ChangeToImmediate(NonRegOp.getImm());
2800  else if (NonRegOp.isFI())
2801    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2802  else if (NonRegOp.isGlobal()) {
2803    RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2804                     NonRegOp.getTargetFlags());
2805  } else
2806    return nullptr;
2807
2808  // Make sure we don't reinterpret a subreg index in the target flags.
2809  RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2810
2811  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2812  NonRegOp.setSubReg(SubReg);
2813
2814  return &MI;
2815}
2816
// Swaps the immediate values and target flags of two immediate operands of
// MI (used when commuting two non-register sources). Always returns &MI.
// NOTE(review): the declarator line (2817) is missing from this listing.
2818                                          MachineOperand &NonRegOp1,
2819                                          MachineOperand &NonRegOp2) {
2820  unsigned TargetFlags = NonRegOp1.getTargetFlags();
2821  int64_t NonRegVal = NonRegOp1.getImm();
2822
2823  NonRegOp1.setImm(NonRegOp2.getImm());
2824  NonRegOp2.setImm(NonRegVal);
2825  NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2826  NonRegOp2.setTargetFlags(TargetFlags);
2827  return &MI;
2828}
2829
bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
                                unsigned OpIdx1) const {
  // Check whether the operands at OpIdx0/OpIdx1 can be exchanged without
  // creating an illegal instruction: literal placement for VALU ops,
  // register-class legality, and immediate-operand legality are considered.
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
  const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];

  unsigned Opc = MI.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);

  const MachineOperand &MO0 = MI.getOperand(OpIdx0);
  const MachineOperand &MO1 = MI.getOperand(OpIdx1);

  // Swap doesn't breach constant bus or literal limits
  // It may move literal to position other than src0, this is not allowed
  // pre-gfx10 However, most test cases need literals in Src0 for VOP
  // FIXME: After gfx9, literal can be in place other than Src0
  if (isVALU(MI)) {
    // Reject swapping a non-inline literal out of the src0 slot (checked for
    // whichever of the two indices currently is src0).
    if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
        !isInlineConstant(MO0, OpInfo1))
      return false;
    if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
        !isInlineConstant(MO1, OpInfo0))
      return false;
  }

  // A register moving into a slot with no register class is only legal when
  // that slot is fully unconstrained (OPERAND_UNKNOWN).
  if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
    if (OpInfo1.RegClass == -1)
      return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
    return isLegalRegOperand(MI, OpIdx1, MO0) &&
           (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
  }
  if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
    if (OpInfo0.RegClass == -1)
      return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
    return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
           isLegalRegOperand(MI, OpIdx0, MO1);
  }

  // No need to check 64-bit literals since swapping does not bring new
  // 64-bit literals into current instruction to fold to 32-bit

  // Remaining case: a non-register value moving into slot OpIdx1.
  return isImmOperandLegal(MI, OpIdx1, MO0);
}
2873
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  // Commute src0/src1 of MI in place (the NewMI form is unsupported).
  // Handles reg<->reg, reg<->imm/FI/GA and imm<->imm operand pairs, then
  // swaps the matching source modifiers and installs the commuted opcode.
  // Returns nullptr when the instruction cannot be commuted legally.
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  // Normalize so Src0Idx is the lower (src0) index.
  if (Src0Idx > Src1Idx)
    std::swap(Src0Idx, Src1Idx);

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
             static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
             static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
    return nullptr;

  MachineInstr *CommutedMI = nullptr;
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);
  if (Src0.isReg() && Src1.isReg()) {
    // Be sure to copy the source modifiers to the right place.
    CommutedMI =
        TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
  } else if (Src0.isReg() && !Src1.isReg()) {
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else if (Src0.isImm() && Src1.isImm()) {
    CommutedMI = swapImmOperands(MI, Src0, Src1);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    // Modifier operands must follow the values they modify.
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
                        AMDGPU::OpName::src1_sel);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}
2926
// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  // Defer to the MCInstrDesc-based overload.
  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}
2935
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  // Report src0/src1 as the commutable operand pair, provided the opcode is
  // commutable and both named operands exist.
  if (!Desc.isCommutable())
    return false;

  unsigned Opc = Desc.getOpcode();
  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  // Reconcile the found indices with any index the caller fixed in advance.
  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}
2953
                                         int64_t BrOffset) const {
  // Return true if a branch of opcode BranchOp can encode the byte offset
  // BrOffset in its signed immediate field.
  // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
  // because its dest block is unanalyzable.
  assert(isSOPP(BranchOp) || isSOPK(BranchOp));

  // Convert to dwords.
  BrOffset /= 4;

  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
  // from the next instruction.
  BrOffset -= 1;

  // Fits if representable in BranchOffsetBits bits (cl::opt, default 16).
  return isIntN(BranchOffsetBits, BrOffset);
}
2969
  // All branches handled here encode the destination block as operand 0.
  return MI.getOperand(0).getMBB();
}
2974
  // Scan the block's terminators for structured control-flow pseudos
  // (SI_IF / SI_ELSE / SI_LOOP), which mark divergent control flow.
  for (const MachineInstr &MI : MBB->terminators()) {
    if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
        MI.getOpcode() == AMDGPU::SI_LOOP)
      return true;
  }
  return false;
}
2983
                                       MachineBasicBlock &DestBB,
                                       MachineBasicBlock &RestoreBB,
                                       const DebugLoc &DL, int64_t BrOffset,
                                       RegScavenger *RS) const {
  // Expand an out-of-range branch into PC-relative arithmetic plus an
  // indirect branch.  MBB is a freshly inserted, empty block; RestoreBB is
  // filled only if an emergency SGPR spill is required.
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto I = MBB.end();
  auto &MCCtx = MF->getContext();

  // Subtargets with s_add_pc_i64 reach the destination with a single
  // instruction and a symbol-difference offset; no register is needed.
  if (ST.useAddPC64Inst()) {
    MCSymbol *Offset =
        MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
    auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
    MCSymbol *PostAddPCLabel =
        MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
    AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
    // Offset := DestBB - (PC after the add), resolved at emission time.
    auto *OffsetExpr = MCBinaryExpr::createSub(
        MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
        MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
    Offset->setVariableValue(OffsetExpr);
    return;
  }

  assert(RS && "RegScavenger required for long branching");

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // Note: as this is used after hazard recognizer we need to apply some hazard
  // workarounds directly.
  const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
                               ST.hasVALUReadSGPRHazard();
  auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
    if (FlushSGPRWrites)
      BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
  };

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
  ApplyHazardWorkarounds();

  MCSymbol *PostGetPCLabel =
      MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
  GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);

  // 64-bit add of the (lo, hi) halves of the branch offset; the symbol
  // values are bound at the end of this function once the final destination
  // (DestBB or RestoreBB) is known.
  MCSymbol *OffsetLo =
      MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
  MCSymbol *OffsetHi =
      MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, {}, AMDGPU::sub0)
      .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, {}, AMDGPU::sub1)
      .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
  ApplyHazardWorkarounds();

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
      .addReg(PCReg);

  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  // bar;
  // s_branch dest_bb
  //
  // restore_bb:
  //  restore s[8:9]
  //  fallthrough dest_bb
  ///
  // dest_bb:
  //   buzz;

  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
  Register Scav;

  // If we've previously reserved a register for long branches
  // avoid running the scavenger and just use those registers
  if (LongBranchReservedReg) {
    RS->enterBasicBlock(MBB);
    Scav = LongBranchReservedReg;
  } else {
    RS->enterBasicBlockEnd(MBB);
    Scav = RS->scavengeRegisterBackwards(
        AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
        /* RestoreAfter */ false, 0, /* AllowSpill */ false);
  }
  if (Scav) {
    // A free SGPR pair was found: use it for the PC and drop the virtual reg.
    RS->setRegUsed(Scav);
    MRI.replaceRegWith(PCReg, Scav);
    MRI.clearVirtRegs();
  } else {
    // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
    // SGPR spill.
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
    MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
    MRI.clearVirtRegs();
  }

  // When spilling, the branch targets RestoreBB (which reloads the pair and
  // falls through to DestBB) instead of DestBB directly.
  MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
  // Now, the distance could be defined.
      MCSymbolRefExpr::create(DestLabel, MCCtx),
      MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
  // Add offset assignments.
  auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
  OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
  auto *ShAmt = MCConstantExpr::create(32, MCCtx);
  OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
}
3128
3129unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3130 switch (Cond) {
3131 case SIInstrInfo::SCC_TRUE:
3132 return AMDGPU::S_CBRANCH_SCC1;
3133 case SIInstrInfo::SCC_FALSE:
3134 return AMDGPU::S_CBRANCH_SCC0;
3135 case SIInstrInfo::VCCNZ:
3136 return AMDGPU::S_CBRANCH_VCCNZ;
3137 case SIInstrInfo::VCCZ:
3138 return AMDGPU::S_CBRANCH_VCCZ;
3139 case SIInstrInfo::EXECNZ:
3140 return AMDGPU::S_CBRANCH_EXECNZ;
3141 case SIInstrInfo::EXECZ:
3142 return AMDGPU::S_CBRANCH_EXECZ;
3143 default:
3144 llvm_unreachable("invalid branch predicate");
3145 }
3146}
3147
3148SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3149 switch (Opcode) {
3150 case AMDGPU::S_CBRANCH_SCC0:
3151 return SCC_FALSE;
3152 case AMDGPU::S_CBRANCH_SCC1:
3153 return SCC_TRUE;
3154 case AMDGPU::S_CBRANCH_VCCNZ:
3155 return VCCNZ;
3156 case AMDGPU::S_CBRANCH_VCCZ:
3157 return VCCZ;
3158 case AMDGPU::S_CBRANCH_EXECNZ:
3159 return EXECNZ;
3160 case AMDGPU::S_CBRANCH_EXECZ:
3161 return EXECZ;
3162 default:
3163 return INVALID_BR;
3164 }
3165}
3166
                                    MachineBasicBlock *&FBB,
                                    bool AllowModify) const {
  // Decode the branch sequence starting at terminator I.  Returns false on
  // success with TBB/FBB/Cond filled in, true if the sequence cannot be
  // analyzed.
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
  if (Pred == INVALID_BR)
    return true;

  // Cond is encoded as (predicate imm, condition register operand).
  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
  Cond.push_back(MachineOperand::CreateImm(Pred));
  Cond.push_back(I->getOperand(1)); // Save the branch register.

  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  // Conditional branch followed by an unconditional branch.
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  return true;
}
3203
                                MachineBasicBlock *&FBB,
                                bool AllowModify) const {
  // Skip past exec-manipulation pseudo terminators, then hand the actual
  // branch sequence to analyzeBranchImpl.
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  auto E = MBB.end();
  if (I == E)
    return false;

  // Skip over the instructions that are artificially terminators for special
  // exec management.
  while (I != E && !I->isBranch() && !I->isReturn()) {
    switch (I->getOpcode()) {
    case AMDGPU::S_MOV_B64_term:
    case AMDGPU::S_XOR_B64_term:
    case AMDGPU::S_OR_B64_term:
    case AMDGPU::S_ANDN2_B64_term:
    case AMDGPU::S_AND_B64_term:
    case AMDGPU::S_AND_SAVEEXEC_B64_term:
    case AMDGPU::S_MOV_B32_term:
    case AMDGPU::S_XOR_B32_term:
    case AMDGPU::S_OR_B32_term:
    case AMDGPU::S_ANDN2_B32_term:
    case AMDGPU::S_AND_B32_term:
    case AMDGPU::S_AND_SAVEEXEC_B32_term:
      break;
    case AMDGPU::SI_IF:
    case AMDGPU::SI_ELSE:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      // FIXME: It's messy that these need to be considered here at all.
      return true;
    default:
      llvm_unreachable("unexpected non-branch terminator inst");
    }

    ++I;
  }

  // Only artificial terminators: nothing to analyze.
  if (I == E)
    return false;

  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
}
3248
                                   int *BytesRemoved) const {
  // Erase all branch/return terminators from MBB; returns how many were
  // removed and (optionally, via BytesRemoved) their total encoded size.
  unsigned Count = 0;
  unsigned RemovedSize = 0;
  for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
    // Skip over artificial terminators when removing instructions.
    if (MI.isBranch() || MI.isReturn()) {
      RemovedSize += getInstSizeInBytes(MI);
      MI.eraseFromParent();
      ++Count;
    }
  }

  if (BytesRemoved)
    *BytesRemoved = RemovedSize;

  return Count;
}
3267
// Copy the flags onto the implicit condition register operand.
                                 const MachineOperand &OrigCond) {
  // Propagate undef/kill state from the analyzed condition operand.
  CondReg.setIsUndef(OrigCond.isUndef());
  CondReg.setIsKill(OrigCond.isKill());
}
3274
                                   MachineBasicBlock *FBB,
                                   const DebugLoc &DL,
                                   int *BytesAdded) const {
  // Emit an unconditional, conditional, or conditional + unconditional
  // branch sequence; returns the number of instructions inserted and
  // (optionally) their byte size.
  if (!FBB && Cond.empty()) {
    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
        .addMBB(TBB);
    if (BytesAdded)
      // Reported size doubles when ST.hasOffset3fBug() applies.
      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
    return 1;
  }

  assert(TBB && Cond[0].isImm());

  unsigned Opcode
    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));

  if (!FBB) {
    // Conditional branch with fall-through false edge.
    MachineInstr *CondBr =
      BuildMI(&MBB, DL, get(Opcode))
        .addMBB(TBB);

    // Copy the flags onto the implicit condition register operand.
    preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
    fixImplicitOperands(*CondBr);

    if (BytesAdded)
      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
    return 1;
  }

  assert(TBB && FBB);

  // Conditional branch to TBB plus an unconditional branch to FBB.
  MachineInstr *CondBr =
    BuildMI(&MBB, DL, get(Opcode))
      .addMBB(TBB);
  fixImplicitOperands(*CondBr);
  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
      .addMBB(FBB);

  // Propagate undef/kill state to the implicit condition register operand.
  MachineOperand &CondReg = CondBr->getOperand(1);
  CondReg.setIsUndef(Cond[1].isUndef());
  CondReg.setIsKill(Cond[1].isKill());

  if (BytesAdded)
    *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;

  return 2;
}
3326
  // Cond is (predicate imm, condition register).  Negating the predicate
  // enum value flips the branch sense.  Returns false on success, true if
  // the condition cannot be reversed.
  if (Cond.size() != 2) {
    return true;
  }

  if (Cond[0].isImm()) {
    Cond[0].setImm(-Cond[0].getImm());
    return false;
  }

  return true;
}
3340
                                    Register DstReg, Register TrueReg,
                                    Register FalseReg, int &CondCycles,
                                    int &TrueCycles, int &FalseCycles) const {
  // Decide whether insertSelect can materialize a select for this condition
  // and register class, estimating the cycle cost of each path.
  switch (Cond[0].getImm()) {
  case VCCNZ:
  case VCCZ: {
    // VCC condition: one v_cndmask_b32 per 32 bits of result (VGPRs only).
    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    if (MRI.getRegClass(FalseReg) != RC)
      return false;

    int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???

    // Limit to equal cost for branch vs. N v_cndmask_b32s.
    return RI.hasVGPRs(RC) && NumInsts <= 6;
  }
  case SCC_TRUE:
  case SCC_FALSE: {
    // FIXME: We could insert for VGPRs if we could replace the original compare
    // with a vector one.
    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
    if (MRI.getRegClass(FalseReg) != RC)
      return false;

    int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;

    // Multiples of 8 can do s_cselect_b64
    if (NumInsts % 2 == 0)
      NumInsts /= 2;

    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
    return RI.isSGPRClass(RC);
  }
  default:
    return false;
  }
}
3382
                               Register TrueReg, Register FalseReg) const {
  // Materialize Dst = Cond ? TrueReg : FalseReg, using s_cselect for SGPR
  // destinations and v_cndmask_b32 otherwise, splitting wide registers into
  // 32/64-bit pieces assembled with REG_SEQUENCE.
  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
  if (Pred == VCCZ || Pred == SCC_FALSE) {
    // Canonicalize to the "true" sense by negating the predicate and
    // swapping the operands.
    Pred = static_cast<BranchPredicate>(-Pred);
    std::swap(TrueReg, FalseReg);
  }

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
  unsigned DstSize = RI.getRegSizeInBits(*DstRC);

  if (DstSize == 32) {
    if (Pred == SCC_TRUE) {
      Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
                   .addReg(TrueReg)
                   .addReg(FalseReg);
    } else {
      // Instruction's operands are backwards from what is expected.
      Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
                   .addReg(FalseReg)
                   .addReg(TrueReg);
    }

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  if (DstSize == 64 && Pred == SCC_TRUE) {
        BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
            .addReg(TrueReg)
            .addReg(FalseReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  static const int16_t Sub0_15[] = {
      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
  const int16_t *SubIndices = Sub0_15;
  int NElts = DstSize / 32;

  // 64-bit select is only available for SALU.
  // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
  if (Pred == SCC_TRUE) {
    if (NElts % 2) {
      SelOp = AMDGPU::S_CSELECT_B32;
      EltRC = &AMDGPU::SGPR_32RegClass;
    } else {
      SelOp = AMDGPU::S_CSELECT_B64;
      EltRC = &AMDGPU::SGPR_64RegClass;
      SubIndices = Sub0_15_64;
      NElts /= 2;
    }
  }

      MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);

  I = MIB->getIterator();

  // Select each element, then stitch the results into DstReg.
  for (int Idx = 0; Idx != NElts; ++Idx) {
    Register DstElt = MRI.createVirtualRegister(EltRC);
    Regs.push_back(DstElt);

    unsigned SubIdx = SubIndices[Idx];

    if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
      // v_cndmask_b32 takes its operands in (false, true) order.
      Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
                   .addReg(FalseReg, {}, SubIdx)
                   .addReg(TrueReg, {}, SubIdx);
    } else {
      Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
                   .addReg(TrueReg, {}, SubIdx)
                   .addReg(FalseReg, {}, SubIdx);
    }

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);

    MIB.addReg(DstElt)
        .addImm(SubIdx);
  }
}
3487
  // Whitelist of opcodes treated as simple moves/copies whose single source
  // value may be propagated by the immediate-folding peepholes.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B16_t16_e32:
  case AMDGPU::V_MOV_B16_t16_e64:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
  case AMDGPU::COPY:
  case AMDGPU::WWM_COPY:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
  case AMDGPU::V_ACCVGPR_READ_B32_e64:
  case AMDGPU::V_ACCVGPR_MOV_B32:
  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
    return true;
  default:
    return false;
  }
}
3512
  // Counterpart of isFoldableCopy: returns 2 for the 16-bit t16 moves and 1
  // for every other foldable copy.  NOTE(review): presumably the operand
  // index of the source value (the t16 forms carry an extra operand before
  // it) — confirm against the callers.
  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B16_t16_e32:
  case AMDGPU::V_MOV_B16_t16_e64:
    return 2;
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
  case AMDGPU::COPY:
  case AMDGPU::WWM_COPY:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
  case AMDGPU::V_ACCVGPR_READ_B32_e64:
  case AMDGPU::V_ACCVGPR_MOV_B32:
  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
    return 1;
  default:
    llvm_unreachable("MI is not a foldable copy");
  }
}
3538
// Named source/output modifier operands stripped by removeModOperands()
// when an instruction is simplified to a form without modifiers.
static constexpr AMDGPU::OpName ModifierOpNames[] = {
    AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
    AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
    AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3543
  // Remove every modifier operand listed in ModifierOpNames from MI.
  // Iterate the names in reverse so removing one operand does not shift the
  // indices of those still to be removed (assumes the names appear in
  // increasing operand-index order).
  unsigned Opc = MI.getOpcode();
  for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
    int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
    if (Idx >= 0)
      MI.removeOperand(Idx);
  }
}
3552
3554 const MCInstrDesc &NewDesc) const {
3555 MI.setDesc(NewDesc);
3556
3557 // Remove any leftover implicit operands from mutating the instruction. e.g.
3558 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3559 // anymore.
3560 const MCInstrDesc &Desc = MI.getDesc();
3561 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3562 Desc.implicit_defs().size();
3563
3564 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3565 MI.removeOperand(I);
3566}
3567
3568std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3569 unsigned SubRegIndex) {
3570 switch (SubRegIndex) {
3571 case AMDGPU::NoSubRegister:
3572 return Imm;
3573 case AMDGPU::sub0:
3574 return SignExtend64<32>(Imm);
3575 case AMDGPU::sub1:
3576 return SignExtend64<32>(Imm >> 32);
3577 case AMDGPU::lo16:
3578 return SignExtend64<16>(Imm);
3579 case AMDGPU::hi16:
3580 return SignExtend64<16>(Imm >> 16);
3581 case AMDGPU::sub1_lo16:
3582 return SignExtend64<16>(Imm >> 32);
3583 case AMDGPU::sub1_hi16:
3584 return SignExtend64<16>(Imm >> 48);
3585 default:
3586 return std::nullopt;
3587 }
3588
3589 llvm_unreachable("covered subregister switch");
3590}
3591
3592static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3593 switch (Opc) {
3594 case AMDGPU::V_MAC_F16_e32:
3595 case AMDGPU::V_MAC_F16_e64:
3596 case AMDGPU::V_MAD_F16_e64:
3597 return AMDGPU::V_MADAK_F16;
3598 case AMDGPU::V_MAC_F32_e32:
3599 case AMDGPU::V_MAC_F32_e64:
3600 case AMDGPU::V_MAD_F32_e64:
3601 return AMDGPU::V_MADAK_F32;
3602 case AMDGPU::V_FMAC_F32_e32:
3603 case AMDGPU::V_FMAC_F32_e64:
3604 case AMDGPU::V_FMA_F32_e64:
3605 return AMDGPU::V_FMAAK_F32;
3606 case AMDGPU::V_FMAC_F16_e32:
3607 case AMDGPU::V_FMAC_F16_e64:
3608 case AMDGPU::V_FMAC_F16_t16_e64:
3609 case AMDGPU::V_FMAC_F16_fake16_e64:
3610 case AMDGPU::V_FMAC_F16_t16_e32:
3611 case AMDGPU::V_FMAC_F16_fake16_e32:
3612 case AMDGPU::V_FMA_F16_e64:
3613 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3614 ? AMDGPU::V_FMAAK_F16_t16
3615 : AMDGPU::V_FMAAK_F16_fake16
3616 : AMDGPU::V_FMAAK_F16;
3617 case AMDGPU::V_FMAC_F64_e32:
3618 case AMDGPU::V_FMAC_F64_e64:
3619 case AMDGPU::V_FMA_F64_e64:
3620 return AMDGPU::V_FMAAK_F64;
3621 default:
3622 llvm_unreachable("invalid instruction");
3623 }
3624}
3625
3626static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3627 switch (Opc) {
3628 case AMDGPU::V_MAC_F16_e32:
3629 case AMDGPU::V_MAC_F16_e64:
3630 case AMDGPU::V_MAD_F16_e64:
3631 return AMDGPU::V_MADMK_F16;
3632 case AMDGPU::V_MAC_F32_e32:
3633 case AMDGPU::V_MAC_F32_e64:
3634 case AMDGPU::V_MAD_F32_e64:
3635 return AMDGPU::V_MADMK_F32;
3636 case AMDGPU::V_FMAC_F32_e32:
3637 case AMDGPU::V_FMAC_F32_e64:
3638 case AMDGPU::V_FMA_F32_e64:
3639 return AMDGPU::V_FMAMK_F32;
3640 case AMDGPU::V_FMAC_F16_e32:
3641 case AMDGPU::V_FMAC_F16_e64:
3642 case AMDGPU::V_FMAC_F16_t16_e64:
3643 case AMDGPU::V_FMAC_F16_fake16_e64:
3644 case AMDGPU::V_FMAC_F16_t16_e32:
3645 case AMDGPU::V_FMAC_F16_fake16_e32:
3646 case AMDGPU::V_FMA_F16_e64:
3647 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3648 ? AMDGPU::V_FMAMK_F16_t16
3649 : AMDGPU::V_FMAMK_F16_fake16
3650 : AMDGPU::V_FMAMK_F16;
3651 case AMDGPU::V_FMAC_F64_e32:
3652 case AMDGPU::V_FMAC_F64_e64:
3653 case AMDGPU::V_FMA_F64_e64:
3654 return AMDGPU::V_FMAMK_F64;
3655 default:
3656 llvm_unreachable("invalid instruction");
3657 }
3658}
3659
3661 Register Reg, MachineRegisterInfo *MRI) const {
3662 int64_t Imm;
3663 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3664 return false;
3665
3666 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3667
3668 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3669
3670 unsigned Opc = UseMI.getOpcode();
3671 if (Opc == AMDGPU::COPY) {
3672 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3673
3674 Register DstReg = UseMI.getOperand(0).getReg();
3675 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3676
3677 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3678
3679 if (HasMultipleUses) {
3680 // TODO: This should fold in more cases with multiple use, but we need to
3681 // more carefully consider what those uses are.
3682 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3683
3684 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3685 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3686 return false;
3687
3688 // Most of the time folding a 32-bit inline constant is free (though this
3689 // might not be true if we can't later fold it into a real user).
3690 //
3691 // FIXME: This isInlineConstant check is imprecise if
3692 // getConstValDefinedInReg handled the tricky non-mov cases.
3693 if (ImmDefSize == 32 &&
3695 return false;
3696 }
3697
3698 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3699 RI.getSubRegIdxSize(UseSubReg) == 16;
3700
3701 if (Is16Bit) {
3702 if (RI.hasVGPRs(DstRC))
3703 return false; // Do not clobber vgpr_hi16
3704
3705 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3706 return false;
3707 }
3708
3709 MachineFunction *MF = UseMI.getMF();
3710
3711 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3712 MCRegister MovDstPhysReg =
3713 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3714
3715 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3716
3717 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3718 for (unsigned MovOp :
3719 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3720 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3721 const MCInstrDesc &MovDesc = get(MovOp);
3722
3723 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3724 if (Is16Bit) {
3725 // We just need to find a correctly sized register class, so the
3726 // subregister index compatibility doesn't matter since we're statically
3727 // extracting the immediate value.
3728 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3729 if (!MovDstRC)
3730 continue;
3731
3732 if (MovDstPhysReg) {
3733 // FIXME: We probably should not do this. If there is a live value in
3734 // the high half of the register, it will be corrupted.
3735 MovDstPhysReg =
3736 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3737 if (!MovDstPhysReg)
3738 continue;
3739 }
3740 }
3741
3742 // Result class isn't the right size, try the next instruction.
3743 if (MovDstPhysReg) {
3744 if (!MovDstRC->contains(MovDstPhysReg))
3745 return false;
3746 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3747 // TODO: This will be overly conservative in the case of 16-bit virtual
3748 // SGPRs. We could hack up the virtual register uses to use a compatible
3749 // 32-bit class.
3750 continue;
3751 }
3752
3753 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3754
3755 // Ensure the interpreted immediate value is a valid operand in the new
3756 // mov.
3757 //
3758 // FIXME: isImmOperandLegal should have form that doesn't require existing
3759 // MachineInstr or MachineOperand
3760 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3761 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3762 break;
3763
3764 NewOpc = MovOp;
3765 break;
3766 }
3767
3768 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3769 return false;
3770
3771 if (Is16Bit) {
3772 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3773 if (MovDstPhysReg)
3774 UseMI.getOperand(0).setReg(MovDstPhysReg);
3775 assert(UseMI.getOperand(1).getReg().isVirtual());
3776 }
3777
3778 const MCInstrDesc &NewMCID = get(NewOpc);
3779 UseMI.setDesc(NewMCID);
3780 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3781 UseMI.addImplicitDefUseOperands(*MF);
3782 return true;
3783 }
3784
3785 if (HasMultipleUses)
3786 return false;
3787
3788 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3789 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3790 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3791 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3792 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3793 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3794 Opc == AMDGPU::V_FMAC_F64_e64) {
3795 // Don't fold if we are using source or output modifiers. The new VOP2
3796 // instructions don't have them.
3798 return false;
3799
3800 // If this is a free constant, there's no reason to do this.
3801 // TODO: We could fold this here instead of letting SIFoldOperands do it
3802 // later.
3803 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3804
3805 // Any src operand can be used for the legality check.
3806 if (isInlineConstant(UseMI, Src0Idx, Imm))
3807 return false;
3808
3809 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3810
3811 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3812 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3813
3814 auto CopyRegOperandToNarrowerRC =
3815 [MRI, this](MachineInstr &MI, unsigned OpNo,
3816 const TargetRegisterClass *NewRC) -> void {
3817 if (!MI.getOperand(OpNo).isReg())
3818 return;
3819 Register Reg = MI.getOperand(OpNo).getReg();
3820 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3821 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3822 return;
3823 Register Tmp = MRI->createVirtualRegister(NewRC);
3824 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3825 get(AMDGPU::COPY), Tmp)
3826 .addReg(Reg);
3827 MI.getOperand(OpNo).setReg(Tmp);
3828 MI.getOperand(OpNo).setIsKill();
3829 };
3830
3831 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3832 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3833 (Src1->isReg() && Src1->getReg() == Reg)) {
3834 MachineOperand *RegSrc =
3835 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3836 if (!RegSrc->isReg())
3837 return false;
3838 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3839 ST.getConstantBusLimit(Opc) < 2)
3840 return false;
3841
3842 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3843 return false;
3844
3845 // If src2 is also a literal constant then we have to choose which one to
3846 // fold. In general it is better to choose madak so that the other literal
3847 // can be materialized in an sgpr instead of a vgpr:
3848 // s_mov_b32 s0, literal
3849 // v_madak_f32 v0, s0, v0, literal
3850 // Instead of:
3851 // v_mov_b32 v1, literal
3852 // v_madmk_f32 v0, v0, literal, v1
3853 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3854 if (Def && Def->isMoveImmediate() &&
3855 !isInlineConstant(Def->getOperand(1)))
3856 return false;
3857
3858 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3859 if (pseudoToMCOpcode(NewOpc) == -1)
3860 return false;
3861
3862 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3863 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3864
3865 // FIXME: This would be a lot easier if we could return a new instruction
3866 // instead of having to modify in place.
3867
3868 Register SrcReg = RegSrc->getReg();
3869 unsigned SrcSubReg = RegSrc->getSubReg();
3870 Src0->setReg(SrcReg);
3871 Src0->setSubReg(SrcSubReg);
3872 Src0->setIsKill(RegSrc->isKill());
3873
3874 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3875 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3876 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3877 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3878 UseMI.untieRegOperand(
3879 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3880
3881 Src1->ChangeToImmediate(*SubRegImm);
3882
3884 UseMI.setDesc(get(NewOpc));
3885
3886 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3887 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3888 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3889 Register Tmp = MRI->createVirtualRegister(NewRC);
3890 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3891 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3892 UseMI.getOperand(0).getReg())
3893 .addReg(Tmp, RegState::Kill);
3894 UseMI.getOperand(0).setReg(Tmp);
3895 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3896 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3897 }
3898
3899 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3900 if (DeleteDef)
3901 DefMI.eraseFromParent();
3902
3903 return true;
3904 }
3905
3906 // Added part is the constant: Use v_madak_{f16, f32}.
3907 if (Src2->isReg() && Src2->getReg() == Reg) {
3908 if (ST.getConstantBusLimit(Opc) < 2) {
3909 // Not allowed to use constant bus for another operand.
3910 // We can however allow an inline immediate as src0.
3911 bool Src0Inlined = false;
3912 if (Src0->isReg()) {
3913 // Try to inline constant if possible.
3914 // If the Def moves immediate and the use is single
3915 // We are saving VGPR here.
3916 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3917 if (Def && Def->isMoveImmediate() &&
3918 isInlineConstant(Def->getOperand(1)) &&
3919 MRI->hasOneNonDBGUse(Src0->getReg())) {
3920 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3921 Src0Inlined = true;
3922 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3923 RI.isSGPRReg(*MRI, Src0->getReg())) {
3924 return false;
3925 }
3926 // VGPR is okay as Src0 - fallthrough
3927 }
3928
3929 if (Src1->isReg() && !Src0Inlined) {
3930 // We have one slot for inlinable constant so far - try to fill it
3931 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3932 if (Def && Def->isMoveImmediate() &&
3933 isInlineConstant(Def->getOperand(1)) &&
3934 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3935 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3936 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3937 return false;
3938 // VGPR is okay as Src1 - fallthrough
3939 }
3940 }
3941
3942 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3943 if (pseudoToMCOpcode(NewOpc) == -1)
3944 return false;
3945
3946 // FIXME: This would be a lot easier if we could return a new instruction
3947 // instead of having to modify in place.
3948
3949 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3950 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3951 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3952 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3953 UseMI.untieRegOperand(
3954 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3955
3956 const std::optional<int64_t> SubRegImm =
3957 extractSubregFromImm(Imm, Src2->getSubReg());
3958
3959 // ChangingToImmediate adds Src2 back to the instruction.
3960 Src2->ChangeToImmediate(*SubRegImm);
3961
3962 // These come before src2.
3964 UseMI.setDesc(get(NewOpc));
3965
3966 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3967 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3968 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3969 Register Tmp = MRI->createVirtualRegister(NewRC);
3970 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3971 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3972 UseMI.getOperand(0).getReg())
3973 .addReg(Tmp, RegState::Kill);
3974 UseMI.getOperand(0).setReg(Tmp);
3975 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3976 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3977 }
3978
3979 // It might happen that UseMI was commuted
3980 // and we now have SGPR as SRC1. If so 2 inlined
3981 // constant and SGPR are illegal.
3983
3984 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3985 if (DeleteDef)
3986 DefMI.eraseFromParent();
3987
3988 return true;
3989 }
3990 }
3991
3992 return false;
3993}
3994
// Compare two lists of decomposed memory base operands for element-wise
// identity; offsets of two accesses are only directly comparable when this
// holds. NOTE(review): the parameter list (presumably two
// ArrayRef<const MachineOperand *> named BaseOps1/BaseOps2) is on lines not
// visible in this view.
static bool
  if (BaseOps1.size() != BaseOps2.size())
    return false;
  for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
    // isIdenticalTo checks operand kind plus register/subreg/imm/etc.
    if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
      return false;
  }
  return true;
}
4006
4007static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4008 LocationSize WidthB, int OffsetB) {
4009 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4010 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4011 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4012 return LowWidth.hasValue() &&
4013 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4014}
4015
4016bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4017 const MachineInstr &MIb) const {
4018 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4019 int64_t Offset0, Offset1;
4020 LocationSize Dummy0 = LocationSize::precise(0);
4021 LocationSize Dummy1 = LocationSize::precise(0);
4022 bool Offset0IsScalable, Offset1IsScalable;
4023 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
4024 Dummy0, &RI) ||
4025 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
4026 Dummy1, &RI))
4027 return false;
4028
4029 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
4030 return false;
4031
4032 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4033 // FIXME: Handle ds_read2 / ds_write2.
4034 return false;
4035 }
4036 LocationSize Width0 = MIa.memoperands().front()->getSize();
4037 LocationSize Width1 = MIb.memoperands().front()->getSize();
4038 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4039}
4040
                                                  const MachineInstr &MIb) const {
  assert(MIa.mayLoadOrStore() &&
         "MIa must load from or modify a memory location");
  assert(MIb.mayLoadOrStore() &&
         "MIb must load from or modify a memory location");

    return false; // NOTE(review): the guarding if-condition for this early
                  // return is on a line not visible in this view.

  // XXX - Can we relax this between address spaces?
  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Be conservative for LDS DMA.
  if (isLDSDMA(MIa) || isLDSDMA(MIb))
    return false;

  // Bundles may contain multiple accesses; they are not analyzed here.
  if (MIa.isBundle() || MIb.isBundle())
    return false;

  // TODO: Should we check the address space from the MachineMemOperand? That
  // would allow us to distinguish objects we know don't alias based on the
  // underlying address space, even if it was lowered to a different one,
  // e.g. private accesses lowered to use MUBUF instructions on a scratch
  // buffer.
  if (isDS(MIa)) {
    if (isDS(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    // DS only conflicts with FLAT when the FLAT access is not segment-bound.
    return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
  }

  if (isMUBUF(MIa) || isMTBUF(MIa)) {
    if (isMUBUF(MIb) || isMTBUF(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    if (isFLAT(MIb))
      return isFLATScratch(MIb);

    return !isSMRD(MIb);
  }

  if (isSMRD(MIa)) {
    if (isSMRD(MIb))
      return checkInstOffsetsDoNotOverlap(MIa, MIb);

    if (isFLAT(MIb))
      return isFLATScratch(MIb);

    return !isMUBUF(MIb) && !isMTBUF(MIb);
  }

  if (isFLAT(MIa)) {
    if (isFLAT(MIb)) {
      // A scratch access and a global access can never refer to the same
      // location.
      if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
          (isFLATGlobal(MIa) && isFLATScratch(MIb)))
        return true;

      return checkInstOffsetsDoNotOverlap(MIa, MIb);
    }

    return false;
  }

  return false;
}
4107
                           int64_t &Imm, MachineInstr **DefMI = nullptr) {
  // Trace Reg back to a foldable-copy def of an immediate. On success stores
  // the immediate in Imm (and the defining instruction in DefMI when
  // requested). NOTE(review): the first signature line (function name and the
  // Reg/MRI parameters) is on a line not visible in this view.
  if (Reg.isPhysical())
    return false;
  auto *Def = MRI.getUniqueVRegDef(Reg);
  if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
    Imm = Def->getOperand(1).getImm();
    if (DefMI)
      *DefMI = Def;
    return true;
  }
  return false;
}
4121
4122static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4123 MachineInstr **DefMI = nullptr) {
4124 if (!MO->isReg())
4125 return false;
4126 const MachineFunction *MF = MO->getParent()->getMF();
4127 const MachineRegisterInfo &MRI = MF->getRegInfo();
4128 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4129}
4130
                                MachineInstr &NewMI) {
  // Transfer LiveVariables kill bookkeeping: any register operand MI kills is
  // recorded as killed by NewMI instead. No-op when LV is null.
  // NOTE(review): the first signature line (presumably LiveVariables *LV and
  // MachineInstr &MI parameters) is on a line not visible in this view.
  if (LV) {
    unsigned NumOps = MI.getNumOperands();
    for (unsigned I = 1; I < NumOps; ++I) {
      MachineOperand &Op = MI.getOperand(I);
      if (Op.isReg() && Op.isKill())
        LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
    }
  }
}
4142
4143static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4144 switch (Opc) {
4145 case AMDGPU::V_MAC_F16_e32:
4146 case AMDGPU::V_MAC_F16_e64:
4147 return AMDGPU::V_MAD_F16_e64;
4148 case AMDGPU::V_MAC_F32_e32:
4149 case AMDGPU::V_MAC_F32_e64:
4150 return AMDGPU::V_MAD_F32_e64;
4151 case AMDGPU::V_MAC_LEGACY_F32_e32:
4152 case AMDGPU::V_MAC_LEGACY_F32_e64:
4153 return AMDGPU::V_MAD_LEGACY_F32_e64;
4154 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4155 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4156 return AMDGPU::V_FMA_LEGACY_F32_e64;
4157 case AMDGPU::V_FMAC_F16_e32:
4158 case AMDGPU::V_FMAC_F16_e64:
4159 case AMDGPU::V_FMAC_F16_t16_e64:
4160 case AMDGPU::V_FMAC_F16_fake16_e64:
4161 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4162 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4163 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4164 : AMDGPU::V_FMA_F16_gfx9_e64;
4165 case AMDGPU::V_FMAC_F32_e32:
4166 case AMDGPU::V_FMAC_F32_e64:
4167 return AMDGPU::V_FMA_F32_e64;
4168 case AMDGPU::V_FMAC_F64_e32:
4169 case AMDGPU::V_FMAC_F64_e64:
4170 return AMDGPU::V_FMA_F64_e64;
4171 default:
4172 llvm_unreachable("invalid instruction");
4173 }
4174}
4175
/// Helper struct for the implementation of 3-address conversion to communicate
/// updates made to instruction operands.
/// Other instruction whose def is no longer used by the converted
/// instruction; convertToThreeAddress cleans this def up after conversion.
};
4183
                                                 LiveVariables *LV,
                                                 LiveIntervals *LIS) const {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineInstr *CandidateMI = &MI;

  // A bundle is converted via its single member instruction.
  if (MI.isBundle()) {
    // This is a temporary placeholder for bundle handling that enables us to
    // exercise the relevant code paths in the two-address instruction pass.
    if (MI.getBundleSize() != 1)
      return nullptr;
    CandidateMI = MI.getNextNode();
  }

  // NOTE(review): the declaration of the ThreeAddressUpdates local U is on a
  // line not visible in this view.
  MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
  if (!NewMI)
    return nullptr;

  // For bundles, splice the converted member out and drop stale tied
  // operands; otherwise migrate liveness information from MI to NewMI.
  if (MI.isBundle()) {
    CandidateMI->eraseFromBundle();

    for (MachineOperand &MO : MI.all_defs()) {
      if (MO.isTied())
        MI.untieRegOperand(MO.getOperandNo());
    }
  } else {
    updateLiveVariables(LV, MI, *NewMI);
    if (LIS) {
      LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
      // SlotIndex of defs needs to be updated when converting to early-clobber
      MachineOperand &Def = NewMI->getOperand(0);
      if (Def.isEarlyClobber() && Def.isReg() &&
          LIS->hasInterval(Def.getReg())) {
        SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
        SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
        auto &LI = LIS->getInterval(Def.getReg());
        auto UpdateDefIndex = [&](LiveRange &LR) {
          auto *S = LR.find(OldIndex);
          if (S != LR.end() && S->start == OldIndex) {
            assert(S->valno && S->valno->def == OldIndex);
            S->start = NewIndex;
            S->valno->def = NewIndex;
          }
        };
        UpdateDefIndex(LI);
        for (auto &SR : LI.subranges())
          UpdateDefIndex(SR);
      }
    }
  }

  // Conversion may have folded away the only use of an immediate-def
  // instruction; neuter that def and repair liveness, as it cannot simply be
  // deleted here.
  if (U.RemoveMIUse) {
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    // The only user is the instruction which will be killed.
    Register DefReg = U.RemoveMIUse->getOperand(0).getReg();

    if (MRI.hasOneNonDBGUse(DefReg)) {
      // We cannot just remove the DefMI here, calling pass will crash.
      U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
      U.RemoveMIUse->getOperand(0).setIsDead(true);
      for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
        U.RemoveMIUse->removeOperand(I);
      if (LV)
        LV->getVarInfo(DefReg).AliveBlocks.clear();
    }

    if (MI.isBundle()) {
      VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
      if (!VRI.Reads && !VRI.Writes) {
        for (MachineOperand &MO : MI.all_uses()) {
          if (MO.isReg() && MO.getReg() == DefReg) {
            assert(MO.getSubReg() == 0 &&
                   "tied sub-registers in bundles currently not supported");
            MI.removeOperand(MO.getOperandNo());
            break;
          }
        }

        if (LIS)
          LIS->shrinkToUses(&LIS->getInterval(DefReg));
      }
    } else if (LIS) {
      LiveInterval &DefLI = LIS->getInterval(DefReg);

      // We cannot delete the original instruction here, so hack out the use
      // in the original instruction with a dummy register so we can use
      // shrinkToUses to deal with any multi-use edge cases. Other targets do
      // not have the complexity of deleting a use to consider here.
      Register DummyReg = MRI.cloneVirtualRegister(DefReg);
      for (MachineOperand &MIOp : MI.uses()) {
        if (MIOp.isReg() && MIOp.getReg() == DefReg) {
          MIOp.setIsUndef(true);
          MIOp.setReg(DummyReg);
        }
      }

      if (MI.isBundle()) {
        VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
        if (!VRI.Reads && !VRI.Writes) {
          for (MachineOperand &MIOp : MI.uses()) {
            if (MIOp.isReg() && MIOp.getReg() == DefReg) {
              MIOp.setIsUndef(true);
              MIOp.setReg(DummyReg);
            }
          }
        }

        MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
                                                false, /*isUndef=*/true));
      }

      LIS->shrinkToUses(&DefLI);
    }
  }

  // The bundle head is updated in place; otherwise hand back NewMI.
  return MI.isBundle() ? &MI : NewMI;
}
4302
SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
                                       ThreeAddressUpdates &U) const {
  MachineBasicBlock &MBB = *MI.getParent();
  unsigned Opc = MI.getOpcode();

  // Handle MFMA.
  int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
  if (NewMFMAOpc != -1) {
    BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
    // NOTE(review): the "MachineInstrBuilder MIB =" prefix of the BuildMI
    // call above is on a line not visible in this view.
    for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
      MIB.add(MI.getOperand(I));
    return MIB;
  }

  // WMMA has a dedicated 2-address -> 3-address opcode mapping.
  if (SIInstrInfo::isWMMA(MI)) {
    unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                                  .setMIFlags(MI.getFlags());
    for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
      MIB->addOperand(MI.getOperand(I));
    return MIB;
  }

  assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
         Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
         "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
         "present pre-RA");

  // Handle MAC/FMAC.
  bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
  bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
                  Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
                  Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
                  Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
  bool Src0Literal = false;

  switch (Opc) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_t16_e64:
  case AMDGPU::V_FMAC_F16_fake16_e64:
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_LEGACY_F32_e64:
  case AMDGPU::V_FMAC_F32_e64:
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
  case AMDGPU::V_FMAC_F64_e64:
    break;
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_MAC_F32_e32:
  case AMDGPU::V_MAC_LEGACY_F32_e32:
  case AMDGPU::V_FMAC_F32_e32:
  case AMDGPU::V_FMAC_LEGACY_F32_e32:
  case AMDGPU::V_FMAC_F64_e32: {
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::src0);
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    if (!Src0->isReg() && !Src0->isImm())
      return nullptr;

    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
      Src0Literal = true;

    break;
  }
  }

  MachineInstrBuilder MIB;
  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src0Mods =
      getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mods =
      getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  const MachineOperand *Src2Mods =
      getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
  const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);

  // With no modifiers/clamp/omod, try folding an immediate def of one of the
  // sources to produce the FMAAK/FMAMK forms that embed the literal.
  if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
      (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
      // If we have an SGPR input, we will violate the constant bus restriction.
      (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
       !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
    MachineInstr *DefMI;

    int64_t Imm;
    if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
      unsigned NewOpc = getNewFMAAKInst(ST, Opc);
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .add(*Src1)
                  .addImm(Imm)
                  .setMIFlags(MI.getFlags());
        U.RemoveMIUse = DefMI;
        return MIB;
      }
    }
    unsigned NewOpc = getNewFMAMKInst(ST, Opc);
    if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .addImm(Imm)
                  .add(*Src2)
                  .setMIFlags(MI.getFlags());
        U.RemoveMIUse = DefMI;
        return MIB;
      }
    }
    if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
      if (Src0Literal) {
        Imm = Src0->getImm();
        DefMI = nullptr;
      }
      if (pseudoToMCOpcode(NewOpc) != -1 &&
              MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
              Src1)) {
        // NOTE(review): the name of the legality-check callee in the
        // condition above is on a line not visible in this view.
        MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src1)
                  .addImm(Imm)
                  .add(*Src2)
                  .setMIFlags(MI.getFlags());
        U.RemoveMIUse = DefMI;
        return MIB;
      }
    }
  }

  // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
  // if VOP3 does not allow a literal operand.
  if (Src0Literal && !ST.hasVOP3Literal())
    return nullptr;

  unsigned NewOpc = getNewFMAInst(ST, Opc);

  if (pseudoToMCOpcode(NewOpc) == -1)
    return nullptr;

  MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
            .add(*Dst)
            .addImm(Src0Mods ? Src0Mods->getImm() : 0)
            .add(*Src0)
            .addImm(Src1Mods ? Src1Mods->getImm() : 0)
            .add(*Src1)
            .addImm(Src2Mods ? Src2Mods->getImm() : 0)
            .add(*Src2)
            .addImm(Clamp ? Clamp->getImm() : 0)
            .addImm(Omod ? Omod->getImm() : 0)
            .setMIFlags(MI.getFlags());
  if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
    MIB.addImm(OpSel ? OpSel->getImm() : 0);
  return MIB;
}
4469
// It's not generally safe to move VALU instructions across these since it will
// start using the register as a base index rather than directly.
// XXX - Why isn't hasSideEffects sufficient for these?
  switch (MI.getOpcode()) { // NOTE(review): enclosing signature not visible
  case AMDGPU::S_SET_GPR_IDX_ON:
  case AMDGPU::S_SET_GPR_IDX_MODE:
  case AMDGPU::S_SET_GPR_IDX_OFF:
    return true;
  default:
    return false;
  }
}
4483
                                      const MachineBasicBlock *MBB,
                                      const MachineFunction &MF) const {
  // Skipping the check for SP writes in the base implementation. The reason it
  // was added was apparently due to compile time concerns.
  //
  // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
  // but is probably avoidable.

  // Copied from base implementation.
  // Terminators and labels can't be scheduled around.
  if (MI.isTerminator() || MI.isPosition())
    return true;

  // INLINEASM_BR can jump to another block
  if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
    return true;

  // A SCHED_BARRIER with a zero mask is treated as a full boundary.
  if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
    return true;

  // Target-independent instructions do not have an implicit-use of EXEC, even
  // when they operate on VGPRs. Treating EXEC modifications as scheduling
  // boundaries prevents incorrect movements of such instructions.
  return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
         MI.getOpcode() == AMDGPU::S_SETPRIO ||
         MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG || // NOTE(review): the
         // final operand of this disjunction is on a line not visible here.
}
4515
  // NOTE(review): the enclosing function's signature is on a line not visible
  // in this view.
  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
4521
  // Instructions that access scratch use FLAT encoding or BUF encodings.
  if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
    return false;

  // SCRATCH instructions always access scratch.
  if (isFLATScratch(MI))
    return true;

  // If FLAT_SCRATCH registers are not initialized, we can never access scratch
  // via the aperture.
  if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access scratch.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves scratch.
  return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUAS::FLAT_ADDRESS) {
      // A generic FLAT access is assumed to reach scratch unless metadata
      // excludes the private address space.
      const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
      return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
                        *MD, AMDGPUAS::PRIVATE_ADDRESS);
    }
    return AS == AMDGPUAS::PRIVATE_ADDRESS;
  });
}
4552
  assert(isFLAT(MI));

  // All flat instructions use the VMEM counter except prefetch.
  if (!usesVM_CNT(MI))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access VMEM.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
  // (GDS) address space is not supported by flat operations. Therefore, simply
  // return true unless only the LDS address space is found.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS != AMDGPUAS::LOCAL_ADDRESS)
      return true; // anything other than LDS involves VMEM
  }

  return false;
}
4579
  assert(isFLAT(MI));

  // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
  if (!usesLGKM_CNT(MI))
    return false;

  // If in tgsplit mode then there can be no use of LDS.
  if (ST.isTgSplitEnabled())
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access LDS.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves LDS.
  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
      return true; // NOTE(review): the guarding if-condition (presumably an
                   // address-space test on AS) is not visible in this view.
  }

  return false;
}
4605
  // Skip the full operand and register alias search modifiesRegister
  // does. There's only a handful of instructions that touch this, it's only an
  // implicit def, and doesn't alias any other registers.
  // I.e. true iff MI's descriptor lists MODE among its implicit defs.
  return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
}
4612
  // NOTE(review): the enclosing function's signature is on a line not visible
  // in this view.
  unsigned Opcode = MI.getOpcode();

  if (MI.mayStore() && isSMRD(MI))
    return true; // scalar store or atomic

  // This will terminate the function when other lanes may need to continue.
  if (MI.isReturn())
    return true;

  // These instructions cause shader I/O that may cause hardware lockups
  // when executed with an empty EXEC mask.
  //
  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
  // EXEC = 0, but checking for that case here seems not worth it
  // given the typical code patterns.
  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
      isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
      Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
      Opcode == AMDGPU::S_SETHALT)
    return true;

  if (MI.isCall() || MI.isInlineAsm())
    return true; // conservative assumption

  // Assume that barrier interactions are only intended with active lanes.
  if (isBarrier(Opcode))
    return true;

  // A mode change is a scalar operation that influences vector instructions.
    return true; // NOTE(review): the guarding if-condition for this return
                 // (presumably a MODE-change check) is not visible here.

  // These are like SALU instructions in terms of effects, so it's questionable
  // whether we should return true for those.
  //
  // However, executing them with EXEC = 0 causes them to operate on undefined
  // data, which we avoid by returning true here.
  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
      Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
      Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
      Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
    return true;

  return false;
}
4659
                              const MachineInstr &MI) const {
  if (MI.isMetaInstruction())
    return false;

  // This won't read exec if this is an SGPR->SGPR copy.
  if (MI.isCopyLike()) {
    if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
      return true;

    // Make sure this isn't copying exec as a normal operand
    return MI.readsRegister(AMDGPU::EXEC, &RI);
  }

  // Make a conservative assumption about the callee.
  if (MI.isCall())
    return true;

  // Be conservative with any unhandled generic opcodes.
  if (!isTargetSpecificOpcode(MI.getOpcode()))
    return true;

  // Non-SALU instructions are assumed to depend on EXEC; SALU only reads it
  // when EXEC appears as an explicit operand.
  return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
}
4684
4685bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4686 switch (Imm.getBitWidth()) {
4687 case 1: // This likely will be a condition code mask.
4688 return true;
4689
4690 case 32:
4691 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4692 ST.hasInv2PiInlineImm());
4693 case 64:
4694 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4695 ST.hasInv2PiInlineImm());
4696 case 16:
4697 return ST.has16BitInsts() &&
4698 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4699 ST.hasInv2PiInlineImm());
4700 default:
4701 llvm_unreachable("invalid bitwidth");
4702 }
4703}
4704
  // Decide inline-constant legality for a floating-point literal by
  // dispatching on its semantics. NOTE(review): the case labels of the switch
  // below (the specific fltSemantics enumerators) are on lines not visible in
  // this view; the groups are identified only by their return expressions.
  APInt IntImm = Imm.bitcastToAPInt();
  int64_t IntImmVal = IntImm.getSExtValue();
  bool HasInv2Pi = ST.hasInv2PiInlineImm();
  switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
  default:
    llvm_unreachable("invalid fltSemantics");
    return isInlineConstant(IntImm);
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
    return ST.has16BitInsts() &&
           AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
  }
}
4723
/// Returns true if Imm can be encoded as an inline constant for an operand of
/// the given AMDGPU operand type.
/// NOTE(review): the case-label lines of the switch below are not visible in
/// this view; each group is identified here only by its return expression.
bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
  // MachineOperand provides no way to tell the true operand size, since it only
  // records a 64-bit value. We need to know the size to determine if a 32-bit
  // floating point immediate bit pattern is legal for an integer immediate. It
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
  switch (OperandType) {
    int32_t Trunc = static_cast<int32_t>(Imm);
    return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
  }
    return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
    // We would expect inline immediates to not be concerned with an integer/fp
    // distinction. However, in the case of 16-bit integer operations, the
    // "floating point" values appear to not work. It seems read the low 16-bits
    // of 32-bit immediates, which happens to always work for the integer
    // values.
    //
    // See llvm bugzilla 46302.
    //
    // TODO: Theoretically we could use op-sel to use the high bits of the
    // 32-bit FP values.
    return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
    return false;
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      // A few special case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
    }

    return false;
  }
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
    }
    return false;
  }
    return false;
    return isLegalAV64PseudoImm(Imm);
    // Always embedded in the instruction for free.
    return true;
    // Just ignore anything else.
    return true;
  default:
    llvm_unreachable("invalid operand type");
  }
}
4822
// Shallow structural equality for two machine operands of the same kind;
// only register and immediate operands are expected here.
// NOTE(review): the case labels of the switch below are on lines not visible
// in this view.
static bool compareMachineOp(const MachineOperand &Op0,
                             const MachineOperand &Op1) {
  if (Op0.getType() != Op1.getType())
    return false;

  switch (Op0.getType()) {
    return Op0.getReg() == Op1.getReg();
    return Op0.getImm() == Op1.getImm();
  default:
    llvm_unreachable("Didn't expect to be comparing these operand types");
  }
}
4837
                                         const MCOperandInfo &OpInfo) const {
  // Whether a (non-inline) literal constant may be used for this operand.
  // NOTE(review): the first signature line is not visible in this view.
  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
    return true;

  if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
    return false;

  // Non-VOP3 instructions (or non-src operands) can always take the literal.
  if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
    return true;

  // VOP3 source operands only take a literal on subtargets with VOP3 literal
  // support.
  return ST.hasVOP3Literal();
}
4851
4852bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4853 int64_t ImmVal) const {
4854 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4855 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4856 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4857 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4858 AMDGPU::OpName::src2))
4859 return false;
4860 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4861 }
4862
4863 return isLiteralOperandLegal(InstDesc, OpInfo);
4864}
4865
4866bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4867 const MachineOperand &MO) const {
4868 if (MO.isImm())
4869 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4870
4871 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4872 "unexpected imm-like operand kind");
4873 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4874 return isLiteralOperandLegal(InstDesc, OpInfo);
4875}
4876
4878 // 2 32-bit inline constants packed into one.
4879 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4880 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4881}
4882
4883bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4884 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4885 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4886 return false;
4887
4888 int Op32 = AMDGPU::getVOPe32(Opcode);
4889 if (Op32 == -1)
4890 return false;
4891
4892 return pseudoToMCOpcode(Op32) != -1;
4893}
4894
4895bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4896 // The src0_modifier operand is present on all instructions
4897 // that have modifiers.
4898
4899 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4900}
4901
4903 AMDGPU::OpName OpName) const {
4904 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4905 return Mods && Mods->getImm();
4906}
4907
4909 return any_of(ModifierOpNames,
4910 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4911}
4912
4914 const MachineRegisterInfo &MRI) const {
4915 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4916 // Can't shrink instruction with three operands.
4917 if (Src2) {
4918 switch (MI.getOpcode()) {
4919 default: return false;
4920
4921 case AMDGPU::V_ADDC_U32_e64:
4922 case AMDGPU::V_SUBB_U32_e64:
4923 case AMDGPU::V_SUBBREV_U32_e64: {
4924 const MachineOperand *Src1
4925 = getNamedOperand(MI, AMDGPU::OpName::src1);
4926 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4927 return false;
4928 // Additional verification is needed for sdst/src2.
4929 return true;
4930 }
4931 case AMDGPU::V_MAC_F16_e64:
4932 case AMDGPU::V_MAC_F32_e64:
4933 case AMDGPU::V_MAC_LEGACY_F32_e64:
4934 case AMDGPU::V_FMAC_F16_e64:
4935 case AMDGPU::V_FMAC_F16_t16_e64:
4936 case AMDGPU::V_FMAC_F16_fake16_e64:
4937 case AMDGPU::V_FMAC_F32_e64:
4938 case AMDGPU::V_FMAC_F64_e64:
4939 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4940 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4941 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4942 return false;
4943 break;
4944
4945 case AMDGPU::V_CNDMASK_B32_e64:
4946 break;
4947 }
4948 }
4949
4950 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4951 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4952 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4953 return false;
4954
4955 // We don't need to check src0, all input types are legal, so just make sure
4956 // src0 isn't using any modifiers.
4957 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4958 return false;
4959
4960 // Can it be shrunk to a valid 32 bit opcode?
4961 if (!hasVALU32BitEncoding(MI.getOpcode()))
4962 return false;
4963
4964 // Check output modifiers
4965 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4966 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4967 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4968 // TODO: Can we avoid checking bound_ctrl/fi here?
4969 // They are only used by permlane*_swap special case.
4970 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4971 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4972}
4973
4974// Set VCC operand with all flags from \p Orig, except for setting it as
4975// implicit.
4977 const MachineOperand &Orig) {
4978
4979 for (MachineOperand &Use : MI.implicit_operands()) {
4980 if (Use.isUse() &&
4981 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4982 Use.setIsUndef(Orig.isUndef());
4983 Use.setIsKill(Orig.isKill());
4984 return;
4985 }
4986 }
4987}
4988
4990 unsigned Op32) const {
4991 MachineBasicBlock *MBB = MI.getParent();
4992
4993 const MCInstrDesc &Op32Desc = get(Op32);
4994 MachineInstrBuilder Inst32 =
4995 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4996 .setMIFlags(MI.getFlags());
4997
4998 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4999 // For VOPC instructions, this is replaced by an implicit def of vcc.
5000
5001 // We assume the defs of the shrunk opcode are in the same order, and the
5002 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5003 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5004 Inst32.add(MI.getOperand(I));
5005
5006 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
5007
5008 int Idx = MI.getNumExplicitDefs();
5009 for (const MachineOperand &Use : MI.explicit_uses()) {
5010 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
5012 continue;
5013
5014 if (&Use == Src2) {
5015 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
5016 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5017 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5018 // of vcc was already added during the initial BuildMI, but we
5019 // 1) may need to change vcc to vcc_lo to preserve the original register
5020 // 2) have to preserve the original flags.
5021 copyFlagsToImplicitVCC(*Inst32, *Src2);
5022 continue;
5023 }
5024 }
5025
5026 Inst32.add(Use);
5027 }
5028
5029 // FIXME: Losing implicit operands
5030 fixImplicitOperands(*Inst32);
5031 return Inst32;
5032}
5033
5035 // Null is free
5036 Register Reg = RegOp.getReg();
5037 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5038 return false;
5039
5040 // SGPRs use the constant bus
5041
5042 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5043 // physical register operands should also count, except for exec.
5044 if (RegOp.isImplicit())
5045 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5046
5047 // SGPRs use the constant bus
5048 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5049 AMDGPU::SReg_64RegClass.contains(Reg);
5050}
5051
5053 const MachineRegisterInfo &MRI) const {
5054 Register Reg = RegOp.getReg();
5055 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5056 : physRegUsesConstantBus(RegOp);
5057}
5058
5060 const MachineOperand &MO,
5061 const MCOperandInfo &OpInfo) const {
5062 // Literal constants use the constant bus.
5063 if (!MO.isReg())
5064 return !isInlineConstant(MO, OpInfo);
5065
5066 Register Reg = MO.getReg();
5067 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5069}
5070
5072 for (const MachineOperand &MO : MI.implicit_operands()) {
5073 // We only care about reads.
5074 if (MO.isDef())
5075 continue;
5076
5077 switch (MO.getReg()) {
5078 case AMDGPU::VCC:
5079 case AMDGPU::VCC_LO:
5080 case AMDGPU::VCC_HI:
5081 case AMDGPU::M0:
5082 case AMDGPU::FLAT_SCR:
5083 return MO.getReg();
5084
5085 default:
5086 break;
5087 }
5088 }
5089
5090 return Register();
5091}
5092
5093static bool shouldReadExec(const MachineInstr &MI) {
5094 if (SIInstrInfo::isVALU(MI)) {
5095 switch (MI.getOpcode()) {
5096 case AMDGPU::V_READLANE_B32:
5097 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5098 case AMDGPU::V_WRITELANE_B32:
5099 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5100 return false;
5101 }
5102
5103 return true;
5104 }
5105
5106 if (MI.isPreISelOpcode() ||
5107 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5110 return false;
5111
5112 return true;
5113}
5114
5115static bool isRegOrFI(const MachineOperand &MO) {
5116 return MO.isReg() || MO.isFI();
5117}
5118
5119static bool isSubRegOf(const SIRegisterInfo &TRI,
5120 const MachineOperand &SuperVec,
5121 const MachineOperand &SubReg) {
5122 if (SubReg.getReg().isPhysical())
5123 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5124
5125 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5126 SubReg.getReg() == SuperVec.getReg();
5127}
5128
5129// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5130bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5131 const MachineRegisterInfo &MRI,
5132 StringRef &ErrInfo) const {
5133 Register DstReg = MI.getOperand(0).getReg();
5134 Register SrcReg = MI.getOperand(1).getReg();
5135 // This is a check for copy from vector register to SGPR
5136 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5137 ErrInfo = "illegal copy from vector register to SGPR";
5138 return false;
5139 }
5140 return true;
5141}
5142
5144 StringRef &ErrInfo) const {
5145 uint32_t Opcode = MI.getOpcode();
5146 const MachineFunction *MF = MI.getMF();
5147 const MachineRegisterInfo &MRI = MF->getRegInfo();
5148
5149 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5150 // Find a better property to recognize the point where instruction selection
5151 // is just done.
5152 // We can only enforce this check after SIFixSGPRCopies pass so that the
5153 // illegal copies are legalized and thereafter we don't expect a pass
5154 // inserting similar copies.
5155 if (!MRI.isSSA() && MI.isCopy())
5156 return verifyCopy(MI, MRI, ErrInfo);
5157
5158 if (SIInstrInfo::isGenericOpcode(Opcode))
5159 return true;
5160
5161 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5162 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5163 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5164 int Src3Idx = -1;
5165 if (Src0Idx == -1) {
5166 // VOPD V_DUAL_* instructions use different operand names.
5167 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5168 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5169 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5170 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5171 }
5172
5173 // Make sure the number of operands is correct.
5174 const MCInstrDesc &Desc = get(Opcode);
5175 if (!Desc.isVariadic() &&
5176 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5177 ErrInfo = "Instruction has wrong number of operands.";
5178 return false;
5179 }
5180
5181 if (MI.isInlineAsm()) {
5182 // Verify register classes for inlineasm constraints.
5183 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5184 I != E; ++I) {
5185 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5186 if (!RC)
5187 continue;
5188
5189 const MachineOperand &Op = MI.getOperand(I);
5190 if (!Op.isReg())
5191 continue;
5192
5193 Register Reg = Op.getReg();
5194 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5195 ErrInfo = "inlineasm operand has incorrect register class.";
5196 return false;
5197 }
5198 }
5199
5200 return true;
5201 }
5202
5203 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5204 ErrInfo = "missing memory operand from image instruction.";
5205 return false;
5206 }
5207
5208 // Make sure the register classes are correct.
5209 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5210 const MachineOperand &MO = MI.getOperand(i);
5211 if (MO.isFPImm()) {
5212 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5213 "all fp values to integers.";
5214 return false;
5215 }
5216
5217 const MCOperandInfo &OpInfo = Desc.operands()[i];
5218 int16_t RegClass = getOpRegClassID(OpInfo);
5219
5220 switch (OpInfo.OperandType) {
5222 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5223 ErrInfo = "Illegal immediate value for operand.";
5224 return false;
5225 }
5226 break;
5240 break;
5242 break;
5243 break;
5257 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5258 ErrInfo = "Illegal immediate value for operand.";
5259 return false;
5260 }
5261 break;
5262 }
5264 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5265 ErrInfo = "Expected inline constant for operand.";
5266 return false;
5267 }
5268 break;
5272 break;
5277 // Check if this operand is an immediate.
5278 // FrameIndex operands will be replaced by immediates, so they are
5279 // allowed.
5280 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5281 ErrInfo = "Expected immediate, but got non-immediate";
5282 return false;
5283 }
5284 break;
5288 break;
5289 default:
5290 if (OpInfo.isGenericType())
5291 continue;
5292 break;
5293 }
5294
5295 if (!MO.isReg())
5296 continue;
5297 Register Reg = MO.getReg();
5298 if (!Reg)
5299 continue;
5300
5301 // FIXME: Ideally we would have separate instruction definitions with the
5302 // aligned register constraint.
5303 // FIXME: We do not verify inline asm operands, but custom inline asm
5304 // verification is broken anyway
5305 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5306 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5307 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5308 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5309 if (const TargetRegisterClass *SubRC =
5310 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5311 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5312 if (RC)
5313 RC = SubRC;
5314 }
5315 }
5316
5317 // Check that this is the aligned version of the class.
5318 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5319 ErrInfo = "Subtarget requires even aligned vector registers";
5320 return false;
5321 }
5322 }
5323
5324 if (RegClass != -1) {
5325 if (Reg.isVirtual())
5326 continue;
5327
5328 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5329 if (!RC->contains(Reg)) {
5330 ErrInfo = "Operand has incorrect register class.";
5331 return false;
5332 }
5333 }
5334 }
5335
5336 // Verify SDWA
5337 if (isSDWA(MI)) {
5338 if (!ST.hasSDWA()) {
5339 ErrInfo = "SDWA is not supported on this target";
5340 return false;
5341 }
5342
5343 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5344 AMDGPU::OpName::dst_sel}) {
5345 const MachineOperand *MO = getNamedOperand(MI, Op);
5346 if (!MO)
5347 continue;
5348 int64_t Imm = MO->getImm();
5349 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5350 ErrInfo = "Invalid SDWA selection";
5351 return false;
5352 }
5353 }
5354
5355 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5356
5357 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5358 if (OpIdx == -1)
5359 continue;
5360 const MachineOperand &MO = MI.getOperand(OpIdx);
5361
5362 if (!ST.hasSDWAScalar()) {
5363 // Only VGPRS on VI
5364 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5365 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5366 return false;
5367 }
5368 } else {
5369 // No immediates on GFX9
5370 if (!MO.isReg()) {
5371 ErrInfo =
5372 "Only reg allowed as operands in SDWA instructions on GFX9+";
5373 return false;
5374 }
5375 }
5376 }
5377
5378 if (!ST.hasSDWAOmod()) {
5379 // No omod allowed on VI
5380 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5381 if (OMod != nullptr &&
5382 (!OMod->isImm() || OMod->getImm() != 0)) {
5383 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5384 return false;
5385 }
5386 }
5387
5388 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5389 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5390 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5391 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5392 const MachineOperand *Src0ModsMO =
5393 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5394 unsigned Mods = Src0ModsMO->getImm();
5395 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5396 Mods & SISrcMods::SEXT) {
5397 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5398 return false;
5399 }
5400 }
5401
5402 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5403 if (isVOPC(BasicOpcode)) {
5404 if (!ST.hasSDWASdst() && DstIdx != -1) {
5405 // Only vcc allowed as dst on VI for VOPC
5406 const MachineOperand &Dst = MI.getOperand(DstIdx);
5407 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5408 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5409 return false;
5410 }
5411 } else if (!ST.hasSDWAOutModsVOPC()) {
5412 // No clamp allowed on GFX9 for VOPC
5413 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5414 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5415 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5416 return false;
5417 }
5418
5419 // No omod allowed on GFX9 for VOPC
5420 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5421 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5422 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5423 return false;
5424 }
5425 }
5426 }
5427
5428 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5429 if (DstUnused && DstUnused->isImm() &&
5430 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5431 const MachineOperand &Dst = MI.getOperand(DstIdx);
5432 if (!Dst.isReg() || !Dst.isTied()) {
5433 ErrInfo = "Dst register should have tied register";
5434 return false;
5435 }
5436
5437 const MachineOperand &TiedMO =
5438 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5439 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5440 ErrInfo =
5441 "Dst register should be tied to implicit use of preserved register";
5442 return false;
5443 }
5444 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5445 ErrInfo = "Dst register should use same physical register as preserved";
5446 return false;
5447 }
5448 }
5449 }
5450
5451 // Verify MIMG / VIMAGE / VSAMPLE
5452 if (isImage(Opcode) && !MI.mayStore()) {
5453 // Ensure that the return type used is large enough for all the options
5454 // being used TFE/LWE require an extra result register.
5455 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5456 if (DMask) {
5457 uint64_t DMaskImm = DMask->getImm();
5458 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5459 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5460 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5461 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5462
5463 // Adjust for packed 16 bit values
5464 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5465 RegCount = divideCeil(RegCount, 2);
5466
5467 // Adjust if using LWE or TFE
5468 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5469 RegCount += 1;
5470
5471 const uint32_t DstIdx =
5472 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5473 const MachineOperand &Dst = MI.getOperand(DstIdx);
5474 if (Dst.isReg()) {
5475 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5476 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5477 if (RegCount > DstSize) {
5478 ErrInfo = "Image instruction returns too many registers for dst "
5479 "register class";
5480 return false;
5481 }
5482 }
5483 }
5484 }
5485
5486 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5487 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5488 unsigned ConstantBusCount = 0;
5489 bool UsesLiteral = false;
5490 const MachineOperand *LiteralVal = nullptr;
5491
5492 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5493 if (ImmIdx != -1) {
5494 ++ConstantBusCount;
5495 UsesLiteral = true;
5496 LiteralVal = &MI.getOperand(ImmIdx);
5497 }
5498
5499 SmallVector<Register, 2> SGPRsUsed;
5500 Register SGPRUsed;
5501
5502 // Only look at the true operands. Only a real operand can use the constant
5503 // bus, and we don't want to check pseudo-operands like the source modifier
5504 // flags.
5505 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5506 if (OpIdx == -1)
5507 continue;
5508 const MachineOperand &MO = MI.getOperand(OpIdx);
5509 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5510 if (MO.isReg()) {
5511 SGPRUsed = MO.getReg();
5512 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5513 ++ConstantBusCount;
5514 SGPRsUsed.push_back(SGPRUsed);
5515 }
5516 } else if (!MO.isFI()) { // Treat FI like a register.
5517 if (!UsesLiteral) {
5518 ++ConstantBusCount;
5519 UsesLiteral = true;
5520 LiteralVal = &MO;
5521 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5522 assert(isVOP2(MI) || isVOP3(MI));
5523 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5524 return false;
5525 }
5526 }
5527 }
5528 }
5529
5530 SGPRUsed = findImplicitSGPRRead(MI);
5531 if (SGPRUsed) {
5532 // Implicit uses may safely overlap true operands
5533 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5534 return !RI.regsOverlap(SGPRUsed, SGPR);
5535 })) {
5536 ++ConstantBusCount;
5537 SGPRsUsed.push_back(SGPRUsed);
5538 }
5539 }
5540
5541 // v_writelane_b32 is an exception from constant bus restriction:
5542 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5543 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5544 Opcode != AMDGPU::V_WRITELANE_B32) {
5545 ErrInfo = "VOP* instruction violates constant bus restriction";
5546 return false;
5547 }
5548
5549 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5550 ErrInfo = "VOP3 instruction uses literal";
5551 return false;
5552 }
5553 }
5554
5555 // Special case for writelane - this can break the multiple constant bus rule,
5556 // but still can't use more than one SGPR register
5557 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5558 unsigned SGPRCount = 0;
5559 Register SGPRUsed;
5560
5561 for (int OpIdx : {Src0Idx, Src1Idx}) {
5562 if (OpIdx == -1)
5563 break;
5564
5565 const MachineOperand &MO = MI.getOperand(OpIdx);
5566
5567 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5568 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5569 if (MO.getReg() != SGPRUsed)
5570 ++SGPRCount;
5571 SGPRUsed = MO.getReg();
5572 }
5573 }
5574 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5575 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5576 return false;
5577 }
5578 }
5579 }
5580
5581 // Verify misc. restrictions on specific instructions.
5582 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5583 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5584 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5585 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5586 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5587 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5588 if (!compareMachineOp(Src0, Src1) &&
5589 !compareMachineOp(Src0, Src2)) {
5590 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5591 return false;
5592 }
5593 }
5594 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5595 SISrcMods::ABS) ||
5596 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5597 SISrcMods::ABS) ||
5598 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5599 SISrcMods::ABS)) {
5600 ErrInfo = "ABS not allowed in VOP3B instructions";
5601 return false;
5602 }
5603 }
5604
5605 if (isSOP2(MI) || isSOPC(MI)) {
5606 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5607 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5608
5609 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5610 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5611 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5612 !Src0.isIdenticalTo(Src1)) {
5613 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5614 return false;
5615 }
5616 }
5617
5618 if (isSOPK(MI)) {
5619 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5620 if (Desc.isBranch()) {
5621 if (!Op->isMBB()) {
5622 ErrInfo = "invalid branch target for SOPK instruction";
5623 return false;
5624 }
5625 } else {
5626 uint64_t Imm = Op->getImm();
5627 if (sopkIsZext(Opcode)) {
5628 if (!isUInt<16>(Imm)) {
5629 ErrInfo = "invalid immediate for SOPK instruction";
5630 return false;
5631 }
5632 } else {
5633 if (!isInt<16>(Imm)) {
5634 ErrInfo = "invalid immediate for SOPK instruction";
5635 return false;
5636 }
5637 }
5638 }
5639 }
5640
5641 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5642 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5643 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5644 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5645 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5646 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5647
5648 const unsigned StaticNumOps =
5649 Desc.getNumOperands() + Desc.implicit_uses().size();
5650 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5651
5652 // Require additional implicit operands. This allows a fixup done by the
5653 // post RA scheduler where the main implicit operand is killed and
5654 // implicit-defs are added for sub-registers that remain live after this
5655 // instruction.
5656 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5657 ErrInfo = "missing implicit register operands";
5658 return false;
5659 }
5660
5661 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5662 if (IsDst) {
5663 if (!Dst->isUse()) {
5664 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5665 return false;
5666 }
5667
5668 unsigned UseOpIdx;
5669 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5670 UseOpIdx != StaticNumOps + 1) {
5671 ErrInfo = "movrel implicit operands should be tied";
5672 return false;
5673 }
5674 }
5675
5676 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5677 const MachineOperand &ImpUse
5678 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5679 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5680 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5681 ErrInfo = "src0 should be subreg of implicit vector use";
5682 return false;
5683 }
5684 }
5685
5686 // Make sure we aren't losing exec uses in the td files. This mostly requires
5687 // being careful when using let Uses to try to add other use registers.
5688 if (shouldReadExec(MI)) {
5689 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5690 ErrInfo = "VALU instruction does not implicitly read exec mask";
5691 return false;
5692 }
5693 }
5694
5695 if (isSMRD(MI)) {
5696 if (MI.mayStore() &&
5697 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5698 // The register offset form of scalar stores may only use m0 as the
5699 // soffset register.
5700 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5701 if (Soff && Soff->getReg() != AMDGPU::M0) {
5702 ErrInfo = "scalar stores must use m0 as offset register";
5703 return false;
5704 }
5705 }
5706 }
5707
5708 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5709 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5710 if (Offset->getImm() != 0) {
5711 ErrInfo = "subtarget does not support offsets in flat instructions";
5712 return false;
5713 }
5714 }
5715
5716 if (isDS(MI) && !ST.hasGDS()) {
5717 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5718 if (GDSOp && GDSOp->getImm() != 0) {
5719 ErrInfo = "GDS is not supported on this subtarget";
5720 return false;
5721 }
5722 }
5723
5724 if (isImage(MI)) {
5725 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5726 if (DimOp) {
5727 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5728 AMDGPU::OpName::vaddr0);
5729 AMDGPU::OpName RSrcOpName =
5730 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5731 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5732 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5733 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5734 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5735 const AMDGPU::MIMGDimInfo *Dim =
5737
5738 if (!Dim) {
5739 ErrInfo = "dim is out of range";
5740 return false;
5741 }
5742
5743 bool IsA16 = false;
5744 if (ST.hasR128A16()) {
5745 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5746 IsA16 = R128A16->getImm() != 0;
5747 } else if (ST.hasA16()) {
5748 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5749 IsA16 = A16->getImm() != 0;
5750 }
5751
5752 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5753
5754 unsigned AddrWords =
5755 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5756
5757 unsigned VAddrWords;
5758 if (IsNSA) {
5759 VAddrWords = RsrcIdx - VAddr0Idx;
5760 if (ST.hasPartialNSAEncoding() &&
5761 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5762 unsigned LastVAddrIdx = RsrcIdx - 1;
5763 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5764 }
5765 } else {
5766 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5767 if (AddrWords > 12)
5768 AddrWords = 16;
5769 }
5770
5771 if (VAddrWords != AddrWords) {
5772 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5773 << " but got " << VAddrWords << "\n");
5774 ErrInfo = "bad vaddr size";
5775 return false;
5776 }
5777 }
5778 }
5779
5780 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5781 if (DppCt) {
5782 using namespace AMDGPU::DPP;
5783
5784 unsigned DC = DppCt->getImm();
5785 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5786 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5787 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5788 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5789 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5790 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5791 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5792 ErrInfo = "Invalid dpp_ctrl value";
5793 return false;
5794 }
5795 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5796 !ST.hasDPPWavefrontShifts()) {
5797 ErrInfo = "Invalid dpp_ctrl value: "
5798 "wavefront shifts are not supported on GFX10+";
5799 return false;
5800 }
5801 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5802 !ST.hasDPPBroadcasts()) {
5803 ErrInfo = "Invalid dpp_ctrl value: "
5804 "broadcasts are not supported on GFX10+";
5805 return false;
5806 }
5807 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5808 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5809 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5810 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5811 !ST.hasGFX90AInsts()) {
5812 ErrInfo = "Invalid dpp_ctrl value: "
5813 "row_newbroadcast/row_share is not supported before "
5814 "GFX90A/GFX10";
5815 return false;
5816 }
5817 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5818 ErrInfo = "Invalid dpp_ctrl value: "
5819 "row_share and row_xmask are not supported before GFX10";
5820 return false;
5821 }
5822 }
5823
5824 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5826 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5827 ErrInfo = "Invalid dpp_ctrl value: "
5828 "DP ALU dpp only support row_newbcast";
5829 return false;
5830 }
5831 }
5832
5833 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5834 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5835 AMDGPU::OpName DataName =
5836 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5837 const MachineOperand *Data = getNamedOperand(MI, DataName);
5838 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5839 if (Data && !Data->isReg())
5840 Data = nullptr;
5841
5842 if (ST.hasGFX90AInsts()) {
5843 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5844 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5845 ErrInfo = "Invalid register class: "
5846 "vdata and vdst should be both VGPR or AGPR";
5847 return false;
5848 }
5849 if (Data && Data2 &&
5850 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5851 ErrInfo = "Invalid register class: "
5852 "both data operands should be VGPR or AGPR";
5853 return false;
5854 }
5855 } else {
5856 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5857 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5858 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5859 ErrInfo = "Invalid register class: "
5860 "agpr loads and stores not supported on this GPU";
5861 return false;
5862 }
5863 }
5864 }
5865
5866 if (ST.needsAlignedVGPRs()) {
5867 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5869 if (!Op)
5870 return true;
5871 Register Reg = Op->getReg();
5872 if (Reg.isPhysical())
5873 return !(RI.getHWRegIndex(Reg) & 1);
5874 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5875 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5876 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5877 };
5878
5879 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5880 Opcode == AMDGPU::DS_GWS_BARRIER) {
5881
5882 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5883 ErrInfo = "Subtarget requires even aligned vector registers "
5884 "for DS_GWS instructions";
5885 return false;
5886 }
5887 }
5888
5889 if (isMIMG(MI)) {
5890 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5891 ErrInfo = "Subtarget requires even aligned vector registers "
5892 "for vaddr operand of image instructions";
5893 return false;
5894 }
5895 }
5896 }
5897
5898 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5899 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5900 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5901 ErrInfo = "Invalid register class: "
5902 "v_accvgpr_write with an SGPR is not supported on this GPU";
5903 return false;
5904 }
5905 }
5906
5907 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5908 const MachineOperand &SrcOp = MI.getOperand(1);
5909 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5910 ErrInfo = "pseudo expects only physical SGPRs";
5911 return false;
5912 }
5913 }
5914
5915 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5916 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5917 if (!ST.hasScaleOffset()) {
5918 ErrInfo = "Subtarget does not support offset scaling";
5919 return false;
5920 }
5921 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5922 ErrInfo = "Instruction does not support offset scaling";
5923 return false;
5924 }
5925 }
5926 }
5927
5928 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5929 // information.
5930 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5931 for (unsigned I = 0; I < 3; ++I) {
5933 return false;
5934 }
5935 }
5936
5937 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5938 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5939 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5940 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5941 &AMDGPU::SReg_64RegClass) ||
5942 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5943 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5944 return false;
5945 }
5946 }
5947
5948 return true;
5949}
5950
5951// It is more readable to list mapped opcodes on the same line.
5952// clang-format off
5953
5955 switch (MI.getOpcode()) {
5956 default: return AMDGPU::INSTRUCTION_LIST_END;
5957 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5958 case AMDGPU::COPY: return AMDGPU::COPY;
5959 case AMDGPU::PHI: return AMDGPU::PHI;
5960 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5961 case AMDGPU::WQM: return AMDGPU::WQM;
5962 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5963 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5964 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5965 case AMDGPU::S_MOV_B32: {
5966 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5967 return MI.getOperand(1).isReg() ||
5968 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5969 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5970 }
5971 case AMDGPU::S_ADD_I32:
5972 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5973 case AMDGPU::S_ADDC_U32:
5974 return AMDGPU::V_ADDC_U32_e32;
5975 case AMDGPU::S_SUB_I32:
5976 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5977 // FIXME: These are not consistently handled, and selected when the carry is
5978 // used.
5979 case AMDGPU::S_ADD_U32:
5980 return AMDGPU::V_ADD_CO_U32_e32;
5981 case AMDGPU::S_SUB_U32:
5982 return AMDGPU::V_SUB_CO_U32_e32;
5983 case AMDGPU::S_ADD_U64_PSEUDO:
5984 return AMDGPU::V_ADD_U64_PSEUDO;
5985 case AMDGPU::S_SUB_U64_PSEUDO:
5986 return AMDGPU::V_SUB_U64_PSEUDO;
5987 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5988 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5989 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5990 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5991 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5992 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5993 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5994 case AMDGPU::S_XNOR_B32:
5995 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5996 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5997 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5998 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5999 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6000 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6001 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6002 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6003 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6004 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6005 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6006 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6007 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6008 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6009 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6010 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6011 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6012 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6013 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6014 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6015 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6016 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6017 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6018 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6019 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6020 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6021 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6022 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6023 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6024 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6025 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6026 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6027 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6028 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6029 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6030 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6031 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6032 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6033 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6034 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6035 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6036 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6037 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6038 case AMDGPU::S_CVT_F32_F16:
6039 case AMDGPU::S_CVT_HI_F32_F16:
6040 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6041 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6042 case AMDGPU::S_CVT_F16_F32:
6043 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6044 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6045 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6046 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6047 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6048 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6049 case AMDGPU::S_CEIL_F16:
6050 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6051 : AMDGPU::V_CEIL_F16_fake16_e64;
6052 case AMDGPU::S_FLOOR_F16:
6053 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6054 : AMDGPU::V_FLOOR_F16_fake16_e64;
6055 case AMDGPU::S_TRUNC_F16:
6056 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6057 : AMDGPU::V_TRUNC_F16_fake16_e64;
6058 case AMDGPU::S_RNDNE_F16:
6059 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6060 : AMDGPU::V_RNDNE_F16_fake16_e64;
6061 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6062 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6063 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6064 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6065 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6066 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6067 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6068 case AMDGPU::S_ADD_F16:
6069 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6070 : AMDGPU::V_ADD_F16_fake16_e64;
6071 case AMDGPU::S_SUB_F16:
6072 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6073 : AMDGPU::V_SUB_F16_fake16_e64;
6074 case AMDGPU::S_MIN_F16:
6075 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6076 : AMDGPU::V_MIN_F16_fake16_e64;
6077 case AMDGPU::S_MAX_F16:
6078 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6079 : AMDGPU::V_MAX_F16_fake16_e64;
6080 case AMDGPU::S_MINIMUM_F16:
6081 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6082 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6083 case AMDGPU::S_MAXIMUM_F16:
6084 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6085 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6086 case AMDGPU::S_MUL_F16:
6087 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6088 : AMDGPU::V_MUL_F16_fake16_e64;
6089 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6090 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6091 case AMDGPU::S_FMAC_F16:
6092 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6093 : AMDGPU::V_FMAC_F16_fake16_e64;
6094 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6095 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6096 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6097 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6098 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6099 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6100 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6101 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6102 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6103 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6104 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6105 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6106 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6107 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6108 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6109 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6110 case AMDGPU::S_CMP_LT_F16:
6111 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6112 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6113 case AMDGPU::S_CMP_EQ_F16:
6114 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6115 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6116 case AMDGPU::S_CMP_LE_F16:
6117 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6118 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6119 case AMDGPU::S_CMP_GT_F16:
6120 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6121 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6122 case AMDGPU::S_CMP_LG_F16:
6123 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6124 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6125 case AMDGPU::S_CMP_GE_F16:
6126 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6127 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6128 case AMDGPU::S_CMP_O_F16:
6129 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6130 : AMDGPU::V_CMP_O_F16_fake16_e64;
6131 case AMDGPU::S_CMP_U_F16:
6132 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6133 : AMDGPU::V_CMP_U_F16_fake16_e64;
6134 case AMDGPU::S_CMP_NGE_F16:
6135 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6136 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6137 case AMDGPU::S_CMP_NLG_F16:
6138 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6139 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6140 case AMDGPU::S_CMP_NGT_F16:
6141 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6142 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6143 case AMDGPU::S_CMP_NLE_F16:
6144 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6145 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6146 case AMDGPU::S_CMP_NEQ_F16:
6147 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6148 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6149 case AMDGPU::S_CMP_NLT_F16:
6150 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6151 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6152 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6153 case AMDGPU::V_S_EXP_F16_e64:
6154 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6155 : AMDGPU::V_EXP_F16_fake16_e64;
6156 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6157 case AMDGPU::V_S_LOG_F16_e64:
6158 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6159 : AMDGPU::V_LOG_F16_fake16_e64;
6160 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6161 case AMDGPU::V_S_RCP_F16_e64:
6162 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6163 : AMDGPU::V_RCP_F16_fake16_e64;
6164 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6165 case AMDGPU::V_S_RSQ_F16_e64:
6166 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6167 : AMDGPU::V_RSQ_F16_fake16_e64;
6168 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6169 case AMDGPU::V_S_SQRT_F16_e64:
6170 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6171 : AMDGPU::V_SQRT_F16_fake16_e64;
6172 }
6174 "Unexpected scalar opcode without corresponding vector one!");
6175}
// clang-format on

6182 const DebugLoc &DL, Register Reg,
6183 bool IsSCCLive,
6184 SlotIndexes *Indexes) const {
6185 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6186 const SIInstrInfo *TII = ST.getInstrInfo();
6188 if (IsSCCLive) {
6189 // Insert two move instructions, one to save the original value of EXEC and
6190 // the other to turn on all bits in EXEC. This is required as we can't use
6191 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6192 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6194 auto FlipExecMI =
6195 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6196 if (Indexes) {
6197 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6198 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6199 }
6200 } else {
6201 auto SaveExec =
6202 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6203 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6204 if (Indexes)
6205 Indexes->insertMachineInstrInMaps(*SaveExec);
6206 }
6207}
6208
6211 const DebugLoc &DL, Register Reg,
6212 SlotIndexes *Indexes) const {
6214 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6215 .addReg(Reg, RegState::Kill);
6216 if (Indexes)
6217 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6218}
6219
6223 "Not a whole wave func");
6224 MachineBasicBlock &MBB = *MF.begin();
6225 for (MachineInstr &MI : MBB)
6226 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6227 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6228 return &MI;
6229
6230 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6231}
6232
6234 unsigned OpNo) const {
6235 const MCInstrDesc &Desc = get(MI.getOpcode());
6236 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6237 Desc.operands()[OpNo].RegClass == -1) {
6238 Register Reg = MI.getOperand(OpNo).getReg();
6239
6240 if (Reg.isVirtual()) {
6241 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6242 return MRI.getRegClass(Reg);
6243 }
6244 return RI.getPhysRegBaseClass(Reg);
6245 }
6246
6247 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6248 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6249}
6250
6253 MachineBasicBlock *MBB = MI.getParent();
6254 MachineOperand &MO = MI.getOperand(OpIdx);
6255 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6256 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6257 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6258 unsigned Size = RI.getRegSizeInBits(*RC);
6259 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6260 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6261 : AMDGPU::V_MOV_B32_e32;
6262 if (MO.isReg())
6263 Opcode = AMDGPU::COPY;
6264 else if (RI.isSGPRClass(RC))
6265 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6266
6267 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6268 Register Reg = MRI.createVirtualRegister(VRC);
6269 DebugLoc DL = MBB->findDebugLoc(I);
6270 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6271 MO.ChangeToRegister(Reg, false);
6272}
6273
6276 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6277 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6278 if (!SuperReg.getReg().isVirtual())
6279 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6280
6281 MachineBasicBlock *MBB = MI->getParent();
6282 const DebugLoc &DL = MI->getDebugLoc();
6283 Register SubReg = MRI.createVirtualRegister(SubRC);
6284
6285 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6286 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6287 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6288 return SubReg;
6289}
6290
6293 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6294 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6295 if (Op.isImm()) {
6296 if (SubIdx == AMDGPU::sub0)
6297 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6298 if (SubIdx == AMDGPU::sub1)
6299 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6300
6301 llvm_unreachable("Unhandled register index for immediate");
6302 }
6303
6304 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6305 SubIdx, SubRC);
6306 return MachineOperand::CreateReg(SubReg, false);
6307}
6308
6309// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6310void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6311 assert(Inst.getNumExplicitOperands() == 3);
6312 MachineOperand Op1 = Inst.getOperand(1);
6313 Inst.removeOperand(1);
6314 Inst.addOperand(Op1);
6315}
6316
6318 const MCOperandInfo &OpInfo,
6319 const MachineOperand &MO) const {
6320 if (!MO.isReg())
6321 return false;
6322
6323 Register Reg = MO.getReg();
6324
6325 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6326 if (Reg.isPhysical())
6327 return DRC->contains(Reg);
6328
6329 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6330
6331 if (MO.getSubReg()) {
6332 const MachineFunction *MF = MO.getParent()->getMF();
6333 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6334 if (!SuperRC)
6335 return false;
6336 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6337 }
6338
6339 return RI.getCommonSubClass(DRC, RC) != nullptr;
6340}
6341
6343 const MachineOperand &MO) const {
6344 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6345 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6346 unsigned Opc = MI.getOpcode();
6347
6348 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6349 // information.
6350 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6351 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6352 constexpr AMDGPU::OpName OpNames[] = {
6353 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6354
6355 for (auto [I, OpName] : enumerate(OpNames)) {
6356 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6357 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6359 return false;
6360 }
6361 }
6362
6363 if (!isLegalRegOperand(MRI, OpInfo, MO))
6364 return false;
6365
6366 // check Accumulate GPR operand
6367 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6368 if (IsAGPR && !ST.hasMAIInsts())
6369 return false;
6370 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6371 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6372 return false;
6373 // Atomics should have both vdst and vdata either vgpr or agpr.
6374 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6375 const int DataIdx = AMDGPU::getNamedOperandIdx(
6376 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6377 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6378 MI.getOperand(DataIdx).isReg() &&
6379 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6380 return false;
6381 if ((int)OpIdx == DataIdx) {
6382 if (VDstIdx != -1 &&
6383 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6384 return false;
6385 // DS instructions with 2 src operands also must have tied RC.
6386 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6387 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6388 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6389 return false;
6390 }
6391
6392 // Check V_ACCVGPR_WRITE_B32_e64
6393 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6394 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6395 RI.isSGPRReg(MRI, MO.getReg()))
6396 return false;
6397
6398 if (ST.hasFlatScratchHiInB64InstHazard() &&
6399 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6400 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6401 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6402 64)
6403 return false;
6404 }
6405 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6406 return false;
6407 }
6408
6409 return true;
6410}
6411
6413 const MCOperandInfo &OpInfo,
6414 const MachineOperand &MO) const {
6415 if (MO.isReg())
6416 return isLegalRegOperand(MRI, OpInfo, MO);
6417
6418 // Handle non-register types that are treated like immediates.
6419 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6420 return true;
6421}
6422
6424 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6425 const MachineOperand *MO) const {
6426 constexpr unsigned NumOps = 3;
6427 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6428 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6429 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6430 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6431
6432 assert(SrcN < NumOps);
6433
6434 if (!MO) {
6435 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6436 if (SrcIdx == -1)
6437 return true;
6438 MO = &MI.getOperand(SrcIdx);
6439 }
6440
6441 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6442 return true;
6443
6444 int ModsIdx =
6445 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6446 if (ModsIdx == -1)
6447 return true;
6448
6449 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6450 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6451 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6452
6453 return !OpSel && !OpSelHi;
6454}
6455
6457 const MachineOperand *MO) const {
6458 const MachineFunction &MF = *MI.getMF();
6459 const MachineRegisterInfo &MRI = MF.getRegInfo();
6460 const MCInstrDesc &InstDesc = MI.getDesc();
6461 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6462 int64_t RegClass = getOpRegClassID(OpInfo);
6463 const TargetRegisterClass *DefinedRC =
6464 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6465 if (!MO)
6466 MO = &MI.getOperand(OpIdx);
6467
6468 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6469
6470 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6471 const MachineOperand *UsedLiteral = nullptr;
6472
6473 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6474 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6475
6476 // TODO: Be more permissive with frame indexes.
6477 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6478 if (!LiteralLimit--)
6479 return false;
6480
6481 UsedLiteral = MO;
6482 }
6483
6485 if (MO->isReg())
6486 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6487
6488 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6489 if (i == OpIdx)
6490 continue;
6491 const MachineOperand &Op = MI.getOperand(i);
6492 if (Op.isReg()) {
6493 if (Op.isUse()) {
6494 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6495 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6496 if (--ConstantBusLimit <= 0)
6497 return false;
6498 }
6499 }
6500 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6501 !isInlineConstant(Op, InstDesc.operands()[i])) {
6502 // The same literal may be used multiple times.
6503 if (!UsedLiteral)
6504 UsedLiteral = &Op;
6505 else if (UsedLiteral->isIdenticalTo(Op))
6506 continue;
6507
6508 if (!LiteralLimit--)
6509 return false;
6510 if (--ConstantBusLimit <= 0)
6511 return false;
6512 }
6513 }
6514 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6515 // There can be at most one literal operand, but it can be repeated.
6516 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6517 if (i == OpIdx)
6518 continue;
6519 const MachineOperand &Op = MI.getOperand(i);
6520 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6521 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6522 !Op.isIdenticalTo(*MO))
6523 return false;
6524
6525 // Do not fold a non-inlineable and non-register operand into an
6526 // instruction that already has a frame index. The frame index handling
6527 // code could not handle well when a frame index co-exists with another
6528 // non-register operand, unless that operand is an inlineable immediate.
6529 if (Op.isFI())
6530 return false;
6531 }
6532 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6533 isF16PseudoScalarTrans(MI.getOpcode())) {
6534 return false;
6535 }
6536
6537 if (MO->isReg()) {
6538 if (!DefinedRC)
6539 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6540 return isLegalRegOperand(MI, OpIdx, *MO);
6541 }
6542
6543 if (MO->isImm()) {
6544 uint64_t Imm = MO->getImm();
6545 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6546 bool Is64BitOp = Is64BitFPOp ||
6547 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6548 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6549 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6550 if (Is64BitOp &&
6551 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6552 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6553 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6554 return false;
6555
6556 // FIXME: We can use sign extended 64-bit literals, but only for signed
6557 // operands. At the moment we do not know if an operand is signed.
6558 // Such operand will be encoded as its low 32 bits and then either
6559 // correctly sign extended or incorrectly zero extended by HW.
6560 // If 64-bit literals are supported and the literal will be encoded
6561 // as full 64 bit we still can use it.
6562 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6563 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6564 return false;
6565 }
6566 }
6567
6568 // Handle non-register types that are treated like immediates.
6569 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6570
6571 if (!DefinedRC) {
6572 // This operand expects an immediate.
6573 return true;
6574 }
6575
6576 return isImmOperandLegal(MI, OpIdx, *MO);
6577}
6578
6580 bool IsGFX950Only = ST.hasGFX950Insts();
6581 bool IsGFX940Only = ST.hasGFX940Insts();
6582
6583 if (!IsGFX950Only && !IsGFX940Only)
6584 return false;
6585
6586 if (!isVALU(MI))
6587 return false;
6588
6589 // V_COS, V_EXP, V_RCP, etc.
6590 if (isTRANS(MI))
6591 return true;
6592
6593 // DOT2, DOT2C, DOT4, etc.
6594 if (isDOT(MI))
6595 return true;
6596
6597 // MFMA, SMFMA
6598 if (isMFMA(MI))
6599 return true;
6600
6601 unsigned Opcode = MI.getOpcode();
6602 switch (Opcode) {
6603 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6604 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6605 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6606 case AMDGPU::V_MQSAD_U32_U8_e64:
6607 case AMDGPU::V_PK_ADD_F16:
6608 case AMDGPU::V_PK_ADD_F32:
6609 case AMDGPU::V_PK_ADD_I16:
6610 case AMDGPU::V_PK_ADD_U16:
6611 case AMDGPU::V_PK_ASHRREV_I16:
6612 case AMDGPU::V_PK_FMA_F16:
6613 case AMDGPU::V_PK_FMA_F32:
6614 case AMDGPU::V_PK_FMAC_F16_e32:
6615 case AMDGPU::V_PK_FMAC_F16_e64:
6616 case AMDGPU::V_PK_LSHLREV_B16:
6617 case AMDGPU::V_PK_LSHRREV_B16:
6618 case AMDGPU::V_PK_MAD_I16:
6619 case AMDGPU::V_PK_MAD_U16:
6620 case AMDGPU::V_PK_MAX_F16:
6621 case AMDGPU::V_PK_MAX_I16:
6622 case AMDGPU::V_PK_MAX_U16:
6623 case AMDGPU::V_PK_MIN_F16:
6624 case AMDGPU::V_PK_MIN_I16:
6625 case AMDGPU::V_PK_MIN_U16:
6626 case AMDGPU::V_PK_MOV_B32:
6627 case AMDGPU::V_PK_MUL_F16:
6628 case AMDGPU::V_PK_MUL_F32:
6629 case AMDGPU::V_PK_MUL_LO_U16:
6630 case AMDGPU::V_PK_SUB_I16:
6631 case AMDGPU::V_PK_SUB_U16:
6632 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6633 return true;
6634 default:
6635 return false;
6636 }
6637}
6638
6640 MachineInstr &MI) const {
6641 unsigned Opc = MI.getOpcode();
6642 const MCInstrDesc &InstrDesc = get(Opc);
6643
6644 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6645 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6646
6647 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6648 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6649
6650 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6651 // we need to only have one constant bus use before GFX10.
6652 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6653 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6654 RI.isSGPRReg(MRI, Src0.getReg()))
6655 legalizeOpWithMove(MI, Src0Idx);
6656
6657 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6658 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6659 // src0/src1 with V_READFIRSTLANE.
6660 if (Opc == AMDGPU::V_WRITELANE_B32) {
6661 const DebugLoc &DL = MI.getDebugLoc();
6662 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6663 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6664 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6665 .add(Src0);
6666 Src0.ChangeToRegister(Reg, false);
6667 }
6668 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6669 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6670 const DebugLoc &DL = MI.getDebugLoc();
6671 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6672 .add(Src1);
6673 Src1.ChangeToRegister(Reg, false);
6674 }
6675 return;
6676 }
6677
6678 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6679 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6680 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6681 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6682 legalizeOpWithMove(MI, Src2Idx);
6683 }
6684
6685 // VOP2 src0 instructions support all operand types, so we don't need to check
6686 // their legality. If src1 is already legal, we don't need to do anything.
6687 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6688 return;
6689
6690 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6691 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6692 // select is uniform.
6693 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6694 RI.isVGPR(MRI, Src1.getReg())) {
6695 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6696 const DebugLoc &DL = MI.getDebugLoc();
6697 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6698 .add(Src1);
6699 Src1.ChangeToRegister(Reg, false);
6700 return;
6701 }
6702
6703 // We do not use commuteInstruction here because it is too aggressive and will
6704 // commute if it is possible. We only want to commute here if it improves
6705 // legality. This can be called a fairly large number of times so don't waste
6706 // compile time pointlessly swapping and checking legality again.
6707 if (HasImplicitSGPR || !MI.isCommutable()) {
6708 legalizeOpWithMove(MI, Src1Idx);
6709 return;
6710 }
6711
6712 // If src0 can be used as src1, commuting will make the operands legal.
6713 // Otherwise we have to give up and insert a move.
6714 //
6715 // TODO: Other immediate-like operand kinds could be commuted if there was a
6716 // MachineOperand::ChangeTo* for them.
6717 if ((!Src1.isImm() && !Src1.isReg()) ||
6718 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6719 legalizeOpWithMove(MI, Src1Idx);
6720 return;
6721 }
6722
6723 int CommutedOpc = commuteOpcode(MI);
6724 if (CommutedOpc == -1) {
6725 legalizeOpWithMove(MI, Src1Idx);
6726 return;
6727 }
6728
6729 MI.setDesc(get(CommutedOpc));
6730
6731 Register Src0Reg = Src0.getReg();
6732 unsigned Src0SubReg = Src0.getSubReg();
6733 bool Src0Kill = Src0.isKill();
6734
6735 if (Src1.isImm())
6736 Src0.ChangeToImmediate(Src1.getImm());
6737 else if (Src1.isReg()) {
6738 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6739 Src0.setSubReg(Src1.getSubReg());
6740 } else
6741 llvm_unreachable("Should only have register or immediate operands");
6742
6743 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6744 Src1.setSubReg(Src0SubReg);
6746}
6747
6748// Legalize VOP3 operands. All operand types are supported for any operand
6749// but only one literal constant and only starting from GFX10.
 6751 MachineInstr &MI) const {
 6752 unsigned Opc = MI.getOpcode();
 6753
  // Operand indices of src0/src1/src2; getNamedOperandIdx returns -1 for
  // operands this opcode does not have (the scan below stops at the first -1).
 6754 int VOP3Idx[3] = {
 6755 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
 6756 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
 6757 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
 6758 };
 6759
  // The V_PERMLANE* family requires scalar src1 (and src2 when present):
  // force non-SGPR registers into SGPRs with V_READFIRSTLANE_B32.
 6760 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
 6761 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
 6762 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
 6763 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
 6764 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
 6765 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
 6766 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
 6767 // src1 and src2 must be scalar
 6768 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
 6769 const DebugLoc &DL = MI.getDebugLoc();
 6770 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
 6771 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 6772 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
 6773 .add(Src1);
 6774 Src1.ChangeToRegister(Reg, false);
 6775 }
 6776 if (VOP3Idx[2] != -1) {
 6777 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
 6778 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
 6779 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 6780 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
 6781 .add(Src2);
 6782 Src2.ChangeToRegister(Reg, false);
 6783 }
 6784 }
 6785 }
 6786
 6787 // Find the one SGPR operand we are allowed to use.
 6788 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
 6789 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
 6790 SmallDenseSet<unsigned> SGPRsUsed;
  // If an SGPR is already in use among the sources, it pre-consumes one
  // constant-bus slot; repeated uses of the same SGPR are free (see below).
 6791 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
 6792 if (SGPRReg) {
 6793 SGPRsUsed.insert(SGPRReg);
 6794 --ConstantBusLimit;
 6795 }
 6796
 6797 for (int Idx : VOP3Idx) {
 6798 if (Idx == -1)
 6799 break;
 6800 MachineOperand &MO = MI.getOperand(Idx);
 6801
 6802 if (!MO.isReg()) {
  // Immediate-like operand. Inline constants are always legal and do not
  // count against the literal or constant-bus budgets.
 6803 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
 6804 continue;
 6805
  // Spend one literal slot and one constant-bus slot if both remain.
 6806 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
 6807 --LiteralLimit;
 6808 --ConstantBusLimit;
 6809 continue;
 6810 }
 6811
  // Out of budget: materialize the constant with a move instead.
 6812 --LiteralLimit;
 6813 --ConstantBusLimit;
 6814 legalizeOpWithMove(MI, Idx);
 6815 continue;
 6816 }
 6817
 6818 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
 6819 continue; // VGPRs are legal
 6820
 6821 // We can use one SGPR in each VOP3 instruction prior to GFX10
 6822 // and two starting from GFX10.
 6823 if (SGPRsUsed.count(MO.getReg()))
 6824 continue;
 6825 if (ConstantBusLimit > 0) {
 6826 SGPRsUsed.insert(MO.getReg());
 6827 --ConstantBusLimit;
 6828 continue;
 6829 }
 6830
 6831 // If we make it this far, then the operand is not legal and we must
 6832 // legalize it.
 6833 legalizeOpWithMove(MI, Idx);
 6834 }
 6835
 6836 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
 6837 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
 6838 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
 6839 legalizeOpWithMove(MI, VOP3Idx[2]);
 6840
 6841 // Fix the register class of packed FP32 instructions on gfx12+. See
 6842 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
 6844 for (unsigned I = 0; I < 3; ++I) {
 6845 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
 6846 legalizeOpWithMove(MI, VOP3Idx[I]);
 6847 }
 6848 }
 6849}
6850
// Copy the (assumed uniform) value in \p SrcReg into a fresh SGPR of the
// equivalent scalar register class, emitting one V_READFIRSTLANE_B32 per
// 32-bit channel before \p UseMI. If \p DstRC is given, the result class is
// narrowed to the common subclass. Returns the new SGPR virtual register.
 6853 const TargetRegisterClass *DstRC /*=nullptr*/) const {
 6854 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
 6855 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
 6856 if (DstRC)
 6857 SRC = RI.getCommonSubClass(SRC, DstRC);
 6858
 6859 Register DstReg = MRI.createVirtualRegister(SRC);
 6860 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
 6861
  // V_READFIRSTLANE_B32 takes a VGPR source, so an AGPR value is first
  // copied into an equivalent VGPR class.
 6862 if (RI.hasAGPRs(VRC)) {
 6863 VRC = RI.getEquivalentVGPRClass(VRC);
 6864 Register NewSrcReg = MRI.createVirtualRegister(VRC);
 6865 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
 6866 get(TargetOpcode::COPY), NewSrcReg)
 6867 .addReg(SrcReg);
 6868 SrcReg = NewSrcReg;
 6869 }
 6870
  // Single 32-bit value: one readfirstlane suffices.
 6871 if (SubRegs == 1) {
 6872 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
 6873 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
 6874 .addReg(SrcReg);
 6875 return DstReg;
 6876 }
 6877
  // Wider value: read each 32-bit channel into its own SGPR...
 6879 for (unsigned i = 0; i < SubRegs; ++i) {
 6880 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
 6881 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
 6882 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
 6883 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
 6884 SRegs.push_back(SGPR);
 6885 }
 6886
  // ...then reassemble the pieces with a REG_SEQUENCE.
 6888 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
 6889 get(AMDGPU::REG_SEQUENCE), DstReg);
 6890 for (unsigned i = 0; i < SubRegs; ++i) {
 6891 MIB.addReg(SRegs[i]);
 6892 MIB.addImm(RI.getSubRegFromChannel(i));
 6893 }
 6894 return DstReg;
 6895}
6896
// Legalize SMRD operands: sbase and soffset must be SGPRs; any that live in
// vector registers are read into SGPRs with readlaneVGPRToSGPR.
 6898 MachineInstr &MI) const {
 6899
 6900 // If the pointer is store in VGPRs, then we need to move them to
 6901 // SGPRs using v_readfirstlane. This is safe because we only select
 6902 // loads with uniform pointers to SMRD instruction so we know the
 6903 // pointer value is uniform.
 6904 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
 6905 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
 6906 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
 6907 SBase->setReg(SGPR);
 6908 }
 6909 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
 6910 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
  // The scalar offset gets the same treatment as the base pointer.
 6911 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
 6912 SOff->setReg(SGPR);
 6913 }
 6914}
6915
// Try to rewrite a FLAT instruction whose saddr operand does not hold an SGPR
// into the equivalent vaddr-form opcode, moving the address operand from the
// saddr slot to the vaddr slot in place (so callers' iterators stay valid).
// Returns true iff the instruction was rewritten.
 6917 unsigned Opc = Inst.getOpcode();
 6918 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
 6919 if (OldSAddrIdx < 0)
 6920 return false;
 6921
 6922 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
 6923
 6924 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
 6925 if (NewOpc < 0)
 6927 if (NewOpc < 0)
 6928 return false;
 6929
  // Nothing to do when saddr already holds an SGPR.
 6930 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
 6931 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
 6932 if (RI.isSGPRReg(MRI, SAddr.getReg()))
 6933 return false;
 6934
 6935 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
 6936 if (NewVAddrIdx < 0)
 6937 return false;
 6938
 6939 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
 6940
 6941 // Check vaddr, it shall be zero or absent.
 6942 MachineInstr *VAddrDef = nullptr;
 6943 if (OldVAddrIdx >= 0) {
 6944 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
 6945 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
 6946 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
 6947 !VAddrDef->getOperand(1).isImm() ||
 6948 VAddrDef->getOperand(1).getImm() != 0)
 6949 return false;
 6950 }
 6951
 6952 const MCInstrDesc &NewDesc = get(NewOpc);
 6953 Inst.setDesc(NewDesc);
 6954
 6955 // Callers expect iterator to be valid after this call, so modify the
 6956 // instruction in place.
 6957 if (OldVAddrIdx == NewVAddrIdx) {
 6958 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
 6959 // Clear use list from the old vaddr holding a zero register.
 6960 MRI.removeRegOperandFromUseList(&NewVAddr);
 6961 MRI.moveOperands(&NewVAddr, &SAddr, 1);
 6962 Inst.removeOperand(OldSAddrIdx);
 6963 // Update the use list with the pointer we have just moved from vaddr to
 6964 // saddr position. Otherwise new vaddr will be missing from the use list.
 6965 MRI.removeRegOperandFromUseList(&NewVAddr);
 6966 MRI.addRegOperandToUseList(&NewVAddr);
 6967 } else {
 6968 assert(OldSAddrIdx == NewVAddrIdx);
 6969
 6970 if (OldVAddrIdx >= 0) {
 6971 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
 6972 AMDGPU::OpName::vdst_in);
 6973
 6974 // removeOperand doesn't try to fixup tied operand indexes at it goes, so
 6975 // it asserts. Untie the operands for now and retie them afterwards.
 6976 if (NewVDstIn != -1) {
 6977 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
 6978 Inst.untieRegOperand(OldVDstIn);
 6979 }
 6980
 6981 Inst.removeOperand(OldVAddrIdx);
 6982
 6983 if (NewVDstIn != -1) {
 6984 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
 6985 Inst.tieOperands(NewVDst, NewVDstIn);
 6986 }
 6987 }
 6988 }
 6989
  // The zero-materializing move that fed the old vaddr may now be dead.
 6990 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
 6991 VAddrDef->eraseFromParent();
 6992
 6993 return true;
 6994}
6995
6996// FIXME: Remove this when SelectionDAG is obsoleted.
// Legalize the saddr operand of a FLAT instruction: if it lives in a vector
// register, read it back into an SGPR of the operand's declared class.
 6998 MachineInstr &MI) const {
 6999 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
 7000 return;
 7001
 7002 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
 7003 // thinks they are uniform, so a readfirstlane should be valid.
 7004 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
 7005 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
 7006 return;
 7007
  // NOTE(review): the guard for this early return is not visible in this
  // excerpt; presumably the address can sometimes be moved to the vaddr slot
  // instead (see moveFlatAddrToVGPR above) — confirm against upstream.
 7009 return;
 7010
 7011 const TargetRegisterClass *DeclaredRC =
 7012 getRegClass(MI.getDesc(), SAddr->getOperandNo());
 7013
 7014 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
 7015 SAddr->setReg(ToSGPR);
 7016}
7017
// Rewrite \p Op to use a new virtual register of class \p DstRC, inserting a
// COPY from the old register before iterator \p I in \p InsertMBB. Attempts
// to fold an immediate-defining source straight into the copy, and marks the
// copy with an implicit EXEC use when it will become a vector move.
 7020 const TargetRegisterClass *DstRC,
 7023 const DebugLoc &DL) const {
 7024 Register OpReg = Op.getReg();
 7025 unsigned OpSubReg = Op.getSubReg();
 7026
 7027 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
 7028 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
 7029
 7030 // Check if operand is already the correct register class.
 7031 if (DstRC == OpRC)
 7032 return;
 7033
 7034 Register DstReg = MRI.createVirtualRegister(DstRC);
 7035 auto Copy =
 7036 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
 7037 Op.setReg(DstReg);
 7038
 7039 MachineInstr *Def = MRI.getVRegDef(OpReg);
 7040 if (!Def)
 7041 return;
 7042
 7043 // Try to eliminate the copy if it is copying an immediate value.
 7044 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
 7045 foldImmediate(*Copy, *Def, OpReg, &MRI);
 7046
  // Walk back through a chain of virtual-register copies to see whether the
  // ultimate source is an IMPLICIT_DEF (in which case no EXEC read is added).
 7047 bool ImpDef = Def->isImplicitDef();
 7048 while (!ImpDef && Def && Def->isCopy()) {
 7049 if (Def->getOperand(1).getReg().isPhysical())
 7050 break;
 7051 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
 7052 ImpDef = Def && Def->isImplicitDef();
 7053 }
  // A copy into a non-SGPR class will be lowered to a VALU move, which is
  // predicated on EXEC — record that implicit read if not already present.
 7054 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
 7055 !ImpDef)
 7056 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
 7057}
7058
7059// Emit the actual waterfall loop, executing the wrapped instruction for each
7060// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7061// iteration, in the worst case we execute 64 (once per lane).
7062static void
 7065 MachineBasicBlock &LoopBB,
 7066 MachineBasicBlock &BodyBB,
 7067 const DebugLoc &DL,
 7068 ArrayRef<MachineOperand *> ScalarOps) {
 7069 MachineFunction &MF = *LoopBB.getParent();
 7070 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 7071 const SIRegisterInfo *TRI = ST.getRegisterInfo();
 7073 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
 7074
  // Combined lanemask: lanes whose values match the first active lane's
  // values for ALL scalar operands execute this loop iteration.
 7076 Register CondReg;
 7077
 7078 for (MachineOperand *ScalarOp : ScalarOps) {
 7079 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
 7080 unsigned NumSubRegs = RegSize / 32;
 7081 Register VScalarOp = ScalarOp->getReg();
 7082
 7083 if (NumSubRegs == 1) {
  // 32-bit operand: read the first active lane's value and compare it
  // against every lane's value.
 7084 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 7085
 7086 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
 7087 .addReg(VScalarOp);
 7088
 7089 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
 7090
 7091 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
 7092 .addReg(CurReg)
 7093 .addReg(VScalarOp);
 7094
 7095 // Combine the comparison results with AND.
 7096 if (!CondReg) // First.
 7097 CondReg = NewCondReg;
 7098 else { // If not the first, we create an AND.
 7099 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
 7100 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
 7101 .addReg(CondReg)
 7102 .addReg(NewCondReg);
 7103 CondReg = AndReg;
 7104 }
 7105
 7106 // Update ScalarOp operand to use the SGPR ScalarOp.
 7107 ScalarOp->setReg(CurReg);
 7108 ScalarOp->setIsKill();
 7109 } else {
  // Wider operand: read two 32-bit pieces per step and compare 64 bits at
  // a time, accumulating the per-pair conditions into CondReg.
 7110 SmallVector<Register, 8> ReadlanePieces;
 7111 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
 7112 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
 7113 "Unhandled register size");
 7114
 7115 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
 7116 Register CurRegLo =
 7117 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 7118 Register CurRegHi =
 7119 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 7120
 7121 // Read the next variant <- also loop target.
 7122 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
 7123 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
 7124
 7125 // Read the next variant <- also loop target.
 7126 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
 7127 .addReg(VScalarOp, VScalarOpUndef,
 7128 TRI->getSubRegFromChannel(Idx + 1));
 7129
 7130 ReadlanePieces.push_back(CurRegLo);
 7131 ReadlanePieces.push_back(CurRegHi);
 7132
 7133 // Comparison is to be done as 64-bit.
 7134 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
 7135 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
 7136 .addReg(CurRegLo)
 7137 .addImm(AMDGPU::sub0)
 7138 .addReg(CurRegHi)
 7139 .addImm(AMDGPU::sub1);
 7140
 7141 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
 7142 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
 7143 NewCondReg)
 7144 .addReg(CurReg);
 7145 if (NumSubRegs <= 2)
 7146 Cmp.addReg(VScalarOp);
 7147 else
 7148 Cmp.addReg(VScalarOp, VScalarOpUndef,
 7149 TRI->getSubRegFromChannel(Idx, 2));
 7150
 7151 // Combine the comparison results with AND.
 7152 if (!CondReg) // First.
 7153 CondReg = NewCondReg;
 7154 else { // If not the first, we create an AND.
 7155 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
 7156 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
 7157 .addReg(CondReg)
 7158 .addReg(NewCondReg);
 7159 CondReg = AndReg;
 7160 }
 7161 } // End for loop.
 7162
 7163 const auto *SScalarOpRC =
 7164 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
 7165 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
 7166
 7167 // Build scalar ScalarOp.
 7168 auto Merge =
 7169 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
 7170 unsigned Channel = 0;
 7171 for (Register Piece : ReadlanePieces) {
 7172 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
 7173 }
 7174
 7175 // Update ScalarOp operand to use the SGPR ScalarOp.
 7176 ScalarOp->setReg(SScalarOp);
 7177 ScalarOp->setIsKill();
 7178 }
 7179 }
 7180
 7181 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  // Hint the register allocator to assign SaveExec and CondReg together.
 7182 MRI.setSimpleHint(SaveExec, CondReg);
 7183
 7184 // Update EXEC to matching lanes, saving original to SaveExec.
 7185 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
 7186 .addReg(CondReg, RegState::Kill);
 7187
 7188 // The original instruction is here; we insert the terminators after it.
 7189 I = BodyBB.end();
 7190
 7191 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
 7192 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
 7193 .addReg(LMC.ExecReg)
 7194 .addReg(SaveExec);
 7195
  // Loop back to LoopBB while any lane still has a pending (unmatched) value.
 7196 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
 7197}
7198
7199// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7200// with SGPRs by iterating over all unique values across all lanes.
7201// Returns the loop basic block that now contains \p MI.
7202static MachineBasicBlock *
 7206 MachineBasicBlock::iterator Begin = nullptr,
 7207 MachineBasicBlock::iterator End = nullptr) {
 7208 MachineBasicBlock &MBB = *MI.getParent();
 7209 MachineFunction &MF = *MBB.getParent();
 7210 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 7211 const SIRegisterInfo *TRI = ST.getRegisterInfo();
 7212 MachineRegisterInfo &MRI = MF.getRegInfo();
  // By default the waterfalled region is just MI itself; callers may widen it
  // by passing an explicit [Begin, End) range.
 7213 if (!Begin.isValid())
 7214 Begin = &MI;
 7215 if (!End.isValid()) {
 7216 End = &MI;
 7217 ++End;
 7218 }
 7219 const DebugLoc &DL = MI.getDebugLoc();
 7221 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
 7222
 7223 // Save SCC. Waterfall Loop may overwrite SCC.
 7224 Register SaveSCCReg;
 7225
 7226 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
 7227 // rather than unlimited scan everywhere
 7228 bool SCCNotDead =
 7229 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
 7230 std::numeric_limits<unsigned>::max()) !=
 7232 if (SCCNotDead) {
  // Materialize SCC as 1/0 in an SGPR so it can be re-established after the
  // loop (via S_CMP_LG_U32 below).
 7233 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
 7234 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
 7235 .addImm(1)
 7236 .addImm(0);
 7237 }
 7238
 7239 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
 7240
 7241 // Save the EXEC mask
 7242 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
 7243
 7244 // Killed uses in the instruction we are waterfalling around will be
 7245 // incorrect due to the added control-flow.
 7247 ++AfterMI;
 7248 for (auto I = Begin; I != AfterMI; I++) {
 7249 for (auto &MO : I->all_uses())
 7250 MRI.clearKillFlags(MO.getReg());
 7251 }
 7252
 7253 // To insert the loop we need to split the block. Move everything after this
 7254 // point to a new block, and insert a new empty block between the two.
 7257 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
 7259 ++MBBI;
 7260
 7261 MF.insert(MBBI, LoopBB);
 7262 MF.insert(MBBI, BodyBB);
 7263 MF.insert(MBBI, RemainderBB);
 7264
  // Resulting CFG: MBB -> LoopBB -> BodyBB -> {LoopBB, RemainderBB}.
 7265 LoopBB->addSuccessor(BodyBB);
 7266 BodyBB->addSuccessor(LoopBB);
 7267 BodyBB->addSuccessor(RemainderBB);
 7268
 7269 // Move Begin to MI to the BodyBB, and the remainder of the block to
 7270 // RemainderBB.
 7271 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
 7272 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
 7273 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
 7274
 7275 MBB.addSuccessor(LoopBB);
 7276
 7277 // Update dominators. We know that MBB immediately dominates LoopBB, that
 7278 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
 7279 // RemainderBB. RemainderBB immediately dominates all of the successors
 7280 // transferred to it from MBB that MBB used to properly dominate.
 7281 if (MDT) {
 7282 MDT->addNewBlock(LoopBB, &MBB);
 7283 MDT->addNewBlock(BodyBB, LoopBB);
 7284 MDT->addNewBlock(RemainderBB, BodyBB);
 7285 for (auto &Succ : RemainderBB->successors()) {
 7286 if (MDT->properlyDominates(&MBB, Succ)) {
 7287 MDT->changeImmediateDominator(Succ, RemainderBB);
 7288 }
 7289 }
 7290 }
 7291
 7292 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
 7293
 7294 MachineBasicBlock::iterator First = RemainderBB->begin();
 7295 // Restore SCC
 7296 if (SCCNotDead) {
 7297 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
 7298 .addReg(SaveSCCReg, RegState::Kill)
 7299 .addImm(0);
 7300 }
 7301
 7302 // Restore the EXEC mask
 7303 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
 7304 .addReg(SaveExec);
 7305 return BodyBB;
 7306}
7307
7308// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
// Returns {RsrcPtr, NewSRsrc}: the 64-bit base pointer taken from the first
// two dwords of \p Rsrc, and a new 128-bit resource descriptor whose base is
// zero and whose upper dwords hold the default data format.
7309static std::tuple<unsigned, unsigned>
 7311 MachineBasicBlock &MBB = *MI.getParent();
 7312 MachineFunction &MF = *MBB.getParent();
 7313 MachineRegisterInfo &MRI = MF.getRegInfo();
 7314
 7315 // Extract the ptr from the resource descriptor.
 7316 unsigned RsrcPtr =
 7317 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
 7318 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
 7319
 7320 // Create an empty resource descriptor
 7321 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
 7322 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
 7323 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
 7324 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
 7325 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
 7326
 7327 // Zero64 = 0
 7328 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
 7329 .addImm(0);
 7330
 7331 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
 7332 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
 7333 .addImm(Lo_32(RsrcDataFormat));
 7334
 7335 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
 7336 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
 7337 .addImm(Hi_32(RsrcDataFormat));
 7338
 7339 // NewSRsrc = {Zero64, SRsrcFormat}
 7340 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
 7341 .addReg(Zero64)
 7342 .addImm(AMDGPU::sub0_sub1)
 7343 .addReg(SRsrcFormatLo)
 7344 .addImm(AMDGPU::sub2)
 7345 .addReg(SRsrcFormatHi)
 7346 .addImm(AMDGPU::sub3);
 7347
 7348 return std::tuple(RsrcPtr, NewSRsrc);
 7349}
7350
7353 MachineDominatorTree *MDT) const {
7354 MachineFunction &MF = *MI.getMF();
7355 MachineRegisterInfo &MRI = MF.getRegInfo();
7356 MachineBasicBlock *CreatedBB = nullptr;
7357
7358 // Legalize VOP2
7359 if (isVOP2(MI) || isVOPC(MI)) {
7361 return CreatedBB;
7362 }
7363
7364 // Legalize VOP3
7365 if (isVOP3(MI)) {
7367 return CreatedBB;
7368 }
7369
7370 // Legalize SMRD
7371 if (isSMRD(MI)) {
7373 return CreatedBB;
7374 }
7375
7376 // Legalize FLAT
7377 if (isFLAT(MI)) {
7379 return CreatedBB;
7380 }
7381
7382 // Legalize PHI
7383 // The register class of the operands must be the same type as the register
7384 // class of the output.
7385 if (MI.getOpcode() == AMDGPU::PHI) {
7386 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7387 assert(!RI.isSGPRClass(VRC));
7388
7389 // Update all the operands so they have the same type.
7390 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7391 MachineOperand &Op = MI.getOperand(I);
7392 if (!Op.isReg() || !Op.getReg().isVirtual())
7393 continue;
7394
7395 // MI is a PHI instruction.
7396 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7398
7399 // Avoid creating no-op copies with the same src and dst reg class. These
7400 // confuse some of the machine passes.
7401 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7402 }
7403 }
7404
7405 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7406 // VGPR dest type and SGPR sources, insert copies so all operands are
7407 // VGPRs. This seems to help operand folding / the register coalescer.
7408 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7409 MachineBasicBlock *MBB = MI.getParent();
7410 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7411 if (RI.hasVGPRs(DstRC)) {
7412 // Update all the operands so they are VGPR register classes. These may
7413 // not be the same register class because REG_SEQUENCE supports mixing
7414 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7415 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7416 MachineOperand &Op = MI.getOperand(I);
7417 if (!Op.isReg() || !Op.getReg().isVirtual())
7418 continue;
7419
7420 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7421 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7422 if (VRC == OpRC)
7423 continue;
7424
7425 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7426 Op.setIsKill();
7427 }
7428 }
7429
7430 return CreatedBB;
7431 }
7432
7433 // Legalize INSERT_SUBREG
7434 // src0 must have the same register class as dst
7435 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7436 Register Dst = MI.getOperand(0).getReg();
7437 Register Src0 = MI.getOperand(1).getReg();
7438 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7439 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7440 if (DstRC != Src0RC) {
7441 MachineBasicBlock *MBB = MI.getParent();
7442 MachineOperand &Op = MI.getOperand(1);
7443 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7444 }
7445 return CreatedBB;
7446 }
7447
7448 // Legalize SI_INIT_M0
7449 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7450 MachineOperand &Src = MI.getOperand(0);
7451 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7452 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7453 return CreatedBB;
7454 }
7455
7456 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7457 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7458 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7459 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7460 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7461 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7462 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7463 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7464 MachineOperand &Src = MI.getOperand(1);
7465 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7466 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7467 return CreatedBB;
7468 }
7469
7470 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7471 //
7472 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7473 // scratch memory access. In both cases, the legalization never involves
7474 // conversion to the addr64 form.
7476 (isMUBUF(MI) || isMTBUF(MI)))) {
7477 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7478 ? AMDGPU::OpName::rsrc
7479 : AMDGPU::OpName::srsrc;
7480 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7481 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7482 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7483
7484 AMDGPU::OpName SampOpName =
7485 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7486 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7487 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7488 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7489
7490 return CreatedBB;
7491 }
7492
7493 // Legalize SI_CALL
7494 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7495 MachineOperand *Dest = &MI.getOperand(0);
7496 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7497 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
7498 // following copies, we also need to move copies from and to physical
7499 // registers into the loop block.
7500 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7501 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7502
7503 // Also move the copies to physical registers into the loop block
7504 MachineBasicBlock &MBB = *MI.getParent();
7506 while (Start->getOpcode() != FrameSetupOpcode)
7507 --Start;
7509 while (End->getOpcode() != FrameDestroyOpcode)
7510 ++End;
7511 // Also include following copies of the return value
7512 ++End;
7513 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7514 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7515 ++End;
7516 CreatedBB =
7517 loadScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7518 }
7519 }
7520
7521 // Legalize s_sleep_var.
7522 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7523 const DebugLoc &DL = MI.getDebugLoc();
7524 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7525 int Src0Idx =
7526 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7527 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7528 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7529 .add(Src0);
7530 Src0.ChangeToRegister(Reg, false);
7531 return nullptr;
7532 }
7533
7534 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7535 // operands are scalar.
7536 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7537 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7538 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7539 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7540 for (MachineOperand &Src : MI.explicit_operands()) {
7541 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7542 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7543 }
7544 return CreatedBB;
7545 }
7546
7547 // Legalize MUBUF instructions.
7548 bool isSoffsetLegal = true;
7549 int SoffsetIdx =
7550 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7551 if (SoffsetIdx != -1) {
7552 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7553 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7554 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7555 isSoffsetLegal = false;
7556 }
7557 }
7558
7559 bool isRsrcLegal = true;
7560 int RsrcIdx =
7561 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7562 if (RsrcIdx != -1) {
7563 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7564 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7565 isRsrcLegal = false;
7566 }
7567
7568 // The operands are legal.
7569 if (isRsrcLegal && isSoffsetLegal)
7570 return CreatedBB;
7571
7572 if (!isRsrcLegal) {
7573 // Legalize a VGPR Rsrc
7574 //
7575 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7576 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7577 // a zero-value SRsrc.
7578 //
7579 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7580 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7581 // above.
7582 //
7583 // Otherwise we are on non-ADDR64 hardware, and/or we have
7584 // idxen/offen/bothen and we fall back to a waterfall loop.
7585
7586 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7587 MachineBasicBlock &MBB = *MI.getParent();
7588
7589 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7590 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7591 // This is already an ADDR64 instruction so we need to add the pointer
7592 // extracted from the resource descriptor to the current value of VAddr.
7593 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7594 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7595 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7596
7597 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7598 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7599 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7600
7601 unsigned RsrcPtr, NewSRsrc;
7602 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7603
7604 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7605 const DebugLoc &DL = MI.getDebugLoc();
7606 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7607 .addDef(CondReg0)
7608 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7609 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7610 .addImm(0);
7611
7612 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7613 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7614 .addDef(CondReg1, RegState::Dead)
7615 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7616 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7617 .addReg(CondReg0, RegState::Kill)
7618 .addImm(0);
7619
7620 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7621 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7622 .addReg(NewVAddrLo)
7623 .addImm(AMDGPU::sub0)
7624 .addReg(NewVAddrHi)
7625 .addImm(AMDGPU::sub1);
7626
7627 VAddr->setReg(NewVAddr);
7628 Rsrc->setReg(NewSRsrc);
7629 } else if (!VAddr && ST.hasAddr64()) {
7630 // This instructions is the _OFFSET variant, so we need to convert it to
7631 // ADDR64.
7632 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7633 "FIXME: Need to emit flat atomics here");
7634
7635 unsigned RsrcPtr, NewSRsrc;
7636 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7637
7638 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7639 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7640 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7641 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7642 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7643
7644 // Atomics with return have an additional tied operand and are
7645 // missing some of the special bits.
7646 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7647 MachineInstr *Addr64;
7648
7649 if (!VDataIn) {
7650 // Regular buffer load / store.
7652 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7653 .add(*VData)
7654 .addReg(NewVAddr)
7655 .addReg(NewSRsrc)
7656 .add(*SOffset)
7657 .add(*Offset);
7658
7659 if (const MachineOperand *CPol =
7660 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7661 MIB.addImm(CPol->getImm());
7662 }
7663
7664 if (const MachineOperand *TFE =
7665 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7666 MIB.addImm(TFE->getImm());
7667 }
7668
7669 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7670
7671 MIB.cloneMemRefs(MI);
7672 Addr64 = MIB;
7673 } else {
7674 // Atomics with return.
7675 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7676 .add(*VData)
7677 .add(*VDataIn)
7678 .addReg(NewVAddr)
7679 .addReg(NewSRsrc)
7680 .add(*SOffset)
7681 .add(*Offset)
7682 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7683 .cloneMemRefs(MI);
7684 }
7685
7686 MI.removeFromParent();
7687
7688 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7689 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7690 NewVAddr)
7691 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7692 .addImm(AMDGPU::sub0)
7693 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7694 .addImm(AMDGPU::sub1);
7695 } else {
7696 // Legalize a VGPR Rsrc and soffset together.
7697 if (!isSoffsetLegal) {
7698 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7699 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7700 return CreatedBB;
7701 }
7702 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7703 return CreatedBB;
7704 }
7705 }
7706
7707 // Legalize a VGPR soffset.
7708 if (!isSoffsetLegal) {
7709 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7710 CreatedBB = loadScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7711 return CreatedBB;
7712 }
7713 return CreatedBB;
7714}
7715
7717 InstrList.insert(MI);
7718 // Add MBUF instructiosn to deferred list.
7719 int RsrcIdx =
7720 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7721 if (RsrcIdx != -1) {
7722 DeferredList.insert(MI);
7723 }
7724}
7725
7727 return DeferredList.contains(MI);
7728}
7729
7730// Legalize size mismatches between 16bit and 32bit registers in v2s copy
7731// lowering (change sgpr to vgpr).
7732// This is mainly caused by 16bit SALU and 16bit VALU using reg with different
7733// size. Need to legalize the size of the operands during the vgpr lowering
7734// chain. This can be removed after we have sgpr16 in place
7736 MachineRegisterInfo &MRI) const {
7737 if (!ST.useRealTrue16Insts())
7738 return;
7739
7740 unsigned Opcode = MI.getOpcode();
7741 MachineBasicBlock *MBB = MI.getParent();
7742 // Legalize operands and check for size mismatch
7743 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7744 OpIdx >= get(Opcode).getNumOperands() ||
7745 get(Opcode).operands()[OpIdx].RegClass == -1)
7746 return;
7747
7748 MachineOperand &Op = MI.getOperand(OpIdx);
7749 if (!Op.isReg() || !Op.getReg().isVirtual())
7750 return;
7751
7752 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7753 if (!RI.isVGPRClass(CurrRC))
7754 return;
7755
7756 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7757 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7758 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7759 Op.setSubReg(AMDGPU::lo16);
7760 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7761 const DebugLoc &DL = MI.getDebugLoc();
7762 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7763 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7764 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7765 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7766 .addReg(Op.getReg())
7767 .addImm(AMDGPU::lo16)
7768 .addReg(Undef)
7769 .addImm(AMDGPU::hi16);
7770 Op.setReg(NewDstReg);
7771 }
7772}
7774 MachineRegisterInfo &MRI) const {
7775 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7777}
7778
7780 MachineDominatorTree *MDT) const {
7781
7782 while (!Worklist.empty()) {
7783 MachineInstr &Inst = *Worklist.top();
7784 Worklist.erase_top();
7785 // Skip MachineInstr in the deferred list.
7786 if (Worklist.isDeferred(&Inst))
7787 continue;
7788 moveToVALUImpl(Worklist, MDT, Inst);
7789 }
7790
7791 // Deferred list of instructions will be processed once
7792 // all the MachineInstr in the worklist are done.
7793 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7794 moveToVALUImpl(Worklist, MDT, *Inst);
7795 assert(Worklist.empty() &&
7796 "Deferred MachineInstr are not supposed to re-populate worklist");
7797 }
7798}
7799
7802 MachineInstr &Inst) const {
7803
7805 if (!MBB)
7806 return;
7807 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7808 unsigned Opcode = Inst.getOpcode();
7809 unsigned NewOpcode = getVALUOp(Inst);
7810 const DebugLoc &DL = Inst.getDebugLoc();
7811
7812 // Handle some special cases
7813 switch (Opcode) {
7814 default:
7815 break;
7816 case AMDGPU::S_ADD_I32:
7817 case AMDGPU::S_SUB_I32: {
7818 // FIXME: The u32 versions currently selected use the carry.
7819 bool Changed;
7820 MachineBasicBlock *CreatedBBTmp = nullptr;
7821 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7822 if (Changed)
7823 return;
7824
7825 // Default handling
7826 break;
7827 }
7828
7829 case AMDGPU::S_MUL_U64:
7830 if (ST.hasVectorMulU64()) {
7831 NewOpcode = AMDGPU::V_MUL_U64_e64;
7832 break;
7833 }
7834 // Split s_mul_u64 in 32-bit vector multiplications.
7835 splitScalarSMulU64(Worklist, Inst, MDT);
7836 Inst.eraseFromParent();
7837 return;
7838
7839 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7840 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7841 // This is a special case of s_mul_u64 where all the operands are either
7842 // zero extended or sign extended.
7843 splitScalarSMulPseudo(Worklist, Inst, MDT);
7844 Inst.eraseFromParent();
7845 return;
7846
7847 case AMDGPU::S_AND_B64:
7848 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7849 Inst.eraseFromParent();
7850 return;
7851
7852 case AMDGPU::S_OR_B64:
7853 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7854 Inst.eraseFromParent();
7855 return;
7856
7857 case AMDGPU::S_XOR_B64:
7858 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7859 Inst.eraseFromParent();
7860 return;
7861
7862 case AMDGPU::S_NAND_B64:
7863 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7864 Inst.eraseFromParent();
7865 return;
7866
7867 case AMDGPU::S_NOR_B64:
7868 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7869 Inst.eraseFromParent();
7870 return;
7871
7872 case AMDGPU::S_XNOR_B64:
7873 if (ST.hasDLInsts())
7874 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7875 else
7876 splitScalar64BitXnor(Worklist, Inst, MDT);
7877 Inst.eraseFromParent();
7878 return;
7879
7880 case AMDGPU::S_ANDN2_B64:
7881 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7882 Inst.eraseFromParent();
7883 return;
7884
7885 case AMDGPU::S_ORN2_B64:
7886 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7887 Inst.eraseFromParent();
7888 return;
7889
7890 case AMDGPU::S_BREV_B64:
7891 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7892 Inst.eraseFromParent();
7893 return;
7894
7895 case AMDGPU::S_NOT_B64:
7896 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7897 Inst.eraseFromParent();
7898 return;
7899
7900 case AMDGPU::S_BCNT1_I32_B64:
7901 splitScalar64BitBCNT(Worklist, Inst);
7902 Inst.eraseFromParent();
7903 return;
7904
7905 case AMDGPU::S_BFE_I64:
7906 splitScalar64BitBFE(Worklist, Inst);
7907 Inst.eraseFromParent();
7908 return;
7909
7910 case AMDGPU::S_FLBIT_I32_B64:
7911 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7912 Inst.eraseFromParent();
7913 return;
7914 case AMDGPU::S_FF1_I32_B64:
7915 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7916 Inst.eraseFromParent();
7917 return;
7918
7919 case AMDGPU::S_LSHL_B32:
7920 if (ST.hasOnlyRevVALUShifts()) {
7921 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7922 swapOperands(Inst);
7923 }
7924 break;
7925 case AMDGPU::S_ASHR_I32:
7926 if (ST.hasOnlyRevVALUShifts()) {
7927 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7928 swapOperands(Inst);
7929 }
7930 break;
7931 case AMDGPU::S_LSHR_B32:
7932 if (ST.hasOnlyRevVALUShifts()) {
7933 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7934 swapOperands(Inst);
7935 }
7936 break;
7937 case AMDGPU::S_LSHL_B64:
7938 if (ST.hasOnlyRevVALUShifts()) {
7939 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7940 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7941 : AMDGPU::V_LSHLREV_B64_e64;
7942 swapOperands(Inst);
7943 }
7944 break;
7945 case AMDGPU::S_ASHR_I64:
7946 if (ST.hasOnlyRevVALUShifts()) {
7947 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7948 swapOperands(Inst);
7949 }
7950 break;
7951 case AMDGPU::S_LSHR_B64:
7952 if (ST.hasOnlyRevVALUShifts()) {
7953 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7954 swapOperands(Inst);
7955 }
7956 break;
7957
7958 case AMDGPU::S_ABS_I32:
7959 lowerScalarAbs(Worklist, Inst);
7960 Inst.eraseFromParent();
7961 return;
7962
7963 case AMDGPU::S_ABSDIFF_I32:
7964 lowerScalarAbsDiff(Worklist, Inst);
7965 Inst.eraseFromParent();
7966 return;
7967
7968 case AMDGPU::S_CBRANCH_SCC0:
7969 case AMDGPU::S_CBRANCH_SCC1: {
7970 // Clear unused bits of vcc
7971 Register CondReg = Inst.getOperand(1).getReg();
7972 bool IsSCC = CondReg == AMDGPU::SCC;
7974 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7975 .addReg(LMC.ExecReg)
7976 .addReg(IsSCC ? LMC.VccReg : CondReg);
7977 Inst.removeOperand(1);
7978 } break;
7979
7980 case AMDGPU::S_BFE_U64:
7981 case AMDGPU::S_BFM_B64:
7982 llvm_unreachable("Moving this op to VALU not implemented");
7983
7984 case AMDGPU::S_PACK_LL_B32_B16:
7985 case AMDGPU::S_PACK_LH_B32_B16:
7986 case AMDGPU::S_PACK_HL_B32_B16:
7987 case AMDGPU::S_PACK_HH_B32_B16:
7988 movePackToVALU(Worklist, MRI, Inst);
7989 Inst.eraseFromParent();
7990 return;
7991
7992 case AMDGPU::S_XNOR_B32:
7993 lowerScalarXnor(Worklist, Inst);
7994 Inst.eraseFromParent();
7995 return;
7996
7997 case AMDGPU::S_NAND_B32:
7998 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7999 Inst.eraseFromParent();
8000 return;
8001
8002 case AMDGPU::S_NOR_B32:
8003 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
8004 Inst.eraseFromParent();
8005 return;
8006
8007 case AMDGPU::S_ANDN2_B32:
8008 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
8009 Inst.eraseFromParent();
8010 return;
8011
8012 case AMDGPU::S_ORN2_B32:
8013 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
8014 Inst.eraseFromParent();
8015 return;
8016
8017 // TODO: remove as soon as everything is ready
8018 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8019 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8020 // can only be selected from the uniform SDNode.
8021 case AMDGPU::S_ADD_CO_PSEUDO:
8022 case AMDGPU::S_SUB_CO_PSEUDO: {
8023 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8024 ? AMDGPU::V_ADDC_U32_e64
8025 : AMDGPU::V_SUBB_U32_e64;
8026 const auto *CarryRC = RI.getWaveMaskRegClass();
8027
8028 Register CarryInReg = Inst.getOperand(4).getReg();
8029 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8030 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8031 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8032 .addReg(CarryInReg);
8033 }
8034
8035 Register CarryOutReg = Inst.getOperand(1).getReg();
8036
8037 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8038 MRI.getRegClass(Inst.getOperand(0).getReg())));
8039 MachineInstr *CarryOp =
8040 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8041 .addReg(CarryOutReg, RegState::Define)
8042 .add(Inst.getOperand(2))
8043 .add(Inst.getOperand(3))
8044 .addReg(CarryInReg)
8045 .addImm(0);
8046 legalizeOperands(*CarryOp);
8047 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8048 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8049 Inst.eraseFromParent();
8050 }
8051 return;
8052 case AMDGPU::S_UADDO_PSEUDO:
8053 case AMDGPU::S_USUBO_PSEUDO: {
8054 MachineOperand &Dest0 = Inst.getOperand(0);
8055 MachineOperand &Dest1 = Inst.getOperand(1);
8056 MachineOperand &Src0 = Inst.getOperand(2);
8057 MachineOperand &Src1 = Inst.getOperand(3);
8058
8059 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8060 ? AMDGPU::V_ADD_CO_U32_e64
8061 : AMDGPU::V_SUB_CO_U32_e64;
8062 const TargetRegisterClass *NewRC =
8063 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8064 Register DestReg = MRI.createVirtualRegister(NewRC);
8065 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8066 .addReg(Dest1.getReg(), RegState::Define)
8067 .add(Src0)
8068 .add(Src1)
8069 .addImm(0); // clamp bit
8070
8071 legalizeOperands(*NewInstr, MDT);
8072 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8073 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8074 Inst.eraseFromParent();
8075 }
8076 return;
8077 case AMDGPU::S_LSHL1_ADD_U32:
8078 case AMDGPU::S_LSHL2_ADD_U32:
8079 case AMDGPU::S_LSHL3_ADD_U32:
8080 case AMDGPU::S_LSHL4_ADD_U32: {
8081 MachineOperand &Dest = Inst.getOperand(0);
8082 MachineOperand &Src0 = Inst.getOperand(1);
8083 MachineOperand &Src1 = Inst.getOperand(2);
8084 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8085 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8086 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8087 : 4);
8088
8089 const TargetRegisterClass *NewRC =
8090 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8091 Register DestReg = MRI.createVirtualRegister(NewRC);
8092 MachineInstr *NewInstr =
8093 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8094 .add(Src0)
8095 .addImm(ShiftAmt)
8096 .add(Src1);
8097
8098 legalizeOperands(*NewInstr, MDT);
8099 MRI.replaceRegWith(Dest.getReg(), DestReg);
8100 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8101 Inst.eraseFromParent();
8102 }
8103 return;
8104 case AMDGPU::S_CSELECT_B32:
8105 case AMDGPU::S_CSELECT_B64:
8106 lowerSelect(Worklist, Inst, MDT);
8107 Inst.eraseFromParent();
8108 return;
8109 case AMDGPU::S_CMP_EQ_I32:
8110 case AMDGPU::S_CMP_LG_I32:
8111 case AMDGPU::S_CMP_GT_I32:
8112 case AMDGPU::S_CMP_GE_I32:
8113 case AMDGPU::S_CMP_LT_I32:
8114 case AMDGPU::S_CMP_LE_I32:
8115 case AMDGPU::S_CMP_EQ_U32:
8116 case AMDGPU::S_CMP_LG_U32:
8117 case AMDGPU::S_CMP_GT_U32:
8118 case AMDGPU::S_CMP_GE_U32:
8119 case AMDGPU::S_CMP_LT_U32:
8120 case AMDGPU::S_CMP_LE_U32:
8121 case AMDGPU::S_CMP_EQ_U64:
8122 case AMDGPU::S_CMP_LG_U64:
8123 case AMDGPU::S_CMP_LT_F32:
8124 case AMDGPU::S_CMP_EQ_F32:
8125 case AMDGPU::S_CMP_LE_F32:
8126 case AMDGPU::S_CMP_GT_F32:
8127 case AMDGPU::S_CMP_LG_F32:
8128 case AMDGPU::S_CMP_GE_F32:
8129 case AMDGPU::S_CMP_O_F32:
8130 case AMDGPU::S_CMP_U_F32:
8131 case AMDGPU::S_CMP_NGE_F32:
8132 case AMDGPU::S_CMP_NLG_F32:
8133 case AMDGPU::S_CMP_NGT_F32:
8134 case AMDGPU::S_CMP_NLE_F32:
8135 case AMDGPU::S_CMP_NEQ_F32:
8136 case AMDGPU::S_CMP_NLT_F32: {
8137 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8138 auto NewInstr =
8139 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8140 .setMIFlags(Inst.getFlags());
8141 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8142 0) {
8143 NewInstr
8144 .addImm(0) // src0_modifiers
8145 .add(Inst.getOperand(0)) // src0
8146 .addImm(0) // src1_modifiers
8147 .add(Inst.getOperand(1)) // src1
8148 .addImm(0); // clamp
8149 } else {
8150 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8151 }
8152 legalizeOperands(*NewInstr, MDT);
8153 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8154 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8155 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8156 Inst.eraseFromParent();
8157 return;
8158 }
8159 case AMDGPU::S_CMP_LT_F16:
8160 case AMDGPU::S_CMP_EQ_F16:
8161 case AMDGPU::S_CMP_LE_F16:
8162 case AMDGPU::S_CMP_GT_F16:
8163 case AMDGPU::S_CMP_LG_F16:
8164 case AMDGPU::S_CMP_GE_F16:
8165 case AMDGPU::S_CMP_O_F16:
8166 case AMDGPU::S_CMP_U_F16:
8167 case AMDGPU::S_CMP_NGE_F16:
8168 case AMDGPU::S_CMP_NLG_F16:
8169 case AMDGPU::S_CMP_NGT_F16:
8170 case AMDGPU::S_CMP_NLE_F16:
8171 case AMDGPU::S_CMP_NEQ_F16:
8172 case AMDGPU::S_CMP_NLT_F16: {
8173 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8174 auto NewInstr =
8175 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8176 .setMIFlags(Inst.getFlags());
8177 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8178 NewInstr
8179 .addImm(0) // src0_modifiers
8180 .add(Inst.getOperand(0)) // src0
8181 .addImm(0) // src1_modifiers
8182 .add(Inst.getOperand(1)) // src1
8183 .addImm(0); // clamp
8184 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8185 NewInstr.addImm(0); // op_sel0
8186 } else {
8187 NewInstr
8188 .add(Inst.getOperand(0))
8189 .add(Inst.getOperand(1));
8190 }
8191 legalizeOperandsVALUt16(*NewInstr, MRI);
8192 legalizeOperands(*NewInstr, MDT);
8193 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8194 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8195 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8196 Inst.eraseFromParent();
8197 return;
8198 }
8199 case AMDGPU::S_CVT_HI_F32_F16: {
8200 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8201 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8202 if (ST.useRealTrue16Insts()) {
8203 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8204 .add(Inst.getOperand(1));
8205 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8206 .addImm(0) // src0_modifiers
8207 .addReg(TmpReg, {}, AMDGPU::hi16)
8208 .addImm(0) // clamp
8209 .addImm(0) // omod
8210 .addImm(0); // op_sel0
8211 } else {
8212 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8213 .addImm(16)
8214 .add(Inst.getOperand(1));
8215 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8216 .addImm(0) // src0_modifiers
8217 .addReg(TmpReg)
8218 .addImm(0) // clamp
8219 .addImm(0); // omod
8220 }
8221
8222 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8223 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8224 Inst.eraseFromParent();
8225 return;
8226 }
8227 case AMDGPU::S_MINIMUM_F32:
8228 case AMDGPU::S_MAXIMUM_F32: {
8229 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8230 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8231 .addImm(0) // src0_modifiers
8232 .add(Inst.getOperand(1))
8233 .addImm(0) // src1_modifiers
8234 .add(Inst.getOperand(2))
8235 .addImm(0) // clamp
8236 .addImm(0); // omod
8237 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8238
8239 legalizeOperands(*NewInstr, MDT);
8240 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8241 Inst.eraseFromParent();
8242 return;
8243 }
8244 case AMDGPU::S_MINIMUM_F16:
8245 case AMDGPU::S_MAXIMUM_F16: {
8246 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8247 ? &AMDGPU::VGPR_16RegClass
8248 : &AMDGPU::VGPR_32RegClass);
8249 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8250 .addImm(0) // src0_modifiers
8251 .add(Inst.getOperand(1))
8252 .addImm(0) // src1_modifiers
8253 .add(Inst.getOperand(2))
8254 .addImm(0) // clamp
8255 .addImm(0) // omod
8256 .addImm(0); // opsel0
8257 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8258 legalizeOperandsVALUt16(*NewInstr, MRI);
8259 legalizeOperands(*NewInstr, MDT);
8260 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8261 Inst.eraseFromParent();
8262 return;
8263 }
8264 case AMDGPU::V_S_EXP_F16_e64:
8265 case AMDGPU::V_S_LOG_F16_e64:
8266 case AMDGPU::V_S_RCP_F16_e64:
8267 case AMDGPU::V_S_RSQ_F16_e64:
8268 case AMDGPU::V_S_SQRT_F16_e64: {
8269 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8270 ? &AMDGPU::VGPR_16RegClass
8271 : &AMDGPU::VGPR_32RegClass);
8272 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8273 .add(Inst.getOperand(1)) // src0_modifiers
8274 .add(Inst.getOperand(2))
8275 .add(Inst.getOperand(3)) // clamp
8276 .add(Inst.getOperand(4)) // omod
8277 .setMIFlags(Inst.getFlags());
8278 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8279 NewInstr.addImm(0); // opsel0
8280 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8281 legalizeOperandsVALUt16(*NewInstr, MRI);
8282 legalizeOperands(*NewInstr, MDT);
8283 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8284 Inst.eraseFromParent();
8285 return;
8286 }
8287 }
8288
8289 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8290 // We cannot move this instruction to the VALU, so we should try to
8291 // legalize its operands instead.
8292 legalizeOperands(Inst, MDT);
8293 return;
8294 }
8295 // Handle converting generic instructions like COPY-to-SGPR into
8296 // COPY-to-VGPR.
8297 if (NewOpcode == Opcode) {
8298 Register DstReg = Inst.getOperand(0).getReg();
8299 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8300
8301 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8302 // hope for the best.
8303 if (Inst.isCopy() && DstReg.isPhysical() &&
8304 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8305 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8306 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8307 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8308 .add(Inst.getOperand(1));
8309 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8310 DstReg)
8311 .addReg(NewDst);
8312
8313 Inst.eraseFromParent();
8314 return;
8315 }
8316
8317 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8318 Register NewDstReg = Inst.getOperand(1).getReg();
8319 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8320 if (const TargetRegisterClass *CommonRC =
8321 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8322 // Instead of creating a copy where src and dst are the same register
8323 // class, we just replace all uses of dst with src. These kinds of
8324 // copies interfere with the heuristics MachineSink uses to decide
8325 // whether or not to split a critical edge. Since the pass assumes
8326 // that copies will end up as machine instructions and not be
8327 // eliminated.
8328 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8329 MRI.replaceRegWith(DstReg, NewDstReg);
8330 MRI.clearKillFlags(NewDstReg);
8331 Inst.getOperand(0).setReg(DstReg);
8332
8333 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8334 llvm_unreachable("failed to constrain register");
8335
8336 Inst.eraseFromParent();
8337
8338 for (MachineOperand &UseMO :
8339 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8340 MachineInstr &UseMI = *UseMO.getParent();
8341
8342 // Legalize t16 operands since replaceReg is called after
8343 // addUsersToVALU.
8345
8346 unsigned OpIdx = UseMI.getOperandNo(&UseMO);
8347 if (const TargetRegisterClass *OpRC =
8348 getRegClass(UseMI.getDesc(), OpIdx))
8349 MRI.constrainRegClass(NewDstReg, OpRC);
8350 }
8351
8352 return;
8353 }
8354 }
8355
8356 // If this is a v2s copy between 16bit and 32bit reg,
8357 // replace vgpr copy to reg_sequence/extract_subreg
8358 // This can be remove after we have sgpr16 in place
8359 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8360 Inst.getOperand(1).getReg().isVirtual() &&
8361 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8362 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8363 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8364 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8365 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8366 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8367 get(AMDGPU::IMPLICIT_DEF), Undef);
8368 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8369 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8370 .addReg(Inst.getOperand(1).getReg())
8371 .addImm(AMDGPU::lo16)
8372 .addReg(Undef)
8373 .addImm(AMDGPU::hi16);
8374 Inst.eraseFromParent();
8375 MRI.replaceRegWith(DstReg, NewDstReg);
8376 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8377 return;
8378 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8379 AMDGPU::lo16)) {
8380 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8381 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8382 MRI.replaceRegWith(DstReg, NewDstReg);
8383 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8384 return;
8385 }
8386 }
8387
8388 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8389 MRI.replaceRegWith(DstReg, NewDstReg);
8390 legalizeOperands(Inst, MDT);
8391 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8392 return;
8393 }
8394
8395 // Use the new VALU Opcode.
8396 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8397 .setMIFlags(Inst.getFlags());
8398 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8399 // Intersperse VOP3 modifiers among the SALU operands.
8400 NewInstr->addOperand(Inst.getOperand(0));
8401 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8402 AMDGPU::OpName::src0_modifiers) >= 0)
8403 NewInstr.addImm(0);
8404 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8405 const MachineOperand &Src = Inst.getOperand(1);
8406 NewInstr->addOperand(Src);
8407 }
8408
8409 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8410 // We are converting these to a BFE, so we need to add the missing
8411 // operands for the size and offset.
8412 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8413 NewInstr.addImm(0);
8414 NewInstr.addImm(Size);
8415 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8416 // The VALU version adds the second operand to the result, so insert an
8417 // extra 0 operand.
8418 NewInstr.addImm(0);
8419 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8420 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8421 // If we need to move this to VGPRs, we need to unpack the second
8422 // operand back into the 2 separate ones for bit offset and width.
8423 assert(OffsetWidthOp.isImm() &&
8424 "Scalar BFE is only implemented for constant width and offset");
8425 uint32_t Imm = OffsetWidthOp.getImm();
8426
8427 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8428 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8429 NewInstr.addImm(Offset);
8430 NewInstr.addImm(BitWidth);
8431 } else {
8432 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8433 AMDGPU::OpName::src1_modifiers) >= 0)
8434 NewInstr.addImm(0);
8435 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8436 NewInstr->addOperand(Inst.getOperand(2));
8437 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8438 AMDGPU::OpName::src2_modifiers) >= 0)
8439 NewInstr.addImm(0);
8440 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8441 NewInstr->addOperand(Inst.getOperand(3));
8442 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8443 NewInstr.addImm(0);
8444 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8445 NewInstr.addImm(0);
8446 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8447 NewInstr.addImm(0);
8448 }
8449 } else {
8450 // Just copy the SALU operands.
8451 for (const MachineOperand &Op : Inst.explicit_operands())
8452 NewInstr->addOperand(Op);
8453 }
8454
8455 // Remove any references to SCC. Vector instructions can't read from it, and
8456 // We're just about to add the implicit use / defs of VCC, and we don't want
8457 // both.
8458 for (MachineOperand &Op : Inst.implicit_operands()) {
8459 if (Op.getReg() == AMDGPU::SCC) {
8460 // Only propagate through live-def of SCC.
8461 if (Op.isDef() && !Op.isDead())
8462 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8463 if (Op.isUse())
8464 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8465 }
8466 }
8467 Inst.eraseFromParent();
8468 Register NewDstReg;
8469 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8470 Register DstReg = NewInstr->getOperand(0).getReg();
8471 assert(DstReg.isVirtual());
8472 // Update the destination register class.
8473 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8474 assert(NewDstRC);
8475 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8476 MRI.replaceRegWith(DstReg, NewDstReg);
8477 }
8478 fixImplicitOperands(*NewInstr);
8479
8480 legalizeOperandsVALUt16(*NewInstr, MRI);
8481
8482 // Legalize the operands
8483 legalizeOperands(*NewInstr, MDT);
8484 if (NewDstReg)
8485 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8486}
8487
// Add/sub require special handling to deal with carry outs.
//
// Returns {true, NewBB} if the instruction was handled here (NewBB is the
// possibly-new insertion block produced by legalizeOperands, or nullptr if
// the block was unchanged), and {false, nullptr} if the caller must lower
// the add/sub through the generic carry-out expansion instead.
std::pair<bool, MachineBasicBlock *>
SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
                              MachineDominatorTree *MDT) const {
  if (ST.hasAddNoCarryInsts()) {
    // Assume there is no user of scc since we don't select this in that case.
    // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
    // is used.

    MachineBasicBlock &MBB = *Inst.getParent();
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

    Register OldDstReg = Inst.getOperand(0).getReg();
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    unsigned Opc = Inst.getOpcode();
    assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);

    unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
      AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;

    // Operand 3 is the SCC operand of the scalar form; the no-carry VALU
    // form has no SCC operand, so drop it before mutating the descriptor.
    assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
    Inst.removeOperand(3);

    // Mutate the scalar instruction in place into its VALU counterpart and
    // retarget its result to a fresh VGPR.
    Inst.setDesc(get(NewOpc));
    Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
    Inst.addImplicitDefUseOperands(*MBB.getParent());
    MRI.replaceRegWith(OldDstReg, ResultReg);
    MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);

    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return std::pair(true, NewBB);
  }

  return std::pair(false, nullptr);
}
8524
8525void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8526 MachineDominatorTree *MDT) const {
8527
8528 MachineBasicBlock &MBB = *Inst.getParent();
8529 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8530 MachineBasicBlock::iterator MII = Inst;
8531 const DebugLoc &DL = Inst.getDebugLoc();
8532
8533 MachineOperand &Dest = Inst.getOperand(0);
8534 MachineOperand &Src0 = Inst.getOperand(1);
8535 MachineOperand &Src1 = Inst.getOperand(2);
8536 MachineOperand &Cond = Inst.getOperand(3);
8537
8538 Register CondReg = Cond.getReg();
8539 bool IsSCC = (CondReg == AMDGPU::SCC);
8540
8541 // If this is a trivial select where the condition is effectively not SCC
8542 // (CondReg is a source of copy to SCC), then the select is semantically
8543 // equivalent to copying CondReg. Hence, there is no need to create
8544 // V_CNDMASK, we can just use that and bail out.
8545 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8546 (Src1.getImm() == 0)) {
8547 MRI.replaceRegWith(Dest.getReg(), CondReg);
8548 return;
8549 }
8550
8551 Register NewCondReg = CondReg;
8552 if (IsSCC) {
8553 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8554 NewCondReg = MRI.createVirtualRegister(TC);
8555
8556 // Now look for the closest SCC def if it is a copy
8557 // replacing the CondReg with the COPY source register
8558 bool CopyFound = false;
8559 for (MachineInstr &CandI :
8561 Inst.getParent()->rend())) {
8562 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8563 -1) {
8564 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8565 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8566 .addReg(CandI.getOperand(1).getReg());
8567 CopyFound = true;
8568 }
8569 break;
8570 }
8571 }
8572 if (!CopyFound) {
8573 // SCC def is not a copy
8574 // Insert a trivial select instead of creating a copy, because a copy from
8575 // SCC would semantically mean just copying a single bit, but we may need
8576 // the result to be a vector condition mask that needs preserving.
8577 unsigned Opcode =
8578 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8579 auto NewSelect =
8580 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8581 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8582 }
8583 }
8584
8585 Register NewDestReg = MRI.createVirtualRegister(
8586 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8587 MachineInstr *NewInst;
8588 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8589 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8590 .addImm(0)
8591 .add(Src1) // False
8592 .addImm(0)
8593 .add(Src0) // True
8594 .addReg(NewCondReg);
8595 } else {
8596 NewInst =
8597 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8598 .add(Src1) // False
8599 .add(Src0) // True
8600 .addReg(NewCondReg);
8601 }
8602 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8603 legalizeOperands(*NewInst, MDT);
8604 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8605}
8606
8607void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8608 MachineInstr &Inst) const {
8609 MachineBasicBlock &MBB = *Inst.getParent();
8610 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8611 MachineBasicBlock::iterator MII = Inst;
8612 const DebugLoc &DL = Inst.getDebugLoc();
8613
8614 MachineOperand &Dest = Inst.getOperand(0);
8615 MachineOperand &Src = Inst.getOperand(1);
8616 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8617 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8618
8619 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8620 : AMDGPU::V_SUB_CO_U32_e32;
8621
8622 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8623 .addImm(0)
8624 .addReg(Src.getReg());
8625
8626 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8627 .addReg(Src.getReg())
8628 .addReg(TmpReg);
8629
8630 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8631 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8632}
8633
8634void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8635 MachineInstr &Inst) const {
8636 MachineBasicBlock &MBB = *Inst.getParent();
8637 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8638 MachineBasicBlock::iterator MII = Inst;
8639 const DebugLoc &DL = Inst.getDebugLoc();
8640
8641 MachineOperand &Dest = Inst.getOperand(0);
8642 MachineOperand &Src1 = Inst.getOperand(1);
8643 MachineOperand &Src2 = Inst.getOperand(2);
8644 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8645 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8646 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8647
8648 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8649 : AMDGPU::V_SUB_CO_U32_e32;
8650
8651 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8652 .addReg(Src1.getReg())
8653 .addReg(Src2.getReg());
8654
8655 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8656
8657 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8658 .addReg(SubResultReg)
8659 .addReg(TmpReg);
8660
8661 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8662 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8663}
8664
// Lower a scalar XNOR to the VALU. Targets with DL instructions have a
// native V_XNOR_B32; otherwise expand to S_NOT + S_XOR on the scalar unit
// and queue the pieces so a later worklist pass can move them to the VALU
// if required.
void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
                                  MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  if (ST.hasDLInsts()) {
    // Direct lowering: force both sources to be VGPR-legal, then emit the
    // native vector XNOR.
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
        .add(Src0)
        .add(Src1);

    MRI.replaceRegWith(Dest.getReg(), NewDest);
    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  } else {
    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
    // invert either source and then perform the XOR. If either source is a
    // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
    bool Src0IsSGPR = Src0.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
    bool Src1IsSGPR = Src1.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
    MachineInstr *Xor;
    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Build a pair of scalar instructions and add them to the work list.
    // The next iteration over the work list will lower these to the vector
    // unit as necessary.
    if (Src0IsSGPR) {
      // NOT the SGPR source, XOR with the other operand.
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .addReg(Temp)
                .add(Src1);
    } else if (Src1IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
                .add(Src0)
                .addReg(Temp);
    } else {
      // Neither source is an SGPR: XOR first, then invert the result. The
      // trailing NOT gets its own worklist entry; the XOR is queued below
      // as in the other two cases.
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
                .add(Src0)
                .add(Src1);
      MachineInstr *Not =
          BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
      Worklist.insert(Not);
    }

    MRI.replaceRegWith(Dest.getReg(), NewDest);

    Worklist.insert(Xor);

    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  }
}
8729
8730void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8731 MachineInstr &Inst,
8732 unsigned Opcode) const {
8733 MachineBasicBlock &MBB = *Inst.getParent();
8734 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8735 MachineBasicBlock::iterator MII = Inst;
8736 const DebugLoc &DL = Inst.getDebugLoc();
8737
8738 MachineOperand &Dest = Inst.getOperand(0);
8739 MachineOperand &Src0 = Inst.getOperand(1);
8740 MachineOperand &Src1 = Inst.getOperand(2);
8741
8742 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8743 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8744
8745 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8746 .add(Src0)
8747 .add(Src1);
8748
8749 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8750 .addReg(Interm);
8751
8752 Worklist.insert(&Op);
8753 Worklist.insert(&Not);
8754
8755 MRI.replaceRegWith(Dest.getReg(), NewDest);
8756 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8757}
8758
8759void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8760 MachineInstr &Inst,
8761 unsigned Opcode) const {
8762 MachineBasicBlock &MBB = *Inst.getParent();
8763 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8764 MachineBasicBlock::iterator MII = Inst;
8765 const DebugLoc &DL = Inst.getDebugLoc();
8766
8767 MachineOperand &Dest = Inst.getOperand(0);
8768 MachineOperand &Src0 = Inst.getOperand(1);
8769 MachineOperand &Src1 = Inst.getOperand(2);
8770
8771 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8772 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8773
8774 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8775 .add(Src1);
8776
8777 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8778 .add(Src0)
8779 .addReg(Interm);
8780
8781 Worklist.insert(&Not);
8782 Worklist.insert(&Op);
8783
8784 MRI.replaceRegWith(Dest.getReg(), NewDest);
8785 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8786}
8787
// Split a 64-bit scalar unary operation into two 32-bit VALU operations:
// \p Opcode is applied independently to the low and high halves of the
// source and the results are recombined with a REG_SEQUENCE. If \p Swap is
// set, the two result halves are exchanged before recombination.
void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
                                          MachineInstr &Inst, unsigned Opcode,
                                          bool Swap) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; treat it as a 32-bit SGPR.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC =
      RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);

  // Apply the operation to the low half.
  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  // Apply the operation to the high half.
  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);

  if (Swap)
    std::swap(DestSub0, DestSub1);

  // Recombine the two halves into the full 64-bit result.
  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // The new halves may themselves still need lowering.
  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
8846
// There is not a vector equivalent of s_mul_u64. For this reason, we need to
// split the s_mul_u64 in 32-bit vector multiplications.
void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
                                     MachineInstr &Inst,
                                     MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  // The extracted 32-bit halves must live in VGPR sub-register classes.
  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src0SubRC))
    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src1SubRC))
    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);

  // First, we extract the low 32-bit and high 32-bit values from each of the
  // operands.
  MachineOperand Op0L =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
  MachineOperand Op1L =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
  MachineOperand Op0H =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
  MachineOperand Op1H =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

  // The multiplication is done as follows:
  //
  //                            Op1H  Op1L
  //                          * Op0H  Op0L
  //              --------------------------
  //                       Op1H*Op0L  Op1L*Op0L
  //          +  Op1H*Op0H Op1L*Op0H
  // -----------------------------------------
  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L
  //
  // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
  // value and that would overflow.
  // The low 32-bit value is Op1L*Op0L.
  // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).

  Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Op1L_Op0H =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
          .add(Op1L)
          .add(Op0H);

  Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Op1H_Op0L =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
          .add(Op1H)
          .add(Op0L);

  // The carry into the high half is the upper 32 bits of the low product.
  Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Carry =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
          .add(Op1L)
          .add(Op0L);

  MachineInstr *LoHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
          .add(Op1L)
          .add(Op0L);

  // High half: (Op1L*Op0H + Op1H*Op0L) + carry.
  Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
                          .addReg(Op1L_Op0H_Reg)
                          .addReg(Op1H_Op0L_Reg);

  MachineInstr *HiHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
          .addReg(AddReg)
          .addReg(CarryReg);

  // Stitch the two 32-bit halves back into the 64-bit result.
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*Op1L_Op0H, MDT);
  legalizeOperands(*Op1H_Op0L, MDT);
  legalizeOperands(*Carry, MDT);
  legalizeOperands(*LoHalf, MDT);
  legalizeOperands(*Add, MDT);
  legalizeOperands(*HiHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
8955
// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
// multiplications.
void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
                                        MachineInstr &Inst,
                                        MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  // The extracted 32-bit halves must live in VGPR sub-register classes.
  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src0SubRC))
    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
  if (RI.isSGPRClass(Src1SubRC))
    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);

  // First, we extract the low 32-bit and high 32-bit values from each of the
  // operands. Only the low halves are consumed: these pseudos multiply two
  // 32-bit inputs into a 64-bit result.
  MachineOperand Op0L =
      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
  MachineOperand Op1L =
      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

  // High half of the result: the unsigned or signed 32x32 high product,
  // matching which pseudo is being lowered.
  unsigned Opc = Inst.getOpcode();
  unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
                        ? AMDGPU::V_MUL_HI_U32_e64
                        : AMDGPU::V_MUL_HI_I32_e64;
  MachineInstr *HiHalf =
      BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);

  // Low half of the result: the low 32 bits of the 32x32 product.
  MachineInstr *LoHalf =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
          .add(Op1L)
          .add(Op0L);

  // Stitch the two halves into the 64-bit result.
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*HiHalf, MDT);
  legalizeOperands(*LoHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
9020
// Split a 64-bit scalar binary operation into two 32-bit VALU operations
// applied pairwise to the low halves and high halves of the two sources,
// then recombine the results with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
                                           MachineInstr &Inst, unsigned Opcode,
                                           MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // Immediate sources have no register class; treat them as 32-bit SGPRs.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC =
      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC =
      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);

  // Extract the low and high 32-bit halves of both operands.
  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC =
      RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);

  // low(result) = low(src0) <Opcode> low(src1)
  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                              .add(SrcReg0Sub0)
                              .add(SrcReg1Sub0);

  // high(result) = high(src0) <Opcode> high(src1)
  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                              .add(SrcReg0Sub1)
                              .add(SrcReg1Sub1);

  // Recombine the halves into the full 64-bit result.
  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // The new halves may themselves still need lowering.
  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
9087
9088void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9089 MachineInstr &Inst,
9090 MachineDominatorTree *MDT) const {
9091 MachineBasicBlock &MBB = *Inst.getParent();
9092 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9093
9094 MachineOperand &Dest = Inst.getOperand(0);
9095 MachineOperand &Src0 = Inst.getOperand(1);
9096 MachineOperand &Src1 = Inst.getOperand(2);
9097 const DebugLoc &DL = Inst.getDebugLoc();
9098
9099 MachineBasicBlock::iterator MII = Inst;
9100
9101 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9102
9103 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9104
9105 MachineOperand* Op0;
9106 MachineOperand* Op1;
9107
9108 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9109 Op0 = &Src0;
9110 Op1 = &Src1;
9111 } else {
9112 Op0 = &Src1;
9113 Op1 = &Src0;
9114 }
9115
9116 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9117 .add(*Op0);
9118
9119 Register NewDest = MRI.createVirtualRegister(DestRC);
9120
9121 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9122 .addReg(Interm)
9123 .add(*Op1);
9124
9125 MRI.replaceRegWith(Dest.getReg(), NewDest);
9126
9127 Worklist.insert(&Xor);
9128}
9129
9130void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9131 MachineInstr &Inst) const {
9132 MachineBasicBlock &MBB = *Inst.getParent();
9133 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9134
9135 MachineBasicBlock::iterator MII = Inst;
9136 const DebugLoc &DL = Inst.getDebugLoc();
9137
9138 MachineOperand &Dest = Inst.getOperand(0);
9139 MachineOperand &Src = Inst.getOperand(1);
9140
9141 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9142 const TargetRegisterClass *SrcRC = Src.isReg() ?
9143 MRI.getRegClass(Src.getReg()) :
9144 &AMDGPU::SGPR_32RegClass;
9145
9146 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9147 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9148
9149 const TargetRegisterClass *SrcSubRC =
9150 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9151
9152 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9153 AMDGPU::sub0, SrcSubRC);
9154 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9155 AMDGPU::sub1, SrcSubRC);
9156
9157 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9158
9159 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9160
9161 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9162
9163 // We don't need to legalize operands here. src0 for either instruction can be
9164 // an SGPR, and the second input is unused or determined here.
9165 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9166}
9167
// Split a 64-bit S_BFE_I64 (sign-extend-in-register) into 32-bit VALU ops.
// Only offset-0, width <= 32 extractions are supported.
void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
                                      MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  // S_BFE immediate encoding: offset in bits [5:0], width in bits [22:16].
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset;

  // Only sext_inreg cases handled.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    // Sign-extend within the low 32 bits with V_BFE, then fill the high half
    // by replicating the sign bit (arithmetic shift right by 31).
    Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
        .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
        .addImm(0)
        .addImm(BitWidth);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: the low half is unchanged; the high half is just the
  // sign-extension of the low half.
  MachineOperand &Src = Inst.getOperand(1);
  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), {}, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), {}, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
9228
// Split a 64-bit scalar bit-scan (find-first-set / count-leading-zeros)
// into per-half 32-bit VALU scans combined with a saturating add and a min.
void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
                                          MachineInstr &Inst, unsigned Opcode,
                                          MachineDominatorTree *MDT) const {
  // (S_FLBIT_I32_B64 hi:lo) ->
  // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
  // (S_FF1_I32_B64 hi:lo) ->
  // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(Opcode);

  // CTLZ scans from the high end, so the "+ 32" bias applies to the low-half
  // scan; CTTZ is the mirror image (bias the high-half scan).
  bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
  unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
                                               : AMDGPU::V_ADD_CO_U32_e32;

  // An immediate source has no register class; treat it as a 32-bit SGPR.
  const TargetRegisterClass *SrcRC =
      Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
  const TargetRegisterClass *SrcSubRC =
      RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 =
      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 =
      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);

  Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // Per-half bit scans.
  BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);

  BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);

  // uaddsat(scan, 32): the clamp bit makes the add saturate rather than
  // wrap, which keeps an all-zero half (scan result is all-ones per the ISA)
  // from corrupting the min below.
  BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
      .addReg(IsCtlz ? MidReg1 : MidReg2)
      .addImm(32)
      .addImm(1); // enable clamp

  // Final result: the smaller of the biased and unbiased half scans.
  BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
      .addReg(MidReg3)
      .addReg(IsCtlz ? MidReg2 : MidReg1);

  MRI.replaceRegWith(Dest.getReg(), MidReg4);

  addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
}
9283
// After DstReg has been rewritten to a VGPR, visit every instruction that
// uses it: any user whose relevant operand cannot accept vector registers
// is queued for VALU lowering itself.
void SIInstrInfo::addUsersToMoveToVALUWorklist(
    Register DstReg, MachineRegisterInfo &MRI,
    SIInstrWorklist &Worklist) const {
  // Early-inc iteration: the legalization call at the bottom of the loop can
  // change DstReg's use list while we walk it.
  for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
    MachineInstr &UseMI = *MO.getParent();

    // Operand whose register class decides whether this user needs lowering.
    unsigned OpNo = 0;

    switch (UseMI.getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::WQM:
    case AMDGPU::SOFT_WQM:
    case AMDGPU::STRICT_WWM:
    case AMDGPU::STRICT_WQM:
    case AMDGPU::REG_SEQUENCE:
    case AMDGPU::PHI:
    case AMDGPU::INSERT_SUBREG:
      // For these register-agnostic operations, judge by the destination
      // operand (OpNo 0) rather than the use operand itself.
      break;
    default:
      OpNo = MO.getOperandNo();
      break;
    }

    const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
    MRI.constrainRegClass(DstReg, OpRC);

    if (!RI.hasVectorRegisters(OpRC))
      Worklist.insert(&UseMI);
    else
      // Legalization could change user list.
      legalizeOperandsVALUt16(UseMI, OpNo, MRI);
  }
}
9317
// Lower an S_PACK_{LL,LH,HL,HH}_B32_B16 scalar pack to an equivalent VALU
// sequence. The packed 32-bit result lands in a fresh VGPR; every use of the
// original destination is rewritten to it, and those users are queued for
// VALU legalization in turn.
void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
                                 MachineInstr &Inst) const {
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock *MBB = Inst.getParent();
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  if (ST.useRealTrue16Insts()) {
    // True16 path: get both sources into VGPRs, then assemble the 32-bit
    // result as a REG_SEQUENCE of lo16/hi16 halves.
    Register SrcReg0, SrcReg1;
    if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
      // Materialize a non-VGPR source (immediate or SGPR) in a VGPR first.
      SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, Inst, DL,
              get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
          .add(Src0);
    } else {
      SrcReg0 = Src0.getReg();
    }

    if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
      SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, Inst, DL,
              get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
          .add(Src1);
    } else {
      SrcReg1 = Src1.getReg();
    }

    // If a source can be constrained to VGPR_16 it is a 16-bit register used
    // whole (no subregister index); otherwise its lo16 half supplies the bits.
    bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
    bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);

    auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
    switch (Inst.getOpcode()) {
    case AMDGPU::S_PACK_LL_B32_B16:
      // result.lo16 = src0.lo16, result.hi16 = src1.lo16
      NewMI
          .addReg(SrcReg0, {},
                  isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
          .addImm(AMDGPU::lo16)
          .addReg(SrcReg1, {},
                  isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
          .addImm(AMDGPU::hi16);
      break;
    case AMDGPU::S_PACK_LH_B32_B16:
      // result.lo16 = src0.lo16, result.hi16 = src1.hi16
      NewMI
          .addReg(SrcReg0, {},
                  isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
          .addImm(AMDGPU::lo16)
          .addReg(SrcReg1, {}, AMDGPU::hi16)
          .addImm(AMDGPU::hi16);
      break;
    case AMDGPU::S_PACK_HL_B32_B16:
      // result.lo16 = src0.hi16, result.hi16 = src1.lo16
      NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
          .addImm(AMDGPU::lo16)
          .addReg(SrcReg1, {},
                  isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
          .addImm(AMDGPU::hi16);
      break;
    case AMDGPU::S_PACK_HH_B32_B16:
      // result.lo16 = src0.hi16, result.hi16 = src1.hi16
      NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
          .addImm(AMDGPU::lo16)
          .addReg(SrcReg1, {}, AMDGPU::hi16)
          .addImm(AMDGPU::hi16);
      break;
    default:
      llvm_unreachable("unhandled s_pack_* instruction");
    }

    MachineOperand &Dest = Inst.getOperand(0);
    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // Non-true16 path: emulate each pack with 32-bit bit arithmetic.
  switch (Inst.getOpcode()) {
  case AMDGPU::S_PACK_LL_B32_B16: {
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    // 0.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
        .addImm(0xffff);

    // result = (src1 << 16) | (src0 & 0xffff)
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
        .addReg(ImmReg, RegState::Kill)
        .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
        .add(Src1)
        .addImm(16)
        .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_LH_B32_B16: {
    // Bitfield insert: low 16 bits from src0, remaining bits from src1.
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
        .addImm(0xffff);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
        .addReg(ImmReg, RegState::Kill)
        .add(Src0)
        .add(Src1);
    break;
  }
  case AMDGPU::S_PACK_HL_B32_B16: {
    // result = (src1 << 16) | (src0 >> 16)
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
        .addImm(16)
        .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
        .add(Src1)
        .addImm(16)
        .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_HH_B32_B16: {
    // result = (src1 & 0xffff0000) | (src0 >> 16)
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
        .addImm(16)
        .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
        .addImm(0xffff0000);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
        .add(Src1)
        .addReg(ImmReg, RegState::Kill)
        .addReg(TmpReg, RegState::Kill);
    break;
  }
  default:
    llvm_unreachable("unhandled s_pack_* instruction");
  }

  MachineOperand &Dest = Inst.getOperand(0);
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
9455
// After the instruction defining SCC is moved to the VALU, fix up everything
// downstream that consumed that SCC value. Plain SCC readers are redirected
// to NewCond and queued for conversion themselves; COPYs out of SCC are
// replaced wholesale by NewCond and deleted. The scan stops at the next SCC
// def, where the old value dies.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
                                               MachineInstr &SCCDefInst,
                                               SIInstrWorklist &Worklist,
                                               Register NewCond) const {

  // Ensure that def inst defines SCC, which is still live.
  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
         !Op.isDead() && Op.getParent() == &SCCDefInst);
  SmallVector<MachineInstr *, 4> CopyToDelete;
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI : // Skip the def inst itself.
       make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
                  SCCDefInst.getParent()->end())) {
    // Check if SCC is used first.
    int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
    if (SCCIdx != -1) {
      if (MI.isCopy()) {
        // A copy out of SCC: forward NewCond to the copy's users and drop the
        // copy itself. Deletion is deferred so the iteration stays valid.
        MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
        Register DestReg = MI.getOperand(0).getReg();

        MRI.replaceRegWith(DestReg, NewCond);
        CopyToDelete.push_back(&MI);
      } else {

        if (NewCond.isValid())
          MI.getOperand(SCCIdx).setReg(NewCond);

        // This reader must itself be converted to a VALU instruction.
        Worklist.insert(&MI);
      }
    }
    // Exit if we find another SCC def.
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
      break;
  }
  for (auto &Copy : CopyToDelete)
    Copy->eraseFromParent();
}
9494
9495// Instructions that use SCC may be converted to VALU instructions. When that
9496// happens, the SCC register is changed to VCC_LO. The instruction that defines
9497// SCC must be changed to an instruction that defines VCC. This function makes
9498// sure that the instruction that defines SCC is added to the moveToVALU
9499// worklist.
9500void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9501 SIInstrWorklist &Worklist) const {
9502 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9503 // then there is nothing to do because the defining instruction has been
9504 // converted to a VALU already. If SCC then that instruction needs to be
9505 // converted to a VALU.
9506 for (MachineInstr &MI :
9507 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9508 SCCUseInst->getParent()->rend())) {
9509 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9510 break;
9511 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9512 Worklist.insert(&MI);
9513 break;
9514 }
9515 }
9516}
9517
9518const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9519 const MachineInstr &Inst) const {
9520 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9521
9522 switch (Inst.getOpcode()) {
9523 // For target instructions, getOpRegClass just returns the virtual register
9524 // class associated with the operand, so we need to find an equivalent VGPR
9525 // register class in order to move the instruction to the VALU.
9526 case AMDGPU::COPY:
9527 case AMDGPU::PHI:
9528 case AMDGPU::REG_SEQUENCE:
9529 case AMDGPU::INSERT_SUBREG:
9530 case AMDGPU::WQM:
9531 case AMDGPU::SOFT_WQM:
9532 case AMDGPU::STRICT_WWM:
9533 case AMDGPU::STRICT_WQM: {
9534 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9535 if (RI.isAGPRClass(SrcRC)) {
9536 if (RI.isAGPRClass(NewDstRC))
9537 return nullptr;
9538
9539 switch (Inst.getOpcode()) {
9540 case AMDGPU::PHI:
9541 case AMDGPU::REG_SEQUENCE:
9542 case AMDGPU::INSERT_SUBREG:
9543 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9544 break;
9545 default:
9546 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9547 }
9548
9549 if (!NewDstRC)
9550 return nullptr;
9551 } else {
9552 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9553 return nullptr;
9554
9555 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9556 if (!NewDstRC)
9557 return nullptr;
9558 }
9559
9560 return NewDstRC;
9561 }
9562 default:
9563 return NewDstRC;
9564 }
9565}
9566
9567// Find the one SGPR operand we are allowed to use.
9568Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9569 int OpIndices[3]) const {
9570 const MCInstrDesc &Desc = MI.getDesc();
9571
9572 // Find the one SGPR operand we are allowed to use.
9573 //
9574 // First we need to consider the instruction's operand requirements before
9575 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9576 // of VCC, but we are still bound by the constant bus requirement to only use
9577 // one.
9578 //
9579 // If the operand's class is an SGPR, we can never move it.
9580
9581 Register SGPRReg = findImplicitSGPRRead(MI);
9582 if (SGPRReg)
9583 return SGPRReg;
9584
9585 Register UsedSGPRs[3] = {Register()};
9586 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9587
9588 for (unsigned i = 0; i < 3; ++i) {
9589 int Idx = OpIndices[i];
9590 if (Idx == -1)
9591 break;
9592
9593 const MachineOperand &MO = MI.getOperand(Idx);
9594 if (!MO.isReg())
9595 continue;
9596
9597 // Is this operand statically required to be an SGPR based on the operand
9598 // constraints?
9599 const TargetRegisterClass *OpRC =
9600 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9601 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9602 if (IsRequiredSGPR)
9603 return MO.getReg();
9604
9605 // If this could be a VGPR or an SGPR, Check the dynamic register class.
9606 Register Reg = MO.getReg();
9607 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9608 if (RI.isSGPRClass(RegRC))
9609 UsedSGPRs[i] = Reg;
9610 }
9611
9612 // We don't have a required SGPR operand, so we have a bit more freedom in
9613 // selecting operands to move.
9614
9615 // Try to select the most used SGPR. If an SGPR is equal to one of the
9616 // others, we choose that.
9617 //
9618 // e.g.
9619 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9620 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9621
9622 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9623 // prefer those.
9624
9625 if (UsedSGPRs[0]) {
9626 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9627 SGPRReg = UsedSGPRs[0];
9628 }
9629
9630 if (!SGPRReg && UsedSGPRs[1]) {
9631 if (UsedSGPRs[1] == UsedSGPRs[2])
9632 SGPRReg = UsedSGPRs[1];
9633 }
9634
9635 return SGPRReg;
9636}
9637
                                               AMDGPU::OpName OperandName) const {
  // NUM_OPERAND_NAMES is the sentinel for "no operand name".
  if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
    return nullptr;

  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1) // This opcode has no operand with that name.
    return nullptr;

  return &MI.getOperand(Idx);
}
9649
  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
    // GFX10+: pack the format together with fixed descriptor fields.
    return (Format << 44) |
           (1ULL << 56) | // RESOURCE_LEVEL = 1
           (3ULL << 60); // OOB_SELECT = 3
  }

  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    // Set ATC = 1. GFX9 doesn't have this bit.
    if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (1ULL << 56);

    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // BTW, it disables TC L2 and therefore decreases performance.
    if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}
9674
                    0xffffffff; // Size;

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64 / 32 (encoding 3 selects stride 64 for wave64,
  // encoding 2 selects stride 32 for wave32).
  uint64_t IndexStride = ST.isWave64() ? 3 : 2;
  Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      ST.getGeneration() <= AMDGPUSubtarget::GFX9)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}
9698
  unsigned Opc = MI.getOpcode();

  // Only SMRD (scalar memory read) opcodes qualify here.
  return isSMRD(Opc);
}
9704
  // Qualifies only for loads through the vector-memory instruction classes
  // (MUBUF/MTBUF/MIMG/FLAT).
  return get(Opc).mayLoad() &&
         (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
}
9709
                                     int &FrameIndex) const {
  // Only accesses whose vaddr is a frame index are stack accesses.
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
  if (!Addr || !Addr->isFI())
    return Register();

  // A frame-index address implies private (scratch) address space.
  assert(!MI.memoperands_empty() &&
         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);

  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
}
9722
                                         int &FrameIndex) const {
  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
  assert(Addr && Addr->isFI());
  // Report the frame index and return the data register being transferred.
  FrameIndex = Addr->getIndex();
  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
}
9730
                                          int &FrameIndex) const {
  if (!MI.mayLoad())
    return Register();

  // MUBUF loads and VGPR spill restores address the stack through vaddr.
  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  // SGPR spill restores use the addr/data operand pair instead.
  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return Register();
}
9744
                                         int &FrameIndex) const {
  if (!MI.mayStore())
    return Register();

  // Mirrors isLoadFromStackSlot, but for the store direction.
  if (isMUBUF(MI) || isVGPRSpill(MI))
    return isStackAccess(MI, FrameIndex);

  if (isSGPRSpill(MI))
    return isSGPRStackAccess(MI, FrameIndex);

  return Register();
}
9758
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  // Accumulate the size of every instruction inside the bundle.
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
  }

  return Size;
}
9770
  unsigned Opc = MI.getOpcode();
  unsigned DescSize = Desc.getSize();

  // If we have a definitive size, we can use it. Otherwise we need to inspect
  // the operands to know the size.
  if (isFixedSize(MI)) {
    unsigned Size = DescSize;

    // If we hit the buggy offset, an extra nop will be inserted in MC so
    // estimate the worst case.
    if (MI.isBranch() && ST.hasOffset3fBug())
      Size += 4;

    return Size;
  }

  // Instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
  if (isVALU(MI) || isSALU(MI)) {
    // DPP forms do not carry a trailing literal.
    if (isDPP(MI))
      return DescSize;
    bool HasLiteral = false;
    unsigned LiteralSize = 4;
    for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MCOperandInfo &OpInfo = Desc.operands()[I];
      if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
        HasLiteral = true;
        if (ST.has64BitLiterals()) {
          // Some operand types may need the wider 8-byte literal encoding.
          switch (OpInfo.OperandType) {
          default:
            break;
            if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
              LiteralSize = 8;
            break;
            // A 32-bit literal is only valid when the value fits in BOTH signed
            // and unsigned 32-bit ranges [0, 2^31-1], matching the MC code
            // emitter's getLit64Encoding logic. This is because of the lack of
            // ability to tell signedness of the literal, therefore we need to
            // be conservative and assume values outside this range require a
            // 64-bit literal encoding (8 bytes).
            if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
                !isUInt<32>(Op.getImm()))
              LiteralSize = 8;
            break;
          }
        }
        // At most one literal per instruction; stop at the first.
        break;
      }
    }
    return HasLiteral ? DescSize + LiteralSize : DescSize;
  }

  // Check whether we have extra NSA words.
  if (isMIMG(MI)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx < 0)
      return 8;

    int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
  }

  switch (Opc) {
  case TargetOpcode::BUNDLE:
    return getInstBundleSize(MI);
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR: {
    const MachineFunction *MF = MI.getMF();
    const char *AsmStr = MI.getOperand(0).getSymbolName();
    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
  }
  default:
    // Meta instructions emit no machine code.
    if (MI.isMetaInstruction())
      return 0;

    // If D16 Pseudo inst, get correct MC code size
    const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
    if (D16Info) {
      // Assume d16_lo/hi inst are always in same size
      unsigned LoInstOpcode = D16Info->LoOp;
      const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
      DescSize = Desc.getSize();
    }

    // If FMA Pseudo inst, get correct MC code size
    if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
      // All potential lowerings are the same size; arbitrarily pick one.
      const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
      DescSize = Desc.getSize();
    }

    return DescSize;
  }
}
9870
  if (!isFLAT(MI))
    return false;

  // Without memory operands we must conservatively assume a flat access.
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *MMO : MI.memoperands()) {
    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
      return true;
  }
  return false;
}
9884
  // Static table pairing each AMDGPU target index with its stable string
  // name.
  static const std::pair<int, const char *> TargetIndices[] = {
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return ArrayRef(TargetIndices);
}
9895
9896/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9897/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9903
9904/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9905/// pass.
9911
// Called during:
// - pre-RA scheduling and post-RA scheduling
                                            const ScheduleDAGMI *DAG) const {
  // Borrowed from Arm Target
  // We would like to restrict this hazard recognizer to only
  // post-RA scheduling; we can tell that we're post-RA because we don't
  // track VRegLiveness.
  if (!DAG->hasVRegLiveness())
    return new GCNHazardRecognizer(DAG->MF);
}
9925
std::pair<unsigned, unsigned>
  // Split into (direct flags, bitmask flags) using MO_MASK.
  return std::pair(TF & MO_MASK, TF & ~MO_MASK);
}
9930
  // Direct operand target flags with their stable string names.
  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_GOTPCREL, "amdgpu-gotprel"},
      {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
      {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
      {MO_GOTPCREL64, "amdgpu-gotprel64"},
      {MO_REL32_LO, "amdgpu-rel32-lo"},
      {MO_REL32_HI, "amdgpu-rel32-hi"},
      {MO_REL64, "amdgpu-rel64"},
      {MO_ABS32_LO, "amdgpu-abs32-lo"},
      {MO_ABS32_HI, "amdgpu-abs32-hi"},
      {MO_ABS64, "amdgpu-abs64"},
  };

  return ArrayRef(TargetFlags);
}
9948
  // Target-specific MachineMemOperand flags with their stable string names.
  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
      {
          {MONoClobber, "amdgpu-noclobber"},
          {MOLastUse, "amdgpu-last-use"},
          {MOCooperative, "amdgpu-cooperative"},
          {MOThreadPrivate, "amdgpu-thread-private"},
      };

  return ArrayRef(TargetFlags);
}
9961
                                             const MachineFunction &MF) const {
  assert(SrcReg.isVirtual());
  // Registers flagged as WWM must be split with WWM_COPY; all others take a
  // plain COPY.
  if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
    return AMDGPU::WWM_COPY;

  return AMDGPU::COPY;
}
9971
  uint32_t Opcode = MI.getOpcode();
  // Check if it is SGPR spill or wwm-register spill Opcode.
  if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
    return true;

  const MachineFunction *MF = MI.getMF();
  const MachineRegisterInfo &MRI = MF->getRegInfo();

  // See if this is Liverange split instruction inserted for SGPR or
  // wwm-register. The implicit def inserted for wwm-registers should also be
  // included as they can appear at the bb begin.
  bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
  if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
    return false;

  // An SGPR destination qualifies only for live-range split instructions
  // (an IMPLICIT_DEF of an SGPR does not).
  Register Reg = MI.getOperand(0).getReg();
  if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
    return IsLRSplitInst;

  // Otherwise only writes to wwm-registers are accepted.
  return MFI->isWWMReg(Reg);
}
9995
                                       Register Reg) const {
  // We need to handle instructions which may be inserted during register
  // allocation to handle the prolog. The initial prolog instruction may have
  // been separated from the start of the block by spills and copies inserted
  // needed by the prolog. However, the insertions for scalar registers can
  // always be placed at the BB top as they are independent of the exec mask
  // value.
  bool IsNullOrVectorRegister = true;
  if (Reg) {
    const MachineFunction *MF = MI.getMF();
    const MachineRegisterInfo &MRI = MF->getRegInfo();
    // Scalar (SGPR) defs never count as part of the prologue here.
    IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
  }

  return IsNullOrVectorRegister &&
         (canAddToBBProlog(MI) ||
          (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
           MI.modifiesRegister(AMDGPU::EXEC, &RI)));
}
10016
                                               const DebugLoc &DL,
                                               Register DestReg) const {
  // With native no-carry adds there is no carry-out operand to allocate.
  if (ST.hasAddNoCarryInsts())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);

  // Otherwise allocate a throwaway carry-out register, hinted to VCC.
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
  MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
      .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
10032
                                               const DebugLoc &DL,
                                               Register DestReg,
                                               RegScavenger &RS) const {
  // With native no-carry adds there is no carry-out operand to scavenge.
  if (ST.hasAddNoCarryInsts())
    return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);

  // If available, prefer to use vcc.
  Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
                             ? Register(RI.getVCC())
                             : RS.scavengeRegisterBackwards(
                                   *RI.getBoolRC(), I, /* RestoreAfter */ false,
                                   0, /* AllowSpill */ false);

  // Scavenging can fail; callers receive an empty builder in that case.
  // TODO: Users need to deal with this.
  if (!UnusedCarry.isValid())
    return MachineInstrBuilder();

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
      .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
10055
10056bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10057 switch (Opcode) {
10058 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10059 case AMDGPU::SI_KILL_I1_TERMINATOR:
10060 return true;
10061 default:
10062 return false;
10063 }
10064}
10065
  // Map each SI_KILL_*_PSEUDO to its corresponding terminator form.
  switch (Opcode) {
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
  }
}
10076
10077bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10078 return Imm <= getMaxMUBUFImmOffset(ST);
10079}
10080
  // GFX12 field is non-negative 24-bit signed byte offset: 23 magnitude
  // bits (sign bit excluded); earlier targets have a 12-bit field.
  const unsigned OffsetBits =
      ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
  return (1 << OffsetBits) - 1;
}
10087
  if (!ST.isWave32())
    return;

  // Inline asm operands are left untouched.
  if (MI.isInlineAsm())
    return;

  if (MI.getNumOperands() < MI.getNumExplicitOperands())
    return;

  // On wave32 the condition/carry register is VCC_LO, not the full VCC pair.
  for (auto &Op : MI.implicit_operands()) {
    if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
      Op.setReg(AMDGPU::VCC_LO);
  }
}
10103
  if (!isSMRD(MI))
    return false;

  // Check that it is using a buffer resource.
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
  if (Idx == -1) // e.g. s_memtime
    return false;

  // A 128-bit sbase operand holds a buffer descriptor rather than a plain
  // 64-bit base address.
  const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
  return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}
10116
10117// Given Imm, split it into the values to put into the SOffset and ImmOffset
10118// fields in an MUBUF instruction. Return false if it is not possible (due to a
10119// hardware bug needing a workaround).
10120//
10121// The required alignment ensures that individual address components remain
10122// aligned if they are aligned to begin with. It also ensures that additional
10123// offsets within the given alignment can be added to the resulting ImmOffset.
                                   uint32_t &ImmOffset, Align Alignment) const {
  // Largest immediate usable while keeping the requested alignment.
  const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
  uint32_t Overflow = 0;

  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      // Use an SOffset inline constant for 4..64
      Overflow = Imm - MaxImm;
      Imm = MaxImm;
    } else {
      // Try to keep the same value in SOffset for adjacent loads, so that
      // the corresponding register contents can be re-used.
      //
      // Load values with all low-bits (except for alignment bits) set into
      // SOffset, so that a larger range of values can be covered using
      // s_movk_i32.
      //
      // Atomic operations fail to work correctly when individual address
      // components are unaligned, even if their sum is aligned.
      uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
      uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
      Imm = Low;
      Overflow = High - Alignment.value();
    }
  }

  if (Overflow > 0) {
    // There is a hardware bug in SI and CI which prevents address clamping in
    // MUBUF instructions from working correctly with SOffsets. The immediate
    // offset is unaffected.
    if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
      return false;

    // It is not possible to set immediate in SOffset field on some targets.
    if (ST.hasRestrictedSOffset())
      return false;
  }

  ImmOffset = Imm;
  SOffset = Overflow;
  return true;
}
10168
10169// Depending on the used address space and instructions, some immediate offsets
10170// are allowed and some are not.
10171// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10172// scratch instruction offsets can also be negative. On GFX12, offsets can be
10173// negative for all variants.
10174//
10175// There are several bugs related to these offsets:
10176// On gfx10.1, flat instructions that go into the global address space cannot
10177// use an offset.
10178//
10179// For scratch instructions, the address can be either an SGPR or a VGPR.
10180// The following offsets can be used, depending on the architecture (x means
10181// cannot be used):
10182// +----------------------------+------+------+
10183// | Address-Mode | SGPR | VGPR |
10184// +----------------------------+------+------+
10185// | gfx9 | | |
10186// | negative, 4-aligned offset | x | ok |
10187// | negative, unaligned offset | x | ok |
10188// +----------------------------+------+------+
10189// | gfx10 | | |
10190// | negative, 4-aligned offset | ok | ok |
10191// | negative, unaligned offset | ok | x |
10192// +----------------------------+------+------+
10193// | gfx10.3 | | |
10194// | negative, 4-aligned offset | ok | ok |
10195// | negative, unaligned offset | ok | ok |
10196// +----------------------------+------+------+
10197//
10198// This function ignores the addressing mode, so if an offset cannot be used in
10199// one addressing mode, it is considered illegal.
10200bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10201 uint64_t FlatVariant) const {
10202 // TODO: Should 0 be special cased?
10203 if (!ST.hasFlatInstOffsets())
10204 return false;
10205
10206 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10207 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10208 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10209 return false;
10210
10211 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10212 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10213 (Offset % 4) != 0) {
10214 return false;
10215 }
10216
10217 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10218 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10219 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10220}
10221
10222// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10223std::pair<int64_t, int64_t>
10224SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10225 uint64_t FlatVariant) const {
10226 int64_t RemainderOffset = COffsetVal;
10227 int64_t ImmField = 0;
10228
10229 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10230 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10231
10232 if (AllowNegative) {
10233 // Use signed division by a power of two to truncate towards 0.
10234 int64_t D = 1LL << NumBits;
10235 RemainderOffset = (COffsetVal / D) * D;
10236 ImmField = COffsetVal - RemainderOffset;
10237
10238 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10239 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10240 (ImmField % 4) != 0) {
10241 // Make ImmField a multiple of 4
10242 RemainderOffset += ImmField % 4;
10243 ImmField -= ImmField % 4;
10244 }
10245 } else if (COffsetVal >= 0) {
10246 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10247 RemainderOffset = COffsetVal - ImmField;
10248 }
10249
10250 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10251 assert(RemainderOffset + ImmField == COffsetVal);
10252 return {ImmField, RemainderOffset};
10253}
10254
  if (ST.hasNegativeScratchOffsetBug() &&
      FlatVariant == SIInstrFlags::FlatScratch)
    return false;

  // Plain FLAT only permits negative offsets on GFX12+; the other variants
  // permit them whenever the scratch bug above does not apply.
  return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
}
10262
// Map the subtarget's generation to its MC instruction encoding family.
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
  switch (ST.getGeneration()) {
  default:
    break;
    return SIEncodingFamily::SI;
    return SIEncodingFamily::VI;
    return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
    return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
  }
  llvm_unreachable("Unknown subtarget generation!");
}
10286
10287bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10288 switch(MCOp) {
10289 // These opcodes use indirect register addressing so
10290 // they need special handling by codegen (currently missing).
10291 // Therefore it is too risky to allow these opcodes
10292 // to be selected by dpp combiner or sdwa peepholer.
10293 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10294 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10295 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10296 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10297 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10298 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10299 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10300 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10301 return true;
10302 default:
10303 return false;
10304 }
10305}
10306
// Expands to case labels for all five encoding variants of an opcode
// (_dpp/_e32/_e64/_e64_dpp/_sdwa); used below in isRenamedInGFX9.
#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
  case OPCODE##_dpp: \
  case OPCODE##_e32: \
  case OPCODE##_e64: \
  case OPCODE##_e64_dpp: \
  case OPCODE##_sdwa:
10313
// Opcodes whose mnemonics were renamed on GFX9; pseudoToMCOpcode uses this
// to adjust the encoding family for such instructions on GFX9 subtargets.
static bool isRenamedInGFX9(int Opcode) {
  switch (Opcode) {
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
  GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
  //
  case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
  case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
  case AMDGPU::V_FMA_F16_gfx9_e64:
  case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
  case AMDGPU::V_INTERP_P2_F16:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
    return true;
  default:
    return false;
  }
}
10339
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
  assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
         "SIInsertWaitcnts should have promoted soft waitcnt instructions!");

  unsigned Gen = subtargetEncodingFamily(ST);

  // Instructions renamed on GFX9 need a different encoding family there.
  if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))

  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
  // subtarget has UnpackedD16VMem feature.
  // TODO: remove this when we discard GFX80 encoding.
  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))

  // SDWA encodings also differ per generation.
  if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
    switch (ST.getGeneration()) {
    default:
      break;
      break;
      break;
    }
  }

  // Prefer the early-clobber MFMA variant when one is defined.
  if (isMAI(Opcode)) {
    int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
    if (MFMAOp != -1)
      Opcode = MFMAOp;
  }

  int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

  if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())

  if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())

  // -1 means that Opcode is already a native instruction.
  if (MCOp == -1)
    return Opcode;

  // On GFX90A-family subtargets, retry with the more specific families and
  // take the first one that has an encoding.
  if (ST.hasGFX90AInsts()) {
    uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
    if (ST.hasGFX940Insts())
    if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
    if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
    if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
      MCOp = NMCOp;
  }

  // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
  // encoding in the given subtarget generation.
  if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
    return -1;

  if (isAsmOnlyOpcode(MCOp))
    return -1;

  return MCOp;
}
10409
// Wrap a register machine-operand as a RegSubRegPair; an undef operand maps
// to the default (empty) pair.
// NOTE(review): the signature line (10411) is missing from this dump —
// upstream it is `TargetInstrInfo::RegSubRegPair getRegOrUndef(const
// MachineOperand &RegOpnd)`; confirm before editing.
10410 static
10412 assert(RegOpnd.isReg());
10413 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10414 getRegSubRegPair(RegOpnd);
10415}
10416
// Walk a REG_SEQUENCE's (reg, subreg-index) operand pairs and return the
// source register that supplies subregister SubReg.
// NOTE(review): the function signature (lines 10417-10418) and the
// fall-through return for an unmatched SubReg (line 10425) are missing from
// this dump; verify against the upstream file.
10419 assert(MI.isRegSequence());
// Operands come in pairs after the def: (value reg, subreg immediate).
10420 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10421 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10422 auto &RegOp = MI.getOperand(1 + 2 * I);
10423 return getRegOrUndef(RegOp);
10424 }
10426}
10427
10428 // Try to find the definition of reg:subreg in subreg-manipulation pseudos
10429 // Following a subreg of reg:subreg isn't supported
// Updates RSR in place and returns true when the pseudo was understood.
// NOTE(review): the signature lines (10430-10431) are missing from this dump
// — upstream it is `static bool followSubRegDef(MachineInstr &MI,
// TargetInstrInfo::RegSubRegPair &RSR)`; confirm before editing.
10432 if (!RSR.SubReg)
10433 return false;
10434 switch (MI.getOpcode()) {
10435 default: break;
10436 case AMDGPU::REG_SEQUENCE:
10437 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10438 return true;
10439 // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg
10440 case AMDGPU::INSERT_SUBREG:
10441 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10442 // inserted the subreg we're looking for
10443 RSR = getRegOrUndef(MI.getOperand(2));
10444 else { // the subreg in the rest of the reg
10445 auto R1 = getRegOrUndef(MI.getOperand(1));
10446 if (R1.SubReg) // subreg of subreg isn't supported
10447 return false;
10448 RSR.Reg = R1.Reg;
10449 }
10450 return true;
10451 }
10452 return false;
10453}
10454
// Follow copies/moves and subreg-manipulation pseudos backwards from the
// virtual reg:subreg pair P to its defining instruction, or nullptr if the
// chain cannot be followed (undef source, physical reg, or lost track).
// Requires SSA form.
// NOTE(review): the opening signature line (10455) is missing from this dump;
// the remaining parameter list shows it takes the pair plus the MRI.
10456 const MachineRegisterInfo &MRI) {
10457 assert(MRI.isSSA());
10458 if (!P.Reg.isVirtual())
10459 return nullptr;
10460
10461 auto RSR = P;
10462 auto *DefInst = MRI.getVRegDef(RSR.Reg);
// Loop terminates when the current def can no longer be looked through.
10463 while (auto *MI = DefInst) {
10464 DefInst = nullptr;
10465 switch (MI->getOpcode()) {
10466 case AMDGPU::COPY:
10467 case AMDGPU::V_MOV_B32_e32: {
10468 auto &Op1 = MI->getOperand(1);
10469 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10470 if (Op1.isUndef())
10471 return nullptr;
10472 RSR = getRegSubRegPair(Op1);
10473 DefInst = MRI.getVRegDef(RSR.Reg);
10474 }
10475 break;
10476 }
10477 default:
10478 if (followSubRegDef(*MI, RSR)) {
10479 if (!RSR.Reg)
10480 return nullptr;
10481 DefInst = MRI.getVRegDef(RSR.Reg);
10482 }
10483 }
10484 if (!DefInst)
10485 return MI;
10486 }
10487 return nullptr;
10488}
10489
// Conservatively decide whether EXEC may change between DefMI and UseMI.
// Returns true (may be modified) for cross-block def/use, when the scan
// window is exceeded, or when an intervening instruction writes EXEC;
// returns false only when the short same-block scan proves EXEC untouched.
// NOTE(review): the opening signature line (10490) is missing from this dump.
10491 Register VReg,
10492 const MachineInstr &DefMI,
10493 const MachineInstr &UseMI) {
10494 assert(MRI.isSSA() && "Must be run on SSA");
10495
10496 auto *TRI = MRI.getTargetRegisterInfo();
10497 auto *DefBB = DefMI.getParent();
10498
10499 // Don't bother searching between blocks, although it is possible this block
10500 // doesn't modify exec.
10501 if (UseMI.getParent() != DefBB)
10502 return true;
10503
// Bounded scan: give up (answer "maybe") after 20 real instructions.
10504 const int MaxInstScan = 20;
10505 int NumInst = 0;
10506
10507 // Stop scan at the use.
10508 auto E = UseMI.getIterator();
10509 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10510 if (I->isDebugInstr())
10511 continue;
10512
10513 if (++NumInst > MaxInstScan)
10514 return true;
10515
10516 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10517 return true;
10518 }
10519
10520 return false;
10521}
10522
// Like execMayBeModifiedBeforeUse, but checks all uses of VReg: returns true
// if EXEC may change between DefMI and ANY use. Bails out (true) for
// cross-block or PHI uses, or when use/instruction scan limits are hit;
// returns false only when every use is reached before any EXEC write.
// NOTE(review): the opening signature line (10523) is missing from this dump.
10524 Register VReg,
10525 const MachineInstr &DefMI) {
10526 assert(MRI.isSSA() && "Must be run on SSA");
10527
10528 auto *TRI = MRI.getTargetRegisterInfo();
10529 auto *DefBB = DefMI.getParent();
10530
// First pass: count the uses and reject any we cannot reason about.
10531 const int MaxUseScan = 10;
10532 int NumUse = 0;
10533
10534 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10535 auto &UseInst = *Use.getParent();
10536 // Don't bother searching between blocks, although it is possible this block
10537 // doesn't modify exec.
10538 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10539 return true;
10540
10541 if (++NumUse > MaxUseScan)
10542 return true;
10543 }
10544
10545 if (NumUse == 0)
10546 return false;
10547
// Second pass: walk forward from the def until all uses are consumed or an
// EXEC-overlapping def is found.
10548 const int MaxInstScan = 20;
10549 int NumInst = 0;
10550
10551 // Stop scan when we have seen all the uses.
10552 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10553 assert(I != DefBB->end());
10554
10555 if (I->isDebugInstr())
10556 continue;
10557
10558 if (++NumInst > MaxInstScan)
10559 return true;
10560
10561 for (const MachineOperand &Op : I->operands()) {
10562 // We don't check reg masks here as they're used only on calls:
10563 // 1. EXEC is only considered const within one BB
10564 // 2. Call should be a terminator instruction if present in a BB
10565
10566 if (!Op.isReg())
10567 continue;
10568
10569 Register Reg = Op.getReg();
10570 if (Op.isUse()) {
10571 if (Reg == VReg && --NumUse == 0)
10572 return false;
10573 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10574 return true;
10575 }
10576 }
10577}
10578
// Place the PHI-destination COPY just before the first non-PHI reader of Dst
// in the block (instead of after the last PHI), falling back to the generic
// TargetInstrInfo placement when no such reader exists before LastPHIIt.
// NOTE(review): the opening signature lines (10579-10580) are missing from
// this dump.
10581 const DebugLoc &DL, Register Src, Register Dst) const {
10582 auto Cur = MBB.begin();
10583 if (Cur != MBB.end())
10584 do {
10585 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10586 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10587 ++Cur;
10588 } while (Cur != MBB.end() && Cur != LastPHIIt);
10589
10590 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10591 Dst);
10592}
10593
// Emit the PHI-source copy. When the insertion point is an SI_IF/SI_ELSE/
// SI_IF_BREAK that defines Src, insert a lane-mask terminator move (which
// implicitly reads EXEC) right after it; otherwise defer to the generic
// TargetInstrInfo implementation.
// NOTE(review): the opening signature lines (10594-10595) are missing from
// this dump.
10596 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10597 if (InsPt != MBB.end() &&
10598 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10599 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10600 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10601 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10602 InsPt++;
10603 return BuildMI(MBB, InsPt, DL,
10604 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10605 .addReg(Src, {}, SrcSubReg)
10606 .addReg(AMDGPU::EXEC, RegState::Implicit);
10607 }
10608 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10609 Dst);
10610}
10611
10612bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10613
// Never actually folds a memory operand (always returns nullptr); instead,
// for a full copy between a virtual and a physical register, constrains the
// virtual register's class away from M0/EXEC so the generic folder cannot
// produce an unspillable copy.
// NOTE(review): the opening signature lines (10614-10615) are missing from
// this dump.
10616 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10617 VirtRegMap *VRM) const {
10618 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10619 //
10620 // %0:sreg_32 = COPY $m0
10621 //
10622 // We explicitly chose SReg_32 for the virtual register so such a copy might
10623 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10624 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10625 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10626 // TargetInstrInfo::foldMemoryOperand() is going to try.
10627 // A similar issue also exists with spilling and reloading $exec registers.
10628 //
10629 // To prevent that, constrain the %0 register class here.
10630 if (isFullCopyInstr(MI)) {
10631 Register DstReg = MI.getOperand(0).getReg();
10632 Register SrcReg = MI.getOperand(1).getReg();
// Exactly one side virtual: that side is the register to constrain.
10633 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10634 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10635 MachineRegisterInfo &MRI = MF.getRegInfo();
10636 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10637 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10638 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10639 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10640 return nullptr;
10641 }
10642 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10643 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10644 return nullptr;
10645 }
10646 }
10647 }
10648
10649 return nullptr;
10650}
10651
// Compute instruction latency from the scheduling model. For a BUNDLE, the
// result is the maximum member latency plus (member count - 1), approximating
// serialized issue of the bundled instructions.
// NOTE(review): the opening signature line (10652) and the iterator
// initialization at line 10656 (upstream: const_instr_iterator I of MI) are
// missing from this dump; verify against the upstream file.
10653 const MachineInstr &MI,
10654 unsigned *PredCost) const {
10655 if (MI.isBundle()) {
10657 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10658 unsigned Lat = 0, Count = 0;
10659 for (++I; I != E && I->isBundledWithPred(); ++I) {
10660 ++Count;
10661 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10662 }
10663 return Lat + Count - 1;
10664 }
10665
10666 return SchedModel.computeInstrLatency(&MI);
10667}
10668
// Return the call-target operand of a call instruction (its src0 named
// operand, when present).
// NOTE(review): the signature line (10670) and the no-src0 fallthrough line
// (10674, presumably an llvm_unreachable or similar) are missing from this
// dump; verify against the upstream file.
10669 const MachineOperand &
10671 if (const MachineOperand *CallAddrOp =
10672 getNamedOperand(MI, AMDGPU::OpName::src0))
10673 return *CallAddrOp;
10675}
10676
// Classify uniformity of a generic (pre-ISel) machine instruction: address
// space casts from private to flat with globally-addressable scratch,
// private/flat loads, and generic atomics are handled specially.
// NOTE(review): this dump has rendering gaps — the signature (10677-10678)
// and several `return InstructionUniformity::...` lines (e.g. 10693-10694,
// 10705-10708, 10719, 10738, 10740, 10747, 10749) are missing; verify
// against the upstream file before editing.
10679 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10680 unsigned Opcode = MI.getOpcode();
10681
// Decides uniformity of a private->flat addrspacecast; intrinsic form keeps
// its pointer in operand 2, the generic G_ADDRSPACE_CAST in operand 1.
10682 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10683 Register Dst = MI.getOperand(0).getReg();
10684 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10685 : MI.getOperand(1).getReg();
10686 LLT DstTy = MRI.getType(Dst);
10687 LLT SrcTy = MRI.getType(Src);
10688 unsigned DstAS = DstTy.getAddressSpace();
10689 unsigned SrcAS = SrcTy.getAddressSpace();
10690 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10691 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10692 ST.hasGloballyAddressableScratch()
10695 };
10696
10697 // If the target supports globally addressable scratch, the mapping from
10698 // scratch memory to the flat aperture changes therefore an address space cast
10699 // is no longer uniform.
10700 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10701 return HandleAddrSpaceCast(MI);
10702
10703 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10704 auto IID = GI->getIntrinsicID();
10709
10710 switch (IID) {
10711 case Intrinsic::amdgcn_addrspacecast_nonnull:
10712 return HandleAddrSpaceCast(MI);
10713 case Intrinsic::amdgcn_if:
10714 case Intrinsic::amdgcn_else:
10715 // FIXME: Uniform if second result
10716 break;
10717 }
10718
10720 }
10721
10722 // Loads from the private and flat address spaces are divergent, because
10723 // threads can execute the load instruction with the same inputs and get
10724 // different results.
10725 //
10726 // All other loads are not divergent, because if threads issue loads with the
10727 // same arguments, they will always get the same result.
10728 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10729 Opcode == AMDGPU::G_SEXTLOAD) {
10730 if (MI.memoperands_empty())
10731 return InstructionUniformity::NeverUniform; // conservative assumption
10732
10733 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10734 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10735 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10736 })) {
10737 // At least one MMO in a non-global address space.
10739 }
10741 }
10742
10743 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10744 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10745 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10746 AMDGPU::isGenericAtomic(Opcode)) {
10748 }
10750}
10751
// Lazily construct and cache the AMDGPU-specific MIR formatter.
// NOTE(review): the function signature line (10752) is missing from this
// dump (upstream: const MIRFormatter *SIInstrInfo::getMIRFormatter() const).
10753 if (!Formatter)
10754 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10755 return Formatter.get();
10756}
10757
// Classify uniformity of a (selected) machine instruction: lane-crossing
// reads, physical-SGPR copies, atomics, private/flat loads, and finally the
// register banks of all read operands decide the answer.
// NOTE(review): this dump has rendering gaps — the signature (10758-10759)
// and most `return InstructionUniformity::...` lines (e.g. 10762, 10768,
// 10776, 10778, 10783, 10791, 10805, 10808, 10830, 10840) are missing;
// verify against the upstream file before editing.
10760
10761 if (isNeverUniform(MI))
10763
// Lane-crossing reads produce a scalar (uniform) result.
10764 unsigned opcode = MI.getOpcode();
10765 if (opcode == AMDGPU::V_READLANE_B32 ||
10766 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10767 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10769
// A copy from a physical register is uniform iff the source is an SGPR.
10770 if (isCopyInstr(MI)) {
10771 const MachineOperand &srcOp = MI.getOperand(1);
10772 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10773 const TargetRegisterClass *regClass =
10774 RI.getPhysRegBaseClass(srcOp.getReg());
10775 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10777 }
10779 }
10780
10781 // GMIR handling
10782 if (MI.isPreISelOpcode())
10784
10785 // Atomics are divergent because they are executed sequentially: when an
10786 // atomic operation refers to the same address in each thread, then each
10787 // thread after the first sees the value written by the previous thread as
10788 // original value.
10789
10790 if (isAtomic(MI))
10792
10793 // Loads from the private and flat address spaces are divergent, because
10794 // threads can execute the load instruction with the same inputs and get
10795 // different results.
10796 if (isFLAT(MI) && MI.mayLoad()) {
10797 if (MI.memoperands_empty())
10798 return InstructionUniformity::NeverUniform; // conservative assumption
10799
10800 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10801 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10802 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10803 })) {
10804 // At least one MMO in a non-global address space.
10806 }
10807
10809 }
10810
10811 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10812 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10813
10814 // FIXME: It's conceptually broken to report this for an instruction, and not
10815 // a specific def operand. For inline asm in particular, there could be mixed
10816 // uniform and divergent results.
10817 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10818 const MachineOperand &SrcOp = MI.getOperand(I);
10819 if (!SrcOp.isReg())
10820 continue;
10821
10822 Register Reg = SrcOp.getReg();
10823 if (!Reg || !SrcOp.readsReg())
10824 continue;
10825
10826 // If RegBank is null, this is unassigned or an unallocatable special
10827 // register, which are all scalars.
10828 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10829 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10831 }
10832
10833 // TODO: Uniformity check condtions above can be rearranged for more
10834 // redability
10835
10836 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10837 // currently turned into no-op COPYs by SelectionDAG ISel and are
10838 // therefore no longer recognizable.
10839
10841}
10842
// Map the function's calling convention to the shader-type value used by
// ds_ordered_count (1/2/3 for specific graphics stages, 0 for compute and
// everything else); unsupported conventions raise a diagnostic.
// NOTE(review): this dump has rendering gaps — the signature (10843) and
// several `case CallingConv::...` label lines (10845, 10847, 10849,
// 10851-10853, 10859-10860) are missing; verify against the upstream file.
10844 switch (MF.getFunction().getCallingConv()) {
10846 return 1;
10848 return 2;
10850 return 3;
10854 const Function &F = MF.getFunction();
10855 F.getContext().diagnose(DiagnosticInfoUnsupported(
10856 F, "ds_ordered_count unsupported for this calling conv"));
10857 [[fallthrough]];
10858 }
10861 case CallingConv::C:
10862 case CallingConv::Fast:
10863 default:
10864 // Assume other calling conventions are various compute callable functions
10865 return 0;
10866 }
10867}
10868
// Decompose a scalar S_CMP/S_CMPK instruction into (SrcReg, SrcReg2,
// CmpMask, CmpValue) for the peephole in optimizeCompareInstr. Returns false
// for unsupported opcodes or subregister operands.
// NOTE(review): the opening signature line (10869) is missing from this dump
// (upstream: bool SIInstrInfo::analyzeCompare(const MachineInstr &MI,
// Register &SrcReg, ...)).
10870 Register &SrcReg2, int64_t &CmpMask,
10871 int64_t &CmpValue) const {
10872 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10873 return false;
10874
10875 switch (MI.getOpcode()) {
10876 default:
10877 break;
// Register/register or register/immediate compares.
10878 case AMDGPU::S_CMP_EQ_U32:
10879 case AMDGPU::S_CMP_EQ_I32:
10880 case AMDGPU::S_CMP_LG_U32:
10881 case AMDGPU::S_CMP_LG_I32:
10882 case AMDGPU::S_CMP_LT_U32:
10883 case AMDGPU::S_CMP_LT_I32:
10884 case AMDGPU::S_CMP_GT_U32:
10885 case AMDGPU::S_CMP_GT_I32:
10886 case AMDGPU::S_CMP_LE_U32:
10887 case AMDGPU::S_CMP_LE_I32:
10888 case AMDGPU::S_CMP_GE_U32:
10889 case AMDGPU::S_CMP_GE_I32:
10890 case AMDGPU::S_CMP_EQ_U64:
10891 case AMDGPU::S_CMP_LG_U64:
10892 SrcReg = MI.getOperand(0).getReg();
10893 if (MI.getOperand(1).isReg()) {
10894 if (MI.getOperand(1).getSubReg())
10895 return false;
10896 SrcReg2 = MI.getOperand(1).getReg();
10897 CmpValue = 0;
10898 } else if (MI.getOperand(1).isImm()) {
10899 SrcReg2 = Register();
10900 CmpValue = MI.getOperand(1).getImm();
10901 } else {
10902 return false;
10903 }
10904 CmpMask = ~0;
10905 return true;
// S_CMPK_* always compare against an inline constant.
10906 case AMDGPU::S_CMPK_EQ_U32:
10907 case AMDGPU::S_CMPK_EQ_I32:
10908 case AMDGPU::S_CMPK_LG_U32:
10909 case AMDGPU::S_CMPK_LG_I32:
10910 case AMDGPU::S_CMPK_LT_U32:
10911 case AMDGPU::S_CMPK_LT_I32:
10912 case AMDGPU::S_CMPK_GT_U32:
10913 case AMDGPU::S_CMPK_GT_I32:
10914 case AMDGPU::S_CMPK_LE_U32:
10915 case AMDGPU::S_CMPK_LE_I32:
10916 case AMDGPU::S_CMPK_GE_U32:
10917 case AMDGPU::S_CMPK_GE_I32:
10918 SrcReg = MI.getOperand(0).getReg();
10919 SrcReg2 = Register();
10920 CmpValue = MI.getOperand(1).getImm();
10921 CmpMask = ~0;
10922 return true;
10923 }
10924
10925 return false;
10926}
10927
// SCC is dead on exit from MBB iff no successor has it live-in.
// NOTE(review): the function signature line (10928) is missing from this
// dump (upstream: static bool isSCCDeadOnExit(MachineBasicBlock *MBB)).
10929 for (MachineBasicBlock *S : MBB->successors()) {
10930 if (S->isLiveIn(AMDGPU::SCC))
10931 return false;
10932 }
10933 return true;
10934}
10935
10936// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
10937// (incoming SCC) = !(SCC defined by SCCDef).
10938// Return true if all uses can be re-written, false otherwise.
10939bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
10940 MachineBasicBlock *MBB = SCCDef->getParent();
10941 SmallVector<MachineInstr *> InvertInstr;
10942 bool SCCIsDead = false;
10943
10944 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
10945 constexpr unsigned ScanLimit = 12;
10946 unsigned Count = 0;
10947 for (MachineInstr &MI :
10948 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
10949 if (++Count > ScanLimit)
10950 return false;
10951 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
10952 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
10953 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
10954 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10955 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
10956 InvertInstr.push_back(&MI);
10957 else
10958 return false;
10959 }
10960 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
10961 SCCIsDead = true;
10962 break;
10963 }
10964 }
10965 if (!SCCIsDead && isSCCDeadOnExit(MBB))
10966 SCCIsDead = true;
10967
10968 // SCC may have more uses. Can't invert all of them.
10969 if (!SCCIsDead)
10970 return false;
10971
10972 // Invert uses
10973 for (MachineInstr *MI : InvertInstr) {
10974 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10975 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
10976 swapOperands(*MI);
10977 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10978 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
10979 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
10980 ? AMDGPU::S_CBRANCH_SCC1
10981 : AMDGPU::S_CBRANCH_SCC0));
10982 } else {
10983 llvm_unreachable("SCC used but no inversion handling");
10984 }
10985 }
10986 return true;
10987}
10988
10989// SCC is already valid after SCCValid.
10990// SCCRedefine will redefine SCC to the same value already available after
10991// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
10992// update kill/dead flags if necessary.
10993bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10994 bool NeedInversion) const {
10995 MachineInstr *KillsSCC = nullptr;
10996 if (SCCValid->getParent() != SCCRedefine->getParent())
10997 return false;
10998 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10999 SCCRedefine->getIterator())) {
11000 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11001 return false;
11002 if (MI.killsRegister(AMDGPU::SCC, &RI))
11003 KillsSCC = &MI;
11004 }
11005 if (NeedInversion && !invertSCCUse(SCCRedefine))
11006 return false;
11007 if (MachineOperand *SccDef =
11008 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11009 SccDef->setIsDead(false);
11010 if (KillsSCC)
11011 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11012 SCCRedefine->eraseFromParent();
11013 return true;
11014}
11015
11016static bool foldableSelect(const MachineInstr &Def) {
11017 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11018 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11019 return false;
11020 bool Op1IsNonZeroImm =
11021 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11022 bool Op2IsZeroImm =
11023 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11024 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11025 return false;
11026 return true;
11027}
11028
11029static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11030 unsigned &NewDefOpc) {
11031 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11032 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11033 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11034 Def.getOpcode() != AMDGPU::S_ADD_U32)
11035 return false;
11036 const MachineOperand &AddSrc1 = Def.getOperand(1);
11037 const MachineOperand &AddSrc2 = Def.getOperand(2);
11038 int64_t addend;
11039
11040 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11041 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11042 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11043 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11044 return false;
11045
11046 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11047 const MachineOperand *SccDef =
11048 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11049 if (!SccDef->isDead())
11050 return false;
11051 NewDefOpc = AMDGPU::S_ADD_U32;
11052 }
11053 NeedInversion = !NeedInversion;
11054 return true;
11055}
11056
// Peephole: try to delete a redundant S_CMP* whose SCC result is already
// produced by the defining instruction of SrcReg — either an SCC-setting
// ALU op / foldable cselect (optimizeCmpSelect) or a single-bit S_AND that
// can become an S_BITCMP (optimizeCmpAnd).
// NOTE(review): the opening signature line (11057) is missing from this dump
// (upstream: bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr,
// Register SrcReg, ...)).
11058 Register SrcReg2, int64_t CmpMask,
11059 int64_t CmpValue,
11060 const MachineRegisterInfo *MRI) const {
11061 if (!SrcReg || SrcReg.isPhysical())
11062 return false;
11063
// A register second operand is only handled when it folds to an immediate.
11064 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11065 return false;
11066
11067 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11068 this](bool NeedInversion) -> bool {
11069 if (CmpValue != 0)
11070 return false;
11071
11072 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11073 if (!Def)
11074 return false;
11075
11076 // For S_OP that set SCC = DST!=0, do the transformation
11077 //
11078 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11079 //
11080 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11081 // do the transformation:
11082 //
11083 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11084 //
11085 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11086 // for S_CSELECT* already has the same value that will be calculated by
11087 // s_cmp_lg_*
11088 //
11089 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11090 // (non-zero imm), 0)
11091
11092 unsigned NewDefOpc = Def->getOpcode();
11093 if (!setsSCCIfResultIsNonZero(*Def) &&
11094 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11095 !foldableSelect(*Def))
11096 return false;
11097
11098 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11099 return false;
11100
// setsSCCIfResultIsZero may request an S_ADD_I32 -> S_ADD_U32 morph.
11101 if (NewDefOpc != Def->getOpcode())
11102 Def->setDesc(get(NewDefOpc));
11103
11104 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11105 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11106 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11107 // sX = s_cselect_b64 (non-zero imm), 0
11108 // sLo = copy sX.sub0
11109 // sHi = copy sX.sub1
11110 // sY = s_or_b32 sLo, sHi
11111 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11112 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11113 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11114 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11115 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11116 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11117 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11118 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11119 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11120 Def2->getOperand(1).isReg() &&
11121 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11122 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11123 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11124 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11125 if (Select && foldableSelect(*Select))
11126 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11127 }
11128 }
11129 }
11130 return true;
11131 };
11132
11133 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11134 this](int64_t ExpectedValue, unsigned SrcSize,
11135 bool IsReversible, bool IsSigned) -> bool {
11136 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11137 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11138 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11139 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11140 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11141 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11142 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11143 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11144 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11145 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11146 //
11147 // Signed ge/gt are not used for the sign bit.
11148 //
11149 // If result of the AND is unused except in the compare:
11150 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11151 //
11152 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11153 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11154 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11155 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11156 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11157 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11158
11159 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11160 if (!Def)
11161 return false;
11162
11163 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11164 Def->getOpcode() != AMDGPU::S_AND_B64)
11165 return false;
11166
// Accept a direct or foldable immediate that is a single set bit.
11167 int64_t Mask;
11168 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11169 if (MO->isImm())
11170 Mask = MO->getImm();
11171 else if (!getFoldableImm(MO, Mask))
11172 return false;
11173 Mask &= maxUIntN(SrcSize);
11174 return isPowerOf2_64(Mask);
11175 };
11176
11177 MachineOperand *SrcOp = &Def->getOperand(1);
11178 if (isMask(SrcOp))
11179 SrcOp = &Def->getOperand(2);
11180 else if (isMask(&Def->getOperand(2)))
11181 SrcOp = &Def->getOperand(1);
11182 else
11183 return false;
11184
11185 // A valid Mask is required to have a single bit set, hence a non-zero and
11186 // power-of-two value. This verifies that we will not do 64-bit shift below.
11187 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11188 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11189 if (IsSigned && BitNo == SrcSize - 1)
11190 return false;
11191
11192 ExpectedValue <<= BitNo;
11193
// The compare may match the opposite polarity when reversible.
11194 bool IsReversedCC = false;
11195 if (CmpValue != ExpectedValue) {
11196 if (!IsReversible)
11197 return false;
11198 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11199 if (!IsReversedCC)
11200 return false;
11201 }
11202
11203 Register DefReg = Def->getOperand(0).getReg();
11204 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11205 return false;
11206
11207 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11208 return false;
11209
11210 if (!MRI->use_nodbg_empty(DefReg)) {
11211 assert(!IsReversedCC);
11212 return true;
11213 }
11214
11215 // Replace AND with unused result with a S_BITCMP.
11216 MachineBasicBlock *MBB = Def->getParent();
11217
11218 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11219 : AMDGPU::S_BITCMP1_B32
11220 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11221 : AMDGPU::S_BITCMP1_B64;
11222
11223 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11224 .add(*SrcOp)
11225 .addImm(BitNo);
11226 Def->eraseFromParent();
11227
11228 return true;
11229 };
11230
11231 switch (CmpInstr.getOpcode()) {
11232 default:
11233 break;
11234 case AMDGPU::S_CMP_EQ_U32:
11235 case AMDGPU::S_CMP_EQ_I32:
11236 case AMDGPU::S_CMPK_EQ_U32:
11237 case AMDGPU::S_CMPK_EQ_I32:
11238 return optimizeCmpAnd(1, 32, true, false) ||
11239 optimizeCmpSelect(/*NeedInversion=*/true);
11240 case AMDGPU::S_CMP_GE_U32:
11241 case AMDGPU::S_CMPK_GE_U32:
11242 return optimizeCmpAnd(1, 32, false, false);
11243 case AMDGPU::S_CMP_GE_I32:
11244 case AMDGPU::S_CMPK_GE_I32:
11245 return optimizeCmpAnd(1, 32, false, true);
11246 case AMDGPU::S_CMP_EQ_U64:
11247 return optimizeCmpAnd(1, 64, true, false);
11248 case AMDGPU::S_CMP_LG_U32:
11249 case AMDGPU::S_CMP_LG_I32:
11250 case AMDGPU::S_CMPK_LG_U32:
11251 case AMDGPU::S_CMPK_LG_I32:
11252 return optimizeCmpAnd(0, 32, true, false) ||
11253 optimizeCmpSelect(/*NeedInversion=*/false);
11254 case AMDGPU::S_CMP_GT_U32:
11255 case AMDGPU::S_CMPK_GT_U32:
11256 return optimizeCmpAnd(0, 32, false, false);
11257 case AMDGPU::S_CMP_GT_I32:
11258 case AMDGPU::S_CMPK_GT_I32:
11259 return optimizeCmpAnd(0, 32, false, true);
11260 case AMDGPU::S_CMP_LG_U64:
11261 return optimizeCmpAnd(0, 64, true, false) ||
11262 optimizeCmpSelect(/*NeedInversion=*/false);
11263 }
11264
11265 return false;
11266}
11267
// On subtargets requiring aligned VGPR tuples, force alignment of a 32-bit
// data operand by wrapping it in an implicit 64-bit aligned REG_SEQUENCE
// (operand value in sub0, an IMPLICIT_DEF in sub1).
// NOTE(review): this dump has rendering gaps — the opening signature line
// (11268), the MRI declaration (11283), and the Undef register declaration
// (11286) are missing; verify against the upstream file before editing.
11269 AMDGPU::OpName OpName) const {
11270 if (!ST.needsAlignedVGPRs())
11271 return;
11272
11273 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11274 if (OpNo < 0)
11275 return;
11276 MachineOperand &Op = MI.getOperand(OpNo);
// Only 32-bit operands need the padding trick; wider ones are handled by
// their own register classes.
11277 if (getOpSize(MI, OpNo) > 4)
11278 return;
11279
11280 // Add implicit aligned super-reg to force alignment on the data operand.
11281 const DebugLoc &DL = MI.getDebugLoc();
11282 MachineBasicBlock *BB = MI.getParent();
11284 Register DataReg = Op.getReg();
11285 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11287 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11288 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11289 Register NewVR =
11290 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11291 : &AMDGPU::VReg_64_Align2RegClass);
11292 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11293 .addReg(DataReg, {}, Op.getSubReg())
11294 .addImm(AMDGPU::sub0)
11295 .addReg(Undef)
11296 .addImm(AMDGPU::sub1);
// Rewrite the operand to sub0 of the aligned pair and keep the pair alive
// via an implicit-use operand on MI.
11297 Op.setReg(NewVR);
11298 Op.setSubReg(AMDGPU::sub0);
11299 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11300}
11301
// IGLP scheduling pseudos must not act as scheduling barriers.
// NOTE(review): the signature (11302) and fallthrough return (11306) are
// missing from this dump — presumably this is
// SIInstrInfo::isGlobalMemoryObject deferring to the TargetInstrInfo
// default; confirm against the upstream file.
11303 if (isIGLP(*MI))
11304 return false;
11305
11307}
11308
// A WMMA/SWMMAC instruction counts as XDL; on GFX1250 the generated
// per-opcode table decides, otherwise all WMMA/SWMMAC qualify.
// NOTE(review): the opening signature line (11309) is missing from this dump
// (presumably bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const).
11310 if (!isWMMA(MI) && !isSWMMAC(MI))
11311 return false;
11312
11313 if (ST.hasGFX1250Insts())
11314 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11315
11316 return true;
11317}
11318
// Classify an instruction as XDL (deep-pipe matrix unit): on GFX12+ that is
// DOT or XDL-WMMA; otherwise MAI instructions excluding DGEMM and the
// ACCVGPR read/write movs, refined by the GFX940 per-opcode table.
// NOTE(review): the opening signature line (11319) is missing from this dump
// (presumably bool SIInstrInfo::isXDL(const MachineInstr &MI) const).
11320 unsigned Opcode = MI.getOpcode();
11321
11322 if (AMDGPU::isGFX12Plus(ST))
11323 return isDOT(MI) || isXDLWMMA(MI);
11324
11325 if (!isMAI(MI) || isDGEMM(Opcode) ||
11326 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11327 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11328 return false;
11329
// Pre-GFX940 MAI subtargets treat every remaining MAI op as XDL.
11330 if (!ST.hasGFX940Insts())
11331 return true;
11332
11333 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11334}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static MachineBasicBlock * loadScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:145
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormater - Interface to format MIR operand based on target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instructions opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is a instruction that moves/copies value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst to fix 16bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:204
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:227
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:203
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:209
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:212
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:219
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:214
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:240
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:215
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:251
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:206
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:226
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:245
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:216
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:241
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:223
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:205
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:231
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:598
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:600
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:597
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:599
@ TI_CONSTDATA_START
Definition AMDGPU.h:596
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr RegState getUndefRegState(bool B)
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:57
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:61
MachineInstr * top() const
Definition SIInstrInfo.h:66
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:85
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.