1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
33#include "llvm/IR/IntrinsicsAMDGPU.h"
34#include "llvm/MC/MCContext.h"
37
38using namespace llvm;
39
40#define DEBUG_TYPE "si-instr-info"
41
42#define GET_INSTRINFO_CTOR_DTOR
43#include "AMDGPUGenInstrInfo.inc"
44
45namespace llvm::AMDGPU {
46#define GET_D16ImageDimIntrinsics_IMPL
47#define GET_ImageDimIntrinsicTable_IMPL
48#define GET_RsrcIntrinsics_IMPL
49#include "AMDGPUGenSearchableTables.inc"
50} // namespace llvm::AMDGPU
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
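// Illustrative usage (not from the original comment): a long-branch test can
// pass something like -amdgpu-s-branch-bits=5 so that branches spanning more
// than roughly 2^4 * 4 bytes already require relaxation, keeping tests small.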
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
67 AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
86 AMDGPU::OpName OpName) {
87 unsigned Opc0 = N0->getMachineOpcode();
88 unsigned Opc1 = N1->getMachineOpcode();
89
90 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
91 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
92
93 if (Op0Idx == -1 && Op1Idx == -1)
94 return true;
95
96
97 if ((Op0Idx == -1 && Op1Idx != -1) ||
98 (Op1Idx == -1 && Op0Idx != -1))
99 return false;
100
101 // getNamedOperandIdx returns the index for the MachineInstr's operands,
102 // which includes the result as the first operand. We are indexing into the
103 // MachineSDNode's operands, so we need to skip the result operand to get
104 // the real index.
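// For example, if the MachineInstr form is "dst = op src0, src1", the named
// index of src0 is 1, but on the MachineSDNode (whose operand list has no
// result operand) the same value is operand 0.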
105 --Op0Idx;
106 --Op1Idx;
107
108 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
109}
110
111static bool canRemat(const MachineInstr &MI) {
112
113 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
114 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
115 SIInstrInfo::isSALU(MI))
116 return true;
117
118 if (SIInstrInfo::isSMRD(MI)) {
119 return !MI.memoperands_empty() &&
120 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
121 return MMO->isLoad() && MMO->isInvariant();
122 });
123 }
124
125 return false;
126}
127
128bool SIInstrInfo::isReallyTriviallyReMaterializable(
129 const MachineInstr &MI) const {
130
131 if (canRemat(MI)) {
132 // Normally a VALU use of exec would block rematerialization, but an
133 // implicit exec read is OK in this case, since all VALU instructions have one.
134 // We really want all of the generic logic for this except for this check.
135
136 // Another potential implicit use is the mode register. The core logic of
137 // the RA will not attempt rematerialization if the mode is set anywhere
138 // in the function; otherwise it is safe, since the mode is not changed.
139
140 // This differs from the generic method, which does not allow
141 // rematerialization if there are virtual register uses. We allow this,
142 // and therefore this method covers SOP instructions as well.
143 if (!MI.hasImplicitDef() &&
144 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
145 !MI.mayRaiseFPException())
146 return true;
147 }
148
149 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
150}
151
152// Returns true if the scalar result of a VALU instruction depends on exec.
153bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
154 // Ignore comparisons which are only used masked with exec.
155 // This allows some hoisting/sinking of VALU comparisons.
156 if (MI.isCompare()) {
157 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
158 if (!Dst)
159 return true;
160
161 Register DstReg = Dst->getReg();
162 if (!DstReg.isVirtual())
163 return true;
164
165 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
166 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
167 switch (Use.getOpcode()) {
168 case AMDGPU::S_AND_SAVEEXEC_B32:
169 case AMDGPU::S_AND_SAVEEXEC_B64:
170 break;
171 case AMDGPU::S_AND_B32:
172 case AMDGPU::S_AND_B64:
173 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
174 return true;
175 break;
176 default:
177 return true;
178 }
179 }
180 return false;
181 }
182
183 switch (MI.getOpcode()) {
184 default:
185 break;
186 case AMDGPU::V_READFIRSTLANE_B32:
187 return true;
188 }
189
190 return false;
191}
192
193bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
194 // Any implicit use of exec by VALU is not a real register read.
195 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
196 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
197}
198
199bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
200 MachineBasicBlock *SuccToSinkTo,
201 MachineCycleInfo *CI) const {
202 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
203 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
204 return true;
205
206 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
207 // Check if sinking of MI would create temporal divergent use.
208 for (auto Op : MI.uses()) {
209 if (Op.isReg() && Op.getReg().isVirtual() &&
210 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
211 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
212
213 // SgprDef defined inside cycle
214 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
215 if (FromCycle == nullptr)
216 continue;
217
218 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
219 // Check if there is a FromCycle that contains SgprDef's basic block but
220 // does not contain SuccToSinkTo and also has divergent exit condition.
221 while (FromCycle && !FromCycle->contains(ToCycle)) {
222 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
223 FromCycle->getExitingBlocks(ExitingBlocks);
224
225 // FromCycle has divergent exit condition.
226 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
227 if (hasDivergentBranch(ExitingBlock))
228 return false;
229 }
230
231 FromCycle = FromCycle->getParentCycle();
232 }
233 }
234 }
235
236 return true;
237}
238
239bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
240 int64_t &Offset0,
241 int64_t &Offset1) const {
242 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
243 return false;
244
245 unsigned Opc0 = Load0->getMachineOpcode();
246 unsigned Opc1 = Load1->getMachineOpcode();
247
248 // Make sure both are actually loads.
249 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
250 return false;
251
252 // A mayLoad instruction without a def is not a load. Likely a prefetch.
253 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
254 return false;
255
256 if (isDS(Opc0) && isDS(Opc1)) {
257
258 // FIXME: Handle this case:
259 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
260 return false;
261
262 // Check base reg.
263 if (Load0->getOperand(0) != Load1->getOperand(0))
264 return false;
265
266 // Skip read2 / write2 variants for simplicity.
267 // TODO: We should report true if the used offsets are adjacent (excluding
268 // st64 versions).
269 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
270 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
271 if (Offset0Idx == -1 || Offset1Idx == -1)
272 return false;
273
274 // XXX - be careful of dataless loads
275 // getNamedOperandIdx returns the index for MachineInstrs. Since they
276 // include the output in the operand list, but SDNodes don't, we need to
277 // subtract the index by one.
278 Offset0Idx -= get(Opc0).NumDefs;
279 Offset1Idx -= get(Opc1).NumDefs;
280 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
281 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
282 return true;
283 }
284
285 if (isSMRD(Opc0) && isSMRD(Opc1)) {
286 // Skip time and cache invalidation instructions.
287 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
288 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
289 return false;
290
291 unsigned NumOps = getNumOperandsNoGlue(Load0);
292 if (NumOps != getNumOperandsNoGlue(Load1))
293 return false;
294
295 // Check base reg.
296 if (Load0->getOperand(0) != Load1->getOperand(0))
297 return false;
298
299 // Match register offsets, if both register and immediate offsets present.
300 assert(NumOps == 4 || NumOps == 5);
301 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
302 return false;
303
304 const ConstantSDNode *Load0Offset =
305 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
306 const ConstantSDNode *Load1Offset =
307 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
308
309 if (!Load0Offset || !Load1Offset)
310 return false;
311
312 Offset0 = Load0Offset->getZExtValue();
313 Offset1 = Load1Offset->getZExtValue();
314 return true;
315 }
316
317 // MUBUF and MTBUF can access the same addresses.
318 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
319
320 // MUBUF and MTBUF have vaddr at different indices.
321 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
322 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
323 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
324 return false;
325
326 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
327 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
328
329 if (OffIdx0 == -1 || OffIdx1 == -1)
330 return false;
331
332 // getNamedOperandIdx returns the index for MachineInstrs. Since they
333 // include the output in the operand list, but SDNodes don't, we need to
334 // subtract the index by one.
335 OffIdx0 -= get(Opc0).NumDefs;
336 OffIdx1 -= get(Opc1).NumDefs;
337
338 SDValue Off0 = Load0->getOperand(OffIdx0);
339 SDValue Off1 = Load1->getOperand(OffIdx1);
340
341 // The offset might be a FrameIndexSDNode.
342 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
343 return false;
344
345 Offset0 = Off0->getAsZExtVal();
346 Offset1 = Off1->getAsZExtVal();
347 return true;
348 }
349
350 return false;
351}
352
353static bool isStride64(unsigned Opc) {
354 switch (Opc) {
355 case AMDGPU::DS_READ2ST64_B32:
356 case AMDGPU::DS_READ2ST64_B64:
357 case AMDGPU::DS_WRITE2ST64_B32:
358 case AMDGPU::DS_WRITE2ST64_B64:
359 return true;
360 default:
361 return false;
362 }
363}
364
365bool SIInstrInfo::getMemOperandsWithOffsetWidth(
366 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
367 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
368 const TargetRegisterInfo *TRI) const {
369 if (!LdSt.mayLoadOrStore())
370 return false;
371
372 unsigned Opc = LdSt.getOpcode();
373 OffsetIsScalable = false;
374 const MachineOperand *BaseOp, *OffsetOp;
375 int DataOpIdx;
376
377 if (isDS(LdSt)) {
378 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
379 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
380 if (OffsetOp) {
381 // Normal, single offset LDS instruction.
382 if (!BaseOp) {
383 // DS_CONSUME/DS_APPEND use M0 for the base address.
384 // TODO: find the implicit use operand for M0 and use that as BaseOp?
385 return false;
386 }
387 BaseOps.push_back(BaseOp);
388 Offset = OffsetOp->getImm();
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
393 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
394 Width = LocationSize::precise(64);
395 else
396 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
397 } else {
398 // The 2 offset instructions use offset0 and offset1 instead. We can treat
399 // these as a load with a single offset if the 2 offsets are consecutive.
400 // We will use this for some partially aligned loads.
401 const MachineOperand *Offset0Op =
402 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
403 const MachineOperand *Offset1Op =
404 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
405
406 unsigned Offset0 = Offset0Op->getImm() & 0xff;
407 unsigned Offset1 = Offset1Op->getImm() & 0xff;
408 if (Offset0 + 1 != Offset1)
409 return false;
410
411 // Each of these offsets is in element-sized units, so we need to convert
412 // them to bytes for the individual reads.
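// Illustrative example: for a ds_read2_b32 with offset0=2 and offset1=3, the
// element size computed below is 4 bytes, so the combined access is treated
// as starting at byte offset 2 * 4 = 8.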
413
414 unsigned EltSize;
415 if (LdSt.mayLoad())
416 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
417 else {
418 assert(LdSt.mayStore());
419 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
421 }
422
423 if (isStride64(Opc))
424 EltSize *= 64;
425
426 BaseOps.push_back(BaseOp);
427 Offset = EltSize * Offset0;
428 // Get appropriate operand(s), and compute width accordingly.
429 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
430 if (DataOpIdx == -1) {
431 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
432 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
433 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
434 Width = LocationSize::precise(
435 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
436 } else {
437 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
438 }
439 }
440 return true;
441 }
442
443 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
444 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
445 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
446 return false;
447 BaseOps.push_back(RSrc);
448 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
449 if (BaseOp && !BaseOp->isFI())
450 BaseOps.push_back(BaseOp);
451 const MachineOperand *OffsetImm =
452 getNamedOperand(LdSt, AMDGPU::OpName::offset);
453 Offset = OffsetImm->getImm();
454 const MachineOperand *SOffset =
455 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
456 if (SOffset) {
457 if (SOffset->isReg())
458 BaseOps.push_back(SOffset);
459 else
460 Offset += SOffset->getImm();
461 }
462 // Get appropriate operand, and compute width accordingly.
463 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
464 if (DataOpIdx == -1)
465 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
466 if (DataOpIdx == -1) // LDS DMA
467 return false;
468 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
469 return true;
470 }
471
472 if (isImage(LdSt)) {
473 auto RsrcOpName =
474 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
475 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
476 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
478 if (VAddr0Idx >= 0) {
479 // GFX10 possible NSA encoding.
480 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
481 BaseOps.push_back(&LdSt.getOperand(I));
482 } else {
483 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
484 }
485 Offset = 0;
486 // Get appropriate operand, and compute width accordingly.
487 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
488 if (DataOpIdx == -1)
489 return false; // no return sampler
490 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
491 return true;
492 }
493
494 if (isSMRD(LdSt)) {
495 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
496 if (!BaseOp) // e.g. S_MEMTIME
497 return false;
498 BaseOps.push_back(BaseOp);
499 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
500 Offset = OffsetOp ? OffsetOp->getImm() : 0;
501 // Get appropriate operand, and compute width accordingly.
502 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
503 if (DataOpIdx == -1)
504 return false;
505 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
506 return true;
507 }
508
509 if (isFLAT(LdSt)) {
510 // Instructions have either vaddr or saddr or both or none.
511 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
512 if (BaseOp)
513 BaseOps.push_back(BaseOp);
514 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
515 if (BaseOp)
516 BaseOps.push_back(BaseOp);
517 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
518 // Get appropriate operand, and compute width accordingly.
519 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
520 if (DataOpIdx == -1)
521 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
522 if (DataOpIdx == -1) // LDS DMA
523 return false;
524 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
525 return true;
526 }
527
528 return false;
529}
530
531static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
532 ArrayRef<const MachineOperand *> BaseOps1,
533 const MachineInstr &MI2,
534 ArrayRef<const MachineOperand *> BaseOps2) {
535 // Only examine the first "base" operand of each instruction, on the
536 // assumption that it represents the real base address of the memory access.
537 // Other operands are typically offsets or indices from this base address.
538 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
539 return true;
540
541 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
542 return false;
543
544 auto *MO1 = *MI1.memoperands_begin();
545 auto *MO2 = *MI2.memoperands_begin();
546 if (MO1->getAddrSpace() != MO2->getAddrSpace())
547 return false;
548
549 const auto *Base1 = MO1->getValue();
550 const auto *Base2 = MO2->getValue();
551 if (!Base1 || !Base2)
552 return false;
553 Base1 = getUnderlyingObject(Base1);
554 Base2 = getUnderlyingObject(Base2);
555
556 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
557 return false;
558
559 return Base1 == Base2;
560}
561
562bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
563 int64_t Offset1, bool OffsetIsScalable1,
564 ArrayRef<const MachineOperand *> BaseOps2,
565 int64_t Offset2, bool OffsetIsScalable2,
566 unsigned ClusterSize,
567 unsigned NumBytes) const {
568 // If the mem ops (to be clustered) do not have the same base ptr, then they
569 // should not be clustered
570 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
571 if (!BaseOps1.empty() && !BaseOps2.empty()) {
572 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
573 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
574 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
575 return false;
576
577 const SIMachineFunctionInfo *MFI =
578 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
579 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
580 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
581 // If only one base op is empty, they do not have the same base ptr
582 return false;
583 }
584
585 // In order to avoid register pressure, on average, the number of DWORDS
586 // loaded together by all clustered mem ops should not exceed
587 // MaxMemoryClusterDWords. This is an empirical value based on certain
588 // observations and performance-related experiments.
589 // The good thing about this heuristic is that it avoids clustering too many
590 // sub-word loads and also avoids clustering wide loads. Below is a
591 // brief summary of how the heuristic behaves for various `LoadSize` values
592 // when MaxMemoryClusterDWords is 8.
593 //
594 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
595 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
596 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
597 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
598 // (5) LoadSize >= 17: do not cluster
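// Worked example (illustrative): clustering four 8-byte loads gives
// NumBytes = 32, LoadSize = 8 and NumDWords = ((8 + 3) / 4) * 4 = 8, which is
// allowed when MaxMemoryClusterDWords is 8; a fifth such load would give
// NumDWords = 10 and stop further clustering.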
599 const unsigned LoadSize = NumBytes / ClusterSize;
600 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
601 return NumDWords <= MaxMemoryClusterDWords;
602}
603
604// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
605// the first 16 loads will be interleaved with the stores, and the next 16 will
606// be clustered as expected. It should really split into two batches of 16 stores.
607//
608// Loads are clustered until this returns false, rather than trying to schedule
609// groups of stores. This also means we have to deal with saying different
610// address space loads should be clustered, and ones which might cause bank
611// conflicts.
612//
613// This might be deprecated so it might not be worth that much effort to fix.
614bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
615 int64_t Offset0, int64_t Offset1,
616 unsigned NumLoads) const {
617 assert(Offset1 > Offset0 &&
618 "Second offset should be larger than first offset!");
619 // If we have fewer than 16 loads in a row and the offsets are within 64
620 // bytes, then schedule together.
621
622 // A cacheline is 64 bytes (for global memory).
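// For example, two loads at offsets 0 and 60 are within 64 bytes and will be
// scheduled together, while loads at offsets 0 and 72 will not.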
623 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
624}
625
626static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
627 MachineBasicBlock::iterator MI,
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 const char *Msg = "illegal VGPR to SGPR copy") {
631 MachineFunction *MF = MBB.getParent();
632
633 LLVMContext &C = MF->getFunction().getContext();
634 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
635
636 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
637 .addReg(SrcReg, getKillRegState(KillSrc));
638}
639
640/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
641/// possible to have a direct copy in these cases on GFX908, so an intermediate
642/// VGPR copy is required.
643static void indirectCopyToAGPR(const SIInstrInfo &TII,
644 MachineBasicBlock &MBB,
645 MachineBasicBlock::iterator MI,
646 const DebugLoc &DL, MCRegister DestReg,
647 MCRegister SrcReg, bool KillSrc,
648 RegScavenger &RS, bool RegsOverlap,
649 Register ImpDefSuperReg = Register(),
650 Register ImpUseSuperReg = Register()) {
651 assert((TII.getSubtarget().hasMAIInsts() &&
652 !TII.getSubtarget().hasGFX90AInsts()) &&
653 "Expected GFX908 subtarget.");
654
655 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
656 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
657 "Source register of the copy should be either an SGPR or an AGPR.");
658
659 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
660 "Destination register of the copy should be an AGPR.");
661
662 const SIRegisterInfo &RI = TII.getRegisterInfo();
663
664 // First try to find defining accvgpr_write to avoid temporary registers.
665 // In the case of copies of overlapping AGPRs, we conservatively do not
666 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
667 // an accvgpr_write used for this same copy due to implicit-defs
668 if (!RegsOverlap) {
669 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
670 --Def;
671
672 if (!Def->modifiesRegister(SrcReg, &RI))
673 continue;
674
675 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
676 Def->getOperand(0).getReg() != SrcReg)
677 break;
678
679 MachineOperand &DefOp = Def->getOperand(1);
680 assert(DefOp.isReg() || DefOp.isImm());
681
682 if (DefOp.isReg()) {
683 bool SafeToPropagate = true;
684 // Check that register source operand is not clobbered before MI.
685 // Immediate operands are always safe to propagate.
686 for (auto I = Def; I != MI && SafeToPropagate; ++I)
687 if (I->modifiesRegister(DefOp.getReg(), &RI))
688 SafeToPropagate = false;
689
690 if (!SafeToPropagate)
691 break;
692
693 for (auto I = Def; I != MI; ++I)
694 I->clearRegisterKills(DefOp.getReg(), &RI);
695 }
696
697 MachineInstrBuilder Builder =
698 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
699 .add(DefOp);
700 if (ImpDefSuperReg)
701 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
702
703 if (ImpUseSuperReg) {
704 Builder.addReg(ImpUseSuperReg,
705 getKillRegState(KillSrc) | RegState::Implicit);
706 }
707
708 return;
709 }
710 }
711
712 RS.enterBasicBlockEnd(MBB);
713 RS.backward(std::next(MI));
714
715 // Ideally we want to have three registers for a long reg_sequence copy
716 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
717 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
718 *MBB.getParent());
719
720 // Registers in the sequence are allocated contiguously so we can just
721 // use register number to pick one of three round-robin temps.
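// For example, a copy to AGPR5 gives RegNo = (AGPR5 - AGPR0) % 3 = 2, so the
// loop below may scavenge up to two extra temporary VGPRs for this copy.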
722 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
723 Register Tmp =
724 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
725 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
726 "VGPR used for an intermediate copy should have been reserved.");
727
728 // Only loop through if there are any free registers left. We don't want to
729 // spill.
730 while (RegNo--) {
731 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
732 /* RestoreAfter */ false, 0,
733 /* AllowSpill */ false);
734 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
735 break;
736 Tmp = Tmp2;
737 RS.setRegUsed(Tmp);
738 }
739
740 // Insert copy to temporary VGPR.
741 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
742 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
743 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
744 } else {
745 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
746 }
747
748 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
749 .addReg(SrcReg, getKillRegState(KillSrc));
750 if (ImpUseSuperReg) {
751 UseBuilder.addReg(ImpUseSuperReg,
752 getKillRegState(KillSrc) | RegState::Implicit);
753 }
754
755 MachineInstrBuilder DefBuilder
756 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
757 .addReg(Tmp, RegState::Kill);
758
759 if (ImpDefSuperReg)
760 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
761}
762
763static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
764 MachineBasicBlock::iterator MI, const DebugLoc &DL,
765 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
766 const TargetRegisterClass *RC, bool Forward) {
767 const SIRegisterInfo &RI = TII.getRegisterInfo();
768 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
769 MachineBasicBlock::iterator I = MI;
770 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
771
772 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
773 int16_t SubIdx = BaseIndices[Idx];
774 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
775 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
776 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
777 unsigned Opcode = AMDGPU::S_MOV_B32;
778
779 // Is SGPR aligned? If so try to combine with next.
780 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
781 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
782 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
783 // Can use SGPR64 copy
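// For example, an aligned pair such as s[4:5] copied from another aligned
// pair is emitted as one S_MOV_B64 instead of two S_MOV_B32 instructions.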
784 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
785 SubIdx = RI.getSubRegFromChannel(Channel, 2);
786 DestSubReg = RI.getSubReg(DestReg, SubIdx);
787 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
788 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
789 Opcode = AMDGPU::S_MOV_B64;
790 Idx++;
791 }
792
793 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
794 .addReg(SrcSubReg)
795 .addReg(SrcReg, RegState::Implicit);
796
797 if (!FirstMI)
798 FirstMI = LastMI;
799
800 if (!Forward)
801 I--;
802 }
803
804 assert(FirstMI && LastMI);
805 if (!Forward)
806 std::swap(FirstMI, LastMI);
807
808 FirstMI->addOperand(
809 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
810
811 if (KillSrc)
812 LastMI->addRegisterKilled(SrcReg, &RI);
813}
814
815void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
816 MachineBasicBlock::iterator MI,
817 const DebugLoc &DL, Register DestReg,
818 Register SrcReg, bool KillSrc, bool RenamableDest,
819 bool RenamableSrc) const {
820 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
821 unsigned Size = RI.getRegSizeInBits(*RC);
822 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
824
825 // The rest of copyPhysReg assumes Src and Dst size are the same size.
826 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
827 // we remove Fix16BitCopies and this code block?
828 if (Fix16BitCopies) {
829 if (((Size == 16) != (SrcSize == 16))) {
830 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
831 assert(ST.useRealTrue16Insts());
832 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
833 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
834 RegToFix = SubReg;
835
836 if (DestReg == SrcReg) {
837 // Identity copy. Insert empty bundle since ExpandPostRA expects an
838 // instruction here.
839 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
840 return;
841 }
842 RC = RI.getPhysRegBaseClass(DestReg);
843 Size = RI.getRegSizeInBits(*RC);
844 SrcRC = RI.getPhysRegBaseClass(SrcReg);
845 SrcSize = RI.getRegSizeInBits(*SrcRC);
846 }
847 }
848
849 if (RC == &AMDGPU::VGPR_32RegClass) {
850 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
851 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
852 AMDGPU::AGPR_32RegClass.contains(SrcReg));
853 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
854 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
855 BuildMI(MBB, MI, DL, get(Opc), DestReg)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 return;
858 }
859
860 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
861 RC == &AMDGPU::SReg_32RegClass) {
862 if (SrcReg == AMDGPU::SCC) {
863 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
864 .addImm(1)
865 .addImm(0);
866 return;
867 }
868
869 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
870 if (DestReg == AMDGPU::VCC_LO) {
871 // FIXME: Hack until VReg_1 removed.
872 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
873 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
874 .addImm(0)
875 .addReg(SrcReg, getKillRegState(KillSrc));
876 return;
877 }
878
879 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
880 return;
881 }
882
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 if (RC == &AMDGPU::SReg_64RegClass) {
889 if (SrcReg == AMDGPU::SCC) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
891 .addImm(1)
892 .addImm(0);
893 return;
894 }
895
896 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
897 if (DestReg == AMDGPU::VCC) {
898 // FIXME: Hack until VReg_1 removed.
899 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
900 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
901 .addImm(0)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
907 return;
908 }
909
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
911 .addReg(SrcReg, getKillRegState(KillSrc));
912 return;
913 }
914
915 if (DestReg == AMDGPU::SCC) {
916 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
917 // but SelectionDAG emits such copies for i1 sources.
918 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
919 // This copy can only be produced by patterns
920 // with explicit SCC, which are known to be enabled
921 // only for subtargets with S_CMP_LG_U64 present.
922 assert(ST.hasScalarCompareEq64());
923 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
924 .addReg(SrcReg, getKillRegState(KillSrc))
925 .addImm(0);
926 } else {
927 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
928 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
929 .addReg(SrcReg, getKillRegState(KillSrc))
930 .addImm(0);
931 }
932
933 return;
934 }
935
936 if (RC == &AMDGPU::AGPR_32RegClass) {
937 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
938 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
939 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
940 .addReg(SrcReg, getKillRegState(KillSrc));
941 return;
942 }
943
944 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
945 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
946 .addReg(SrcReg, getKillRegState(KillSrc));
947 return;
948 }
949
950 // FIXME: Pass should maintain scavenger to avoid scan through the block on
951 // every AGPR spill.
952 RegScavenger RS;
953 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
954 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
955 return;
956 }
957
958 if (Size == 16) {
959 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
960 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
961 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
962
963 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
964 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
965 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
966 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
967 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
968 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
969 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
970 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
971
972 if (IsSGPRDst) {
973 if (!IsSGPRSrc) {
974 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
975 return;
976 }
977
978 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
979 .addReg(NewSrcReg, getKillRegState(KillSrc));
980 return;
981 }
982
983 if (IsAGPRDst || IsAGPRSrc) {
984 if (!DstLow || !SrcLow) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
986 "Cannot use hi16 subreg with an AGPR!");
987 }
988
989 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
990 return;
991 }
992
993 if (ST.useRealTrue16Insts()) {
994 if (IsSGPRSrc) {
995 assert(SrcLow);
996 SrcReg = NewSrcReg;
997 }
998 // Use the smaller instruction encoding if possible.
999 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
1000 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
1001 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1002 .addReg(SrcReg);
1003 } else {
1004 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1005 .addImm(0) // src0_modifiers
1006 .addReg(SrcReg)
1007 .addImm(0); // op_sel
1008 }
1009 return;
1010 }
1011
1012 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1013 if (!DstLow || !SrcLow) {
1014 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1015 "Cannot use hi16 subreg on VI!");
1016 }
1017
1018 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1019 .addReg(NewSrcReg, getKillRegState(KillSrc));
1020 return;
1021 }
1022
1023 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1024 .addImm(0) // src0_modifiers
1025 .addReg(NewSrcReg)
1026 .addImm(0) // clamp
1027 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1028 : AMDGPU::SDWA::SdwaSel::WORD_1)
1029 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1030 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1031 : AMDGPU::SDWA::SdwaSel::WORD_1)
1032 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1033 // First implicit operand is $exec.
1034 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1035 return;
1036 }
1037
1038 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1039 if (ST.hasMovB64()) {
1040 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1041 .addReg(SrcReg, getKillRegState(KillSrc));
1042 return;
1043 }
1044 if (ST.hasPkMovB32()) {
1045 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1046 .addImm(SISrcMods::OP_SEL_1)
1047 .addReg(SrcReg)
1048 .addImm(SISrcMods::OP_SEL_1)
1049 .addReg(SrcReg)
1050 .addImm(0) // op_sel_lo
1051 .addImm(0) // op_sel_hi
1052 .addImm(0) // neg_lo
1053 .addImm(0) // neg_hi
1054 .addImm(0) // clamp
1055 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1056 return;
1057 }
1058 }
1059
1060 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1061 if (RI.isSGPRClass(RC)) {
1062 if (!RI.isSGPRClass(SrcRC)) {
1063 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1064 return;
1065 }
1066 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1067 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1068 Forward);
1069 return;
1070 }
1071
1072 unsigned EltSize = 4;
1073 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1074 if (RI.isAGPRClass(RC)) {
1075 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1076 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1077 else if (RI.hasVGPRs(SrcRC) ||
1078 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1079 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1080 else
1081 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1082 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1083 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1084 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1085 (RI.isProperlyAlignedRC(*RC) &&
1086 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1087 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1088 if (ST.hasMovB64()) {
1089 Opcode = AMDGPU::V_MOV_B64_e32;
1090 EltSize = 8;
1091 } else if (ST.hasPkMovB32()) {
1092 Opcode = AMDGPU::V_PK_MOV_B32;
1093 EltSize = 8;
1094 }
1095 }
1096
1097 // For the cases where we need an intermediate instruction/temporary register
1098 // (destination is an AGPR), we need a scavenger.
1099 //
1100 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1101 // whole block for every handled copy.
1102 std::unique_ptr<RegScavenger> RS;
1103 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1104 RS = std::make_unique<RegScavenger>();
1105
1106 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1107
1108 // If there is an overlap, we can't kill the super-register on the last
1109 // instruction, since it will also kill the components made live by this def.
1110 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1111 const bool CanKillSuperReg = KillSrc && !Overlap;
1112
1113 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1114 unsigned SubIdx;
1115 if (Forward)
1116 SubIdx = SubIndices[Idx];
1117 else
1118 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1119 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1120 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1121 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1122
1123 bool IsFirstSubreg = Idx == 0;
1124 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1125
1126 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1127 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1128 Register ImpUseSuper = SrcReg;
1129 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1130 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1131 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1132 MachineInstrBuilder MIB =
1133 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1134 .addImm(SISrcMods::OP_SEL_1)
1135 .addReg(SrcSubReg)
1136 .addImm(SISrcMods::OP_SEL_1)
1137 .addReg(SrcSubReg)
1138 .addImm(0) // op_sel_lo
1139 .addImm(0) // op_sel_hi
1140 .addImm(0) // neg_lo
1141 .addImm(0) // neg_hi
1142 .addImm(0) // clamp
1143 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 if (IsFirstSubreg)
1145 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1146 } else {
1147 MachineInstrBuilder Builder =
1148 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1149 if (IsFirstSubreg)
1150 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1151
1152 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1153 }
1154 }
1155}
1156
1157int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1158 int NewOpc;
1159
1160 // Try to map original to commuted opcode
1161 NewOpc = AMDGPU::getCommuteRev(Opcode);
1162 if (NewOpc != -1)
1163 // Check if the commuted (REV) opcode exists on the target.
1164 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1165
1166 // Try to map commuted to original opcode
1167 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1168 if (NewOpc != -1)
1169 // Check if the original (non-REV) opcode exists on the target.
1170 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1171
1172 return Opcode;
1173}
1174
1175const TargetRegisterClass *
1176SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1177 return &AMDGPU::VGPR_32RegClass;
1178}
1179
1180void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1181 MachineBasicBlock::iterator I,
1182 const DebugLoc &DL, Register DstReg,
1183 ArrayRef<MachineOperand> Cond,
1184 Register TrueReg,
1185 Register FalseReg) const {
1186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1187 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1189 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1190 "Not a VGPR32 reg");
1191
1192 if (Cond.size() == 1) {
1193 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1194 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1195 .add(Cond[0]);
1196 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1197 .addImm(0)
1198 .addReg(FalseReg)
1199 .addImm(0)
1200 .addReg(TrueReg)
1201 .addReg(SReg);
1202 } else if (Cond.size() == 2) {
1203 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1204 switch (Cond[0].getImm()) {
1205 case SIInstrInfo::SCC_TRUE: {
1206 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1207 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1208 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1209 .addImm(0)
1210 .addReg(FalseReg)
1211 .addImm(0)
1212 .addReg(TrueReg)
1213 .addReg(SReg);
1214 break;
1215 }
1216 case SIInstrInfo::SCC_FALSE: {
1217 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1218 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1219 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1220 .addImm(0)
1221 .addReg(FalseReg)
1222 .addImm(0)
1223 .addReg(TrueReg)
1224 .addReg(SReg);
1225 break;
1226 }
1227 case SIInstrInfo::VCCNZ: {
1228 MachineOperand RegOp = Cond[1];
1229 RegOp.setImplicit(false);
1230 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1231 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1232 .add(RegOp);
1233 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1234 .addImm(0)
1235 .addReg(FalseReg)
1236 .addImm(0)
1237 .addReg(TrueReg)
1238 .addReg(SReg);
1239 break;
1240 }
1241 case SIInstrInfo::VCCZ: {
1242 MachineOperand RegOp = Cond[1];
1243 RegOp.setImplicit(false);
1244 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1245 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1246 .add(RegOp);
1247 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1248 .addImm(0)
1249 .addReg(TrueReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addReg(SReg);
1253 break;
1254 }
1255 case SIInstrInfo::EXECNZ: {
1256 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1257 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1258 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1259 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1260 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1261 .addImm(0)
1262 .addReg(FalseReg)
1263 .addImm(0)
1264 .addReg(TrueReg)
1265 .addReg(SReg);
1266 break;
1267 }
1268 case SIInstrInfo::EXECZ: {
1269 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1270 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1271 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1272 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1273 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1274 .addImm(0)
1275 .addReg(FalseReg)
1276 .addImm(0)
1277 .addReg(TrueReg)
1278 .addReg(SReg);
1279 llvm_unreachable("Unhandled branch predicate EXECZ");
1280 break;
1281 }
1282 default:
1283 llvm_unreachable("invalid branch predicate");
1284 }
1285 } else {
1286 llvm_unreachable("Can only handle Cond size 1 or 2");
1287 }
1288}
1289
1290Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1291 MachineBasicBlock::iterator I,
1292 const DebugLoc &DL,
1293 Register SrcReg, int Value) const {
1294 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1295 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1296 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1297 .addImm(Value)
1298 .addReg(SrcReg);
1299
1300 return Reg;
1301}
1302
1303Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1304 MachineBasicBlock::iterator I,
1305 const DebugLoc &DL,
1306 Register SrcReg, int Value) const {
1307 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1308 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1309 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1310 .addImm(Value)
1311 .addReg(SrcReg);
1312
1313 return Reg;
1314}
1315
1316bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1317 const Register Reg,
1318 int64_t &ImmVal) const {
1319 switch (MI.getOpcode()) {
1320 case AMDGPU::V_MOV_B32_e32:
1321 case AMDGPU::S_MOV_B32:
1322 case AMDGPU::S_MOVK_I32:
1323 case AMDGPU::S_MOV_B64:
1324 case AMDGPU::V_MOV_B64_e32:
1325 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1326 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1327 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1329 case AMDGPU::V_MOV_B64_PSEUDO:
1330 case AMDGPU::V_MOV_B16_t16_e32: {
1331 const MachineOperand &Src0 = MI.getOperand(1);
1332 if (Src0.isImm()) {
1333 ImmVal = Src0.getImm();
1334 return MI.getOperand(0).getReg() == Reg;
1335 }
1336
1337 return false;
1338 }
1339 case AMDGPU::V_MOV_B16_t16_e64: {
1340 const MachineOperand &Src0 = MI.getOperand(2);
1341 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1342 ImmVal = Src0.getImm();
1343 return MI.getOperand(0).getReg() == Reg;
1344 }
1345
1346 return false;
1347 }
1348 case AMDGPU::S_BREV_B32:
1349 case AMDGPU::V_BFREV_B32_e32:
1350 case AMDGPU::V_BFREV_B32_e64: {
1351 const MachineOperand &Src0 = MI.getOperand(1);
1352 if (Src0.isImm()) {
1353 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1354 return MI.getOperand(0).getReg() == Reg;
1355 }
1356
1357 return false;
1358 }
1359 case AMDGPU::S_NOT_B32:
1360 case AMDGPU::V_NOT_B32_e32:
1361 case AMDGPU::V_NOT_B32_e64: {
1362 const MachineOperand &Src0 = MI.getOperand(1);
1363 if (Src0.isImm()) {
1364 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1365 return MI.getOperand(0).getReg() == Reg;
1366 }
1367
1368 return false;
1369 }
1370 default:
1371 return false;
1372 }
1373}
1374
1375std::optional<int64_t>
1377 if (Op.isImm())
1378 return Op.getImm();
1379
1380 if (!Op.isReg() || !Op.getReg().isVirtual())
1381 return std::nullopt;
1382 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1383 const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1384 if (Def && Def->isMoveImmediate()) {
1385 const MachineOperand &ImmSrc = Def->getOperand(1);
1386 if (ImmSrc.isImm())
1387 return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1388 }
1389
1390 return std::nullopt;
1391}
1392
1393unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1394
1395 if (RI.isAGPRClass(DstRC))
1396 return AMDGPU::COPY;
1397 if (RI.getRegSizeInBits(*DstRC) == 16) {
1398 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1399 // before RA.
1400 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1401 }
1402 if (RI.getRegSizeInBits(*DstRC) == 32)
1403 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1404 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1405 return AMDGPU::S_MOV_B64;
1406 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1407 return AMDGPU::V_MOV_B64_PSEUDO;
1408 return AMDGPU::COPY;
1409}
1410
1411const MCInstrDesc &
1412SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1413 bool IsIndirectSrc) const {
1414 if (IsIndirectSrc) {
1415 if (VecSize <= 32) // 4 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1417 if (VecSize <= 64) // 8 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1419 if (VecSize <= 96) // 12 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1421 if (VecSize <= 128) // 16 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1423 if (VecSize <= 160) // 20 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1425 if (VecSize <= 192) // 24 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1427 if (VecSize <= 224) // 28 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1429 if (VecSize <= 256) // 32 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1431 if (VecSize <= 288) // 36 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1433 if (VecSize <= 320) // 40 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1435 if (VecSize <= 352) // 44 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1437 if (VecSize <= 384) // 48 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1439 if (VecSize <= 512) // 64 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1441 if (VecSize <= 1024) // 128 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1443
1444 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1445 }
1446
1447 if (VecSize <= 32) // 4 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1449 if (VecSize <= 64) // 8 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1451 if (VecSize <= 96) // 12 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1453 if (VecSize <= 128) // 16 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1455 if (VecSize <= 160) // 20 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1457 if (VecSize <= 192) // 24 bytes
1458 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1459 if (VecSize <= 224) // 28 bytes
1460 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1461 if (VecSize <= 256) // 32 bytes
1462 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1463 if (VecSize <= 288) // 36 bytes
1464 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1465 if (VecSize <= 320) // 40 bytes
1466 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1467 if (VecSize <= 352) // 44 bytes
1468 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1469 if (VecSize <= 384) // 48 bytes
1470 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1471 if (VecSize <= 512) // 64 bytes
1472 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1473 if (VecSize <= 1024) // 128 bytes
1474 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1475
1476 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1477}
1478
1479static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1480 if (VecSize <= 32) // 4 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1482 if (VecSize <= 64) // 8 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1484 if (VecSize <= 96) // 12 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1486 if (VecSize <= 128) // 16 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1488 if (VecSize <= 160) // 20 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1490 if (VecSize <= 192) // 24 bytes
1491 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1492 if (VecSize <= 224) // 28 bytes
1493 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1494 if (VecSize <= 256) // 32 bytes
1495 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1496 if (VecSize <= 288) // 36 bytes
1497 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1498 if (VecSize <= 320) // 40 bytes
1499 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1500 if (VecSize <= 352) // 44 bytes
1501 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1502 if (VecSize <= 384) // 48 bytes
1503 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1504 if (VecSize <= 512) // 64 bytes
1505 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1506 if (VecSize <= 1024) // 128 bytes
1507 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1508
1509 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1510}
1511
1512static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1513 if (VecSize <= 32) // 4 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1515 if (VecSize <= 64) // 8 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1517 if (VecSize <= 96) // 12 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1519 if (VecSize <= 128) // 16 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1521 if (VecSize <= 160) // 20 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1523 if (VecSize <= 192) // 24 bytes
1524 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1525 if (VecSize <= 224) // 28 bytes
1526 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1527 if (VecSize <= 256) // 32 bytes
1528 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1529 if (VecSize <= 288) // 36 bytes
1530 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1531 if (VecSize <= 320) // 40 bytes
1532 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1533 if (VecSize <= 352) // 44 bytes
1534 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1535 if (VecSize <= 384) // 48 bytes
1536 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1537 if (VecSize <= 512) // 64 bytes
1538 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1539 if (VecSize <= 1024) // 128 bytes
1540 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1541
1542 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1543}
1544
1545static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1546 if (VecSize <= 64) // 8 bytes
1547 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1548 if (VecSize <= 128) // 16 bytes
1549 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1550 if (VecSize <= 256) // 32 bytes
1551 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1552 if (VecSize <= 512) // 64 bytes
1553 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1554 if (VecSize <= 1024) // 128 bytes
1555 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1556
1557 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1558}
1559
1560const MCInstrDesc &
1561SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1562 bool IsSGPR) const {
1563 if (IsSGPR) {
1564 switch (EltSize) {
1565 case 32:
1566 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1567 case 64:
1568 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1569 default:
1570 llvm_unreachable("invalid reg indexing elt size");
1571 }
1572 }
1573
1574 assert(EltSize == 32 && "invalid reg indexing elt size");
1575 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1576}
1577
1578static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1579 switch (Size) {
1580 case 4:
1581 return AMDGPU::SI_SPILL_S32_SAVE;
1582 case 8:
1583 return AMDGPU::SI_SPILL_S64_SAVE;
1584 case 12:
1585 return AMDGPU::SI_SPILL_S96_SAVE;
1586 case 16:
1587 return AMDGPU::SI_SPILL_S128_SAVE;
1588 case 20:
1589 return AMDGPU::SI_SPILL_S160_SAVE;
1590 case 24:
1591 return AMDGPU::SI_SPILL_S192_SAVE;
1592 case 28:
1593 return AMDGPU::SI_SPILL_S224_SAVE;
1594 case 32:
1595 return AMDGPU::SI_SPILL_S256_SAVE;
1596 case 36:
1597 return AMDGPU::SI_SPILL_S288_SAVE;
1598 case 40:
1599 return AMDGPU::SI_SPILL_S320_SAVE;
1600 case 44:
1601 return AMDGPU::SI_SPILL_S352_SAVE;
1602 case 48:
1603 return AMDGPU::SI_SPILL_S384_SAVE;
1604 case 64:
1605 return AMDGPU::SI_SPILL_S512_SAVE;
1606 case 128:
1607 return AMDGPU::SI_SPILL_S1024_SAVE;
1608 default:
1609 llvm_unreachable("unknown register size");
1610 }
1611}
1612
1613static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1614 switch (Size) {
1615 case 2:
1616 return AMDGPU::SI_SPILL_V16_SAVE;
1617 case 4:
1618 return AMDGPU::SI_SPILL_V32_SAVE;
1619 case 8:
1620 return AMDGPU::SI_SPILL_V64_SAVE;
1621 case 12:
1622 return AMDGPU::SI_SPILL_V96_SAVE;
1623 case 16:
1624 return AMDGPU::SI_SPILL_V128_SAVE;
1625 case 20:
1626 return AMDGPU::SI_SPILL_V160_SAVE;
1627 case 24:
1628 return AMDGPU::SI_SPILL_V192_SAVE;
1629 case 28:
1630 return AMDGPU::SI_SPILL_V224_SAVE;
1631 case 32:
1632 return AMDGPU::SI_SPILL_V256_SAVE;
1633 case 36:
1634 return AMDGPU::SI_SPILL_V288_SAVE;
1635 case 40:
1636 return AMDGPU::SI_SPILL_V320_SAVE;
1637 case 44:
1638 return AMDGPU::SI_SPILL_V352_SAVE;
1639 case 48:
1640 return AMDGPU::SI_SPILL_V384_SAVE;
1641 case 64:
1642 return AMDGPU::SI_SPILL_V512_SAVE;
1643 case 128:
1644 return AMDGPU::SI_SPILL_V1024_SAVE;
1645 default:
1646 llvm_unreachable("unknown register size");
1647 }
1648}
1649
1650static unsigned getAVSpillSaveOpcode(unsigned Size) {
1651 switch (Size) {
1652 case 4:
1653 return AMDGPU::SI_SPILL_AV32_SAVE;
1654 case 8:
1655 return AMDGPU::SI_SPILL_AV64_SAVE;
1656 case 12:
1657 return AMDGPU::SI_SPILL_AV96_SAVE;
1658 case 16:
1659 return AMDGPU::SI_SPILL_AV128_SAVE;
1660 case 20:
1661 return AMDGPU::SI_SPILL_AV160_SAVE;
1662 case 24:
1663 return AMDGPU::SI_SPILL_AV192_SAVE;
1664 case 28:
1665 return AMDGPU::SI_SPILL_AV224_SAVE;
1666 case 32:
1667 return AMDGPU::SI_SPILL_AV256_SAVE;
1668 case 36:
1669 return AMDGPU::SI_SPILL_AV288_SAVE;
1670 case 40:
1671 return AMDGPU::SI_SPILL_AV320_SAVE;
1672 case 44:
1673 return AMDGPU::SI_SPILL_AV352_SAVE;
1674 case 48:
1675 return AMDGPU::SI_SPILL_AV384_SAVE;
1676 case 64:
1677 return AMDGPU::SI_SPILL_AV512_SAVE;
1678 case 128:
1679 return AMDGPU::SI_SPILL_AV1024_SAVE;
1680 default:
1681 llvm_unreachable("unknown register size");
1682 }
1683}
1684
1685static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1686 bool IsVectorSuperClass) {
1687 // Currently, only 32-bit WWM register spills are needed.
1688 if (Size != 4)
1689 llvm_unreachable("unknown wwm register spill size");
1690
1691 if (IsVectorSuperClass)
1692 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1693
1694 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1695}
1696
1697unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1698 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1699 const SIMachineFunctionInfo &MFI) const {
1700 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1701
1702 // Choose the right opcode if spilling a WWM register.
1703 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1704 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1705
1706 // TODO: Check if AGPRs are available
1707 if (ST.hasMAIInsts())
1708 return getAVSpillSaveOpcode(Size);
1709
1710 return getVGPRSpillSaveOpcode(Size);
1711}
1712
1713void SIInstrInfo::storeRegToStackSlot(
1714 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1715 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1716 MachineInstr::MIFlag Flags) const {
1717 MachineFunction *MF = MBB.getParent();
 1718 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 1719 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1720 const DebugLoc &DL = MBB.findDebugLoc(MI);
1721
1722 MachinePointerInfo PtrInfo
1723 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
 1724 MachineMemOperand *MMO = MF->getMachineMemOperand(
 1725 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1726 FrameInfo.getObjectAlign(FrameIndex));
1727 unsigned SpillSize = RI.getSpillSize(*RC);
1728
 1729 MachineRegisterInfo &MRI = MF->getRegInfo();
 1730 if (RI.isSGPRClass(RC)) {
1731 MFI->setHasSpilledSGPRs();
1732 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1733 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1734 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1735
1736 // We are only allowed to create one new instruction when spilling
 1737 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1738 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1739
 1740 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1741 // to make sure we are using the correct register class.
1742 if (SrcReg.isVirtual() && SpillSize == 4) {
1743 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1744 }
1745
1746 BuildMI(MBB, MI, DL, OpDesc)
1747 .addReg(SrcReg, getKillRegState(isKill)) // data
1748 .addFrameIndex(FrameIndex) // addr
1749 .addMemOperand(MMO)
 1750 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
 1751
1752 if (RI.spillSGPRToVGPR())
1753 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1754 return;
1755 }
1756
1757 unsigned Opcode =
1758 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1759 MFI->setHasSpilledVGPRs();
1760
1761 BuildMI(MBB, MI, DL, get(Opcode))
1762 .addReg(SrcReg, getKillRegState(isKill)) // data
1763 .addFrameIndex(FrameIndex) // addr
1764 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1765 .addImm(0) // offset
1766 .addMemOperand(MMO);
1767}
1768
1769static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1770 switch (Size) {
1771 case 4:
1772 return AMDGPU::SI_SPILL_S32_RESTORE;
1773 case 8:
1774 return AMDGPU::SI_SPILL_S64_RESTORE;
1775 case 12:
1776 return AMDGPU::SI_SPILL_S96_RESTORE;
1777 case 16:
1778 return AMDGPU::SI_SPILL_S128_RESTORE;
1779 case 20:
1780 return AMDGPU::SI_SPILL_S160_RESTORE;
1781 case 24:
1782 return AMDGPU::SI_SPILL_S192_RESTORE;
1783 case 28:
1784 return AMDGPU::SI_SPILL_S224_RESTORE;
1785 case 32:
1786 return AMDGPU::SI_SPILL_S256_RESTORE;
1787 case 36:
1788 return AMDGPU::SI_SPILL_S288_RESTORE;
1789 case 40:
1790 return AMDGPU::SI_SPILL_S320_RESTORE;
1791 case 44:
1792 return AMDGPU::SI_SPILL_S352_RESTORE;
1793 case 48:
1794 return AMDGPU::SI_SPILL_S384_RESTORE;
1795 case 64:
1796 return AMDGPU::SI_SPILL_S512_RESTORE;
1797 case 128:
1798 return AMDGPU::SI_SPILL_S1024_RESTORE;
1799 default:
1800 llvm_unreachable("unknown register size");
1801 }
1802}
1803
1804static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1805 switch (Size) {
1806 case 2:
1807 return AMDGPU::SI_SPILL_V16_RESTORE;
1808 case 4:
1809 return AMDGPU::SI_SPILL_V32_RESTORE;
1810 case 8:
1811 return AMDGPU::SI_SPILL_V64_RESTORE;
1812 case 12:
1813 return AMDGPU::SI_SPILL_V96_RESTORE;
1814 case 16:
1815 return AMDGPU::SI_SPILL_V128_RESTORE;
1816 case 20:
1817 return AMDGPU::SI_SPILL_V160_RESTORE;
1818 case 24:
1819 return AMDGPU::SI_SPILL_V192_RESTORE;
1820 case 28:
1821 return AMDGPU::SI_SPILL_V224_RESTORE;
1822 case 32:
1823 return AMDGPU::SI_SPILL_V256_RESTORE;
1824 case 36:
1825 return AMDGPU::SI_SPILL_V288_RESTORE;
1826 case 40:
1827 return AMDGPU::SI_SPILL_V320_RESTORE;
1828 case 44:
1829 return AMDGPU::SI_SPILL_V352_RESTORE;
1830 case 48:
1831 return AMDGPU::SI_SPILL_V384_RESTORE;
1832 case 64:
1833 return AMDGPU::SI_SPILL_V512_RESTORE;
1834 case 128:
1835 return AMDGPU::SI_SPILL_V1024_RESTORE;
1836 default:
1837 llvm_unreachable("unknown register size");
1838 }
1839}
1840
1841static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1842 switch (Size) {
1843 case 4:
1844 return AMDGPU::SI_SPILL_AV32_RESTORE;
1845 case 8:
1846 return AMDGPU::SI_SPILL_AV64_RESTORE;
1847 case 12:
1848 return AMDGPU::SI_SPILL_AV96_RESTORE;
1849 case 16:
1850 return AMDGPU::SI_SPILL_AV128_RESTORE;
1851 case 20:
1852 return AMDGPU::SI_SPILL_AV160_RESTORE;
1853 case 24:
1854 return AMDGPU::SI_SPILL_AV192_RESTORE;
1855 case 28:
1856 return AMDGPU::SI_SPILL_AV224_RESTORE;
1857 case 32:
1858 return AMDGPU::SI_SPILL_AV256_RESTORE;
1859 case 36:
1860 return AMDGPU::SI_SPILL_AV288_RESTORE;
1861 case 40:
1862 return AMDGPU::SI_SPILL_AV320_RESTORE;
1863 case 44:
1864 return AMDGPU::SI_SPILL_AV352_RESTORE;
1865 case 48:
1866 return AMDGPU::SI_SPILL_AV384_RESTORE;
1867 case 64:
1868 return AMDGPU::SI_SPILL_AV512_RESTORE;
1869 case 128:
1870 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1871 default:
1872 llvm_unreachable("unknown register size");
1873 }
1874}
1875
1876static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1877 bool IsVectorSuperClass) {
 1878 // Currently, only 32-bit WWM register spills are needed.
1879 if (Size != 4)
1880 llvm_unreachable("unknown wwm register spill size");
1881
1882 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1883 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1884
1885 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1886}
1887
1889 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1890 const SIMachineFunctionInfo &MFI) const {
1891 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1892
1893 // Choose the right opcode if restoring a WWM register.
 1894 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
 1895 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1896
1897 // TODO: Check if AGPRs are available
1898 if (ST.hasMAIInsts())
 1899 return getAVSpillRestoreOpcode(Size);
 1900
 1901 assert(!RI.isAGPRClass(RC));
 1902 return getVGPRSpillRestoreOpcode(Size);
 1903}
1904
1907 Register DestReg, int FrameIndex,
1908 const TargetRegisterClass *RC,
1909 Register VReg, unsigned SubReg,
1910 MachineInstr::MIFlag Flags) const {
1911 MachineFunction *MF = MBB.getParent();
 1912 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 1913 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1914 const DebugLoc &DL = MBB.findDebugLoc(MI);
1915 unsigned SpillSize = RI.getSpillSize(*RC);
1916
1917 MachinePointerInfo PtrInfo
1918 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1919
 1920 MachineMemOperand *MMO = MF->getMachineMemOperand(
 1921 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1922 FrameInfo.getObjectAlign(FrameIndex));
1923
1924 if (RI.isSGPRClass(RC)) {
1925 MFI->setHasSpilledSGPRs();
1926 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1927 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1928 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1929
1930 // FIXME: Maybe this should not include a memoperand because it will be
1931 // lowered to non-memory instructions.
1932 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1933 if (DestReg.isVirtual() && SpillSize == 4) {
 1934 MachineRegisterInfo &MRI = MF->getRegInfo();
 1935 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1936 }
1937
1938 if (RI.spillSGPRToVGPR())
1939 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1940 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1941 .addFrameIndex(FrameIndex) // addr
1942 .addMemOperand(MMO)
 1943 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
 1944
1945 return;
1946 }
1947
1948 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1949 SpillSize, *MFI);
1950 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1951 .addFrameIndex(FrameIndex) // vaddr
1952 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1953 .addImm(0) // offset
1954 .addMemOperand(MMO);
1955}
1956
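// S_NOP's immediate encodes "this many + 1" no-ops, so one S_NOP covers at
// most MaxSNopCount of them and larger requests are split. E.g. assuming a
// 16-nop limit, a request for 20 emits S_NOP 15 followed by S_NOP 3.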
1961
1964 unsigned Quantity) const {
1965 DebugLoc DL = MBB.findDebugLoc(MI);
1966 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1967 while (Quantity > 0) {
1968 unsigned Arg = std::min(Quantity, MaxSNopCount);
1969 Quantity -= Arg;
1970 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1971 }
1972}
1973
1975 auto *MF = MBB.getParent();
1976 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1977
1978 assert(Info->isEntryFunction());
1979
1980 if (MBB.succ_empty()) {
1981 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1982 if (HasNoTerminator) {
1983 if (Info->returnsVoid()) {
1984 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1985 } else {
1986 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1987 }
1988 }
1989 }
1990}
1991
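// Expands to, roughly: an s_trap, a read of the doorbell ID via
// s_sendmsg_rtn_b32, masking in the queue-wave-abort bit and sending it back
// with s_sendmsg (saving and restoring m0 around it via ttmp2), and finally a
// branch into a block that halts forever by looping on s_sethalt 5. The exact
// message immediates are those used by the builds below.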
1995 const DebugLoc &DL) const {
1996 MachineFunction *MF = MBB.getParent();
1997 constexpr unsigned DoorbellIDMask = 0x3ff;
1998 constexpr unsigned ECQueueWaveAbort = 0x400;
1999
2000 MachineBasicBlock *TrapBB = &MBB;
2001 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2002
2003 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2004 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2005 TrapBB = MF->CreateMachineBasicBlock();
2006 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2007 MF->push_back(TrapBB);
2008 MBB.addSuccessor(TrapBB);
2009 }
 2010 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
 2011 // this will be a nop.
2012 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2013 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2014 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2015 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2016 DoorbellReg)
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2019 .addUse(AMDGPU::M0);
2020 Register DoorbellRegMasked =
2021 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2022 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2023 .addUse(DoorbellReg)
2024 .addImm(DoorbellIDMask);
2025 Register SetWaveAbortBit =
2026 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2027 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2028 .addUse(DoorbellRegMasked)
2029 .addImm(ECQueueWaveAbort);
2030 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2031 .addUse(SetWaveAbortBit);
2032 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2034 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2035 .addUse(AMDGPU::TTMP2);
2036 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2037 TrapBB->addSuccessor(HaltLoopBB);
2038
2039 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2040 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2041 .addMBB(HaltLoopBB);
2042 MF->push_back(HaltLoopBB);
2043 HaltLoopBB->addSuccessor(HaltLoopBB);
2044
2045 return MBB.getNextNode();
2046}
2047
2049 switch (MI.getOpcode()) {
2050 default:
2051 if (MI.isMetaInstruction())
2052 return 0;
2053 return 1; // FIXME: Do wait states equal cycles?
2054
2055 case AMDGPU::S_NOP:
2056 return MI.getOperand(0).getImm() + 1;
2057 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
 2058 // hazard, even if one exists, won't really be visible. Should we handle it?
2059 }
2060}
2061
2063 MachineBasicBlock &MBB = *MI.getParent();
2064 DebugLoc DL = MBB.findDebugLoc(MI);
2066 switch (MI.getOpcode()) {
2067 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2068 case AMDGPU::S_MOV_B64_term:
2069 // This is only a terminator to get the correct spill code placement during
2070 // register allocation.
2071 MI.setDesc(get(AMDGPU::S_MOV_B64));
2072 break;
2073
2074 case AMDGPU::S_MOV_B32_term:
2075 // This is only a terminator to get the correct spill code placement during
2076 // register allocation.
2077 MI.setDesc(get(AMDGPU::S_MOV_B32));
2078 break;
2079
2080 case AMDGPU::S_XOR_B64_term:
2081 // This is only a terminator to get the correct spill code placement during
2082 // register allocation.
2083 MI.setDesc(get(AMDGPU::S_XOR_B64));
2084 break;
2085
2086 case AMDGPU::S_XOR_B32_term:
2087 // This is only a terminator to get the correct spill code placement during
2088 // register allocation.
2089 MI.setDesc(get(AMDGPU::S_XOR_B32));
2090 break;
2091 case AMDGPU::S_OR_B64_term:
2092 // This is only a terminator to get the correct spill code placement during
2093 // register allocation.
2094 MI.setDesc(get(AMDGPU::S_OR_B64));
2095 break;
2096 case AMDGPU::S_OR_B32_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_OR_B32));
2100 break;
2101
2102 case AMDGPU::S_ANDN2_B64_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2106 break;
2107
2108 case AMDGPU::S_ANDN2_B32_term:
2109 // This is only a terminator to get the correct spill code placement during
2110 // register allocation.
2111 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2112 break;
2113
2114 case AMDGPU::S_AND_B64_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_AND_B64));
2118 break;
2119
2120 case AMDGPU::S_AND_B32_term:
2121 // This is only a terminator to get the correct spill code placement during
2122 // register allocation.
2123 MI.setDesc(get(AMDGPU::S_AND_B32));
2124 break;
2125
2126 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2127 // This is only a terminator to get the correct spill code placement during
2128 // register allocation.
2129 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2130 break;
2131
2132 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2133 // This is only a terminator to get the correct spill code placement during
2134 // register allocation.
2135 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2136 break;
2137
2138 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2139 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2140 break;
2141
2142 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2143 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2144 break;
2145 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2146 Register Dst = MI.getOperand(0).getReg();
2147 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2148 MI.setDesc(
2149 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2150 break;
2151 }
2152 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2153 Register Dst = MI.getOperand(0).getReg();
2154 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2155 int64_t Imm = MI.getOperand(1).getImm();
2156
2157 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2158 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2159 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2162 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2163 .addImm(SignExtend64<32>(Imm >> 32))
2165 MI.eraseFromParent();
2166 break;
2167 }
2168
2169 [[fallthrough]];
2170 }
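// V_MOV_B64_PSEUDO (also reached from AV_MOV_B64_IMM_PSEUDO for non-AGPR
// destinations) expands to a single V_MOV_B64 when the subtarget has one and
// can encode the operand, to one V_PK_MOV_B32 when both 32-bit halves are the
// same inline constant (or for non-AGPR register sources with pk-mov), and
// otherwise to two V_MOV_B32_e32 writes of sub0 and sub1.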
2171 case AMDGPU::V_MOV_B64_PSEUDO: {
2172 Register Dst = MI.getOperand(0).getReg();
2173 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2174 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2175
2176 const MachineOperand &SrcOp = MI.getOperand(1);
2177 // FIXME: Will this work for 64-bit floating point immediates?
2178 assert(!SrcOp.isFPImm());
2179 if (ST.hasMovB64()) {
2180 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2181 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2182 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2183 break;
2184 }
2185 if (SrcOp.isImm()) {
2186 APInt Imm(64, SrcOp.getImm());
2187 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2188 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2189 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2190 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2192 .addImm(Lo.getSExtValue())
2194 .addImm(Lo.getSExtValue())
2195 .addImm(0) // op_sel_lo
2196 .addImm(0) // op_sel_hi
2197 .addImm(0) // neg_lo
2198 .addImm(0) // neg_hi
2199 .addImm(0); // clamp
2200 } else {
2201 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2202 .addImm(Lo.getSExtValue())
2204 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2205 .addImm(Hi.getSExtValue())
2207 }
2208 } else {
2209 assert(SrcOp.isReg());
2210 if (ST.hasPkMovB32() &&
2211 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2212 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2213 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2214 .addReg(SrcOp.getReg())
2216 .addReg(SrcOp.getReg())
2217 .addImm(0) // op_sel_lo
2218 .addImm(0) // op_sel_hi
2219 .addImm(0) // neg_lo
2220 .addImm(0) // neg_hi
2221 .addImm(0); // clamp
2222 } else {
2223 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2224 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2226 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2227 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2229 }
2230 }
2231 MI.eraseFromParent();
2232 break;
2233 }
2234 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2236 break;
2237 }
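// S_MOV_B64_IMM_PSEUDO stays a single S_MOV_B64 when 64-bit literals are
// supported, or when the immediate fits in 32 bits or is an inline constant;
// otherwise it is split into two S_MOV_B32 writes of sub0 and sub1.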
2238 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2239 const MachineOperand &SrcOp = MI.getOperand(1);
2240 assert(!SrcOp.isFPImm());
2241
2242 if (ST.has64BitLiterals()) {
2243 MI.setDesc(get(AMDGPU::S_MOV_B64));
2244 break;
2245 }
2246
2247 APInt Imm(64, SrcOp.getImm());
2248 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2249 MI.setDesc(get(AMDGPU::S_MOV_B64));
2250 break;
2251 }
2252
2253 Register Dst = MI.getOperand(0).getReg();
2254 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2255 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2256
2257 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2258 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2259 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2260 .addImm(Lo.getSExtValue())
2262 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2263 .addImm(Hi.getSExtValue())
2265 MI.eraseFromParent();
2266 break;
2267 }
2268 case AMDGPU::V_SET_INACTIVE_B32: {
2269 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2270 Register DstReg = MI.getOperand(0).getReg();
2271 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2272 .add(MI.getOperand(3))
2273 .add(MI.getOperand(4))
2274 .add(MI.getOperand(1))
2275 .add(MI.getOperand(2))
2276 .add(MI.getOperand(5));
2277 MI.eraseFromParent();
2278 break;
2279 }
2280 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2281 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2282 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2283 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2284 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2285 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2286 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2287 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2288 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2289 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2290 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2291 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2292 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2294 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2295 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2296 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2297 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2298 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2299 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2300 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2301 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2302 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2303 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2304 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2313 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2314
2315 unsigned Opc;
2316 if (RI.hasVGPRs(EltRC)) {
2317 Opc = AMDGPU::V_MOVRELD_B32_e32;
2318 } else {
2319 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2320 : AMDGPU::S_MOVRELD_B32;
2321 }
2322
2323 const MCInstrDesc &OpDesc = get(Opc);
2324 Register VecReg = MI.getOperand(0).getReg();
2325 bool IsUndef = MI.getOperand(1).isUndef();
2326 unsigned SubReg = MI.getOperand(3).getImm();
2327 assert(VecReg == MI.getOperand(1).getReg());
2328
2330 BuildMI(MBB, MI, DL, OpDesc)
2331 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2332 .add(MI.getOperand(2))
2334 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2335
2336 const int ImpDefIdx =
2337 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2338 const int ImpUseIdx = ImpDefIdx + 1;
2339 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2340 MI.eraseFromParent();
2341 break;
2342 }
2343 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2344 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2345 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2346 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2347 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2348 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2349 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2350 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2351 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2352 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2353 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2357 assert(ST.useVGPRIndexMode());
2358 Register VecReg = MI.getOperand(0).getReg();
2359 bool IsUndef = MI.getOperand(1).isUndef();
2360 MachineOperand &Idx = MI.getOperand(3);
2361 Register SubReg = MI.getOperand(4).getImm();
2362
2363 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2364 .add(Idx)
2366 SetOn->getOperand(3).setIsUndef();
2367
2368 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2370 BuildMI(MBB, MI, DL, OpDesc)
2371 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2372 .add(MI.getOperand(2))
2374 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2375
2376 const int ImpDefIdx =
2377 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2378 const int ImpUseIdx = ImpDefIdx + 1;
2379 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2380
2381 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2382
2383 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2384
2385 MI.eraseFromParent();
2386 break;
2387 }
2388 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2389 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2390 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2391 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2392 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2393 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2394 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2395 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2396 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2397 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2402 assert(ST.useVGPRIndexMode());
2403 Register Dst = MI.getOperand(0).getReg();
2404 Register VecReg = MI.getOperand(1).getReg();
2405 bool IsUndef = MI.getOperand(1).isUndef();
2406 Register Idx = MI.getOperand(2).getReg();
2407 Register SubReg = MI.getOperand(3).getImm();
2408
2409 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2410 .addReg(Idx)
2412 SetOn->getOperand(3).setIsUndef();
2413
2414 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2415 .addDef(Dst)
2416 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2417 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2418
2419 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2420
2421 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2422
2423 MI.eraseFromParent();
2424 break;
2425 }
2426 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2427 MachineFunction &MF = *MBB.getParent();
2428 Register Reg = MI.getOperand(0).getReg();
2429 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2430 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2431 MachineOperand OpLo = MI.getOperand(1);
2432 MachineOperand OpHi = MI.getOperand(2);
2433
2434 // Create a bundle so these instructions won't be re-ordered by the
2435 // post-RA scheduler.
2436 MIBundleBuilder Bundler(MBB, MI);
2437 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2438
2439 // What we want here is an offset from the value returned by s_getpc (which
2440 // is the address of the s_add_u32 instruction) to the global variable, but
2441 // since the encoding of $symbol starts 4 bytes after the start of the
2442 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2443 // small. This requires us to add 4 to the global variable offset in order
2444 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2445 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2446 // instruction.
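// For example, without the sext workaround the bundle is
//   s_getpc_b64 reg; s_add_u32 reglo, reglo, sym+4; s_addc_u32 reghi, reghi, sym+12
// and with the workaround the extra s_sext_i32_i16 pushes both literals four
// bytes further, which is what the Adjust below accounts for.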
2447
2448 int64_t Adjust = 0;
2449 if (ST.hasGetPCZeroExtension()) {
2450 // Fix up hardware that does not sign-extend the 48-bit PC value by
2451 // inserting: s_sext_i32_i16 reghi, reghi
2452 Bundler.append(
2453 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2454 Adjust += 4;
2455 }
2456
2457 if (OpLo.isGlobal())
2458 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2459 Bundler.append(
2460 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2461
2462 if (OpHi.isGlobal())
2463 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2464 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2465 .addReg(RegHi)
2466 .add(OpHi));
2467
2468 finalizeBundle(MBB, Bundler.begin());
2469
2470 MI.eraseFromParent();
2471 break;
2472 }
2473 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2474 MachineFunction &MF = *MBB.getParent();
2475 Register Reg = MI.getOperand(0).getReg();
2476 MachineOperand Op = MI.getOperand(1);
2477
2478 // Create a bundle so these instructions won't be re-ordered by the
2479 // post-RA scheduler.
2480 MIBundleBuilder Bundler(MBB, MI);
2481 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2482 if (Op.isGlobal())
2483 Op.setOffset(Op.getOffset() + 4);
2484 Bundler.append(
2485 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2486
2487 finalizeBundle(MBB, Bundler.begin());
2488
2489 MI.eraseFromParent();
2490 break;
2491 }
2492 case AMDGPU::ENTER_STRICT_WWM: {
2493 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2494 // Whole Wave Mode is entered.
2495 MI.setDesc(get(LMC.OrSaveExecOpc));
2496 break;
2497 }
2498 case AMDGPU::ENTER_STRICT_WQM: {
2499 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2500 // STRICT_WQM is entered.
2501 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2502 .addReg(LMC.ExecReg);
2503 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2504
2505 MI.eraseFromParent();
2506 break;
2507 }
2508 case AMDGPU::EXIT_STRICT_WWM:
2509 case AMDGPU::EXIT_STRICT_WQM: {
2510 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
 2511 // WWM/STRICT_WQM is exited.
2512 MI.setDesc(get(LMC.MovOpc));
2513 break;
2514 }
2515 case AMDGPU::SI_RETURN: {
2516 const MachineFunction *MF = MBB.getParent();
2517 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2518 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2519 // Hiding the return address use with SI_RETURN may lead to extra kills in
2520 // the function and missing live-ins. We are fine in practice because callee
2521 // saved register handling ensures the register value is restored before
2522 // RET, but we need the undef flag here to appease the MachineVerifier
2523 // liveness checks.
2525 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2526 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2527
2528 MIB.copyImplicitOps(MI);
2529 MI.eraseFromParent();
2530 break;
2531 }
2532
2533 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2534 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2535 MI.setDesc(get(AMDGPU::S_MUL_U64));
2536 break;
2537
2538 case AMDGPU::S_GETPC_B64_pseudo:
2539 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2540 if (ST.hasGetPCZeroExtension()) {
2541 Register Dst = MI.getOperand(0).getReg();
2542 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2543 // Fix up hardware that does not sign-extend the 48-bit PC value by
2544 // inserting: s_sext_i32_i16 dsthi, dsthi
2545 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2546 DstHi)
2547 .addReg(DstHi);
2548 }
2549 break;
2550
2551 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2552 assert(ST.hasBF16PackedInsts());
2553 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2554 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2555 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2556 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2557 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2558 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2559 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2560 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2561 break;
2562 }
2563 }
2564
2565 return true;
2566}
2567
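// Rematerialization hook. For wide scalar loads whose single user reads only a
// 128-bit or 256-bit subregister, the clone is shrunk to the matching
// S_LOAD_DWORDX4/X8 and its immediate offset advanced by the subreg's byte
// offset, e.g. a user of sub4_sub5_sub6_sub7 of an S_LOAD_DWORDX16 result is
// rematerialized as an S_LOAD_DWORDX4 with offset += 16.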
2570 unsigned SubIdx,
2571 const MachineInstr &Orig) const {
2572
 2573 // Try shrinking the instruction to rematerialize only the part needed for the
 2574 // current context.
2575 // TODO: Handle more cases.
2576 unsigned Opcode = Orig.getOpcode();
2577 switch (Opcode) {
2578 case AMDGPU::S_LOAD_DWORDX16_IMM:
2579 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2580 if (SubIdx != 0)
2581 break;
2582
2583 if (I == MBB.end())
2584 break;
2585
2586 if (I->isBundled())
2587 break;
2588
2589 // Look for a single use of the register that is also a subreg.
2590 Register RegToFind = Orig.getOperand(0).getReg();
2591 MachineOperand *UseMO = nullptr;
2592 for (auto &CandMO : I->operands()) {
2593 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2594 continue;
2595 if (UseMO) {
2596 UseMO = nullptr;
2597 break;
2598 }
2599 UseMO = &CandMO;
2600 }
2601 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2602 break;
2603
2604 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2605 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2606
2607 MachineFunction *MF = MBB.getParent();
2609 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2610
2611 unsigned NewOpcode = -1;
2612 if (SubregSize == 256)
2613 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2614 else if (SubregSize == 128)
2615 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2616 else
2617 break;
2618
2619 const MCInstrDesc &TID = get(NewOpcode);
2620 const TargetRegisterClass *NewRC =
2621 RI.getAllocatableClass(getRegClass(TID, 0));
2622 MRI.setRegClass(DestReg, NewRC);
2623
2624 UseMO->setReg(DestReg);
2625 UseMO->setSubReg(AMDGPU::NoSubRegister);
2626
2627 // Use a smaller load with the desired size, possibly with updated offset.
2628 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2629 MI->setDesc(TID);
2630 MI->getOperand(0).setReg(DestReg);
2631 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2632 if (Offset) {
2633 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2634 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2635 OffsetMO->setImm(FinalOffset);
2636 }
2638 for (const MachineMemOperand *MemOp : Orig.memoperands())
2639 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2640 SubregSize / 8));
2641 MI->setMemRefs(*MF, NewMMOs);
2642
2643 MBB.insert(I, MI);
2644 return;
2645 }
2646
2647 default:
2648 break;
2649 }
2650
2651 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig);
2652}
2653
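// Split a 64-bit DPP move into two V_MOV_B32_dpp on sub0/sub1 (glued back with
// a REG_SEQUENCE for virtual destinations), unless the subtarget has 64-bit
// moves, a DPP-capable 64-bit ALU, and a dpp_ctrl value supported in that
// form, in which case the pseudo becomes a single V_MOV_B64_dpp.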
2654std::pair<MachineInstr*, MachineInstr*>
2656 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2657
2658 if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2660 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2661 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2662 return std::pair(&MI, nullptr);
2663 }
2664
2665 MachineBasicBlock &MBB = *MI.getParent();
2666 DebugLoc DL = MBB.findDebugLoc(MI);
2667 MachineFunction *MF = MBB.getParent();
2669 Register Dst = MI.getOperand(0).getReg();
2670 unsigned Part = 0;
2671 MachineInstr *Split[2];
2672
2673 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2674 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2675 if (Dst.isPhysical()) {
2676 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2677 } else {
2678 assert(MRI.isSSA());
2679 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2680 MovDPP.addDef(Tmp);
2681 }
2682
2683 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2684 const MachineOperand &SrcOp = MI.getOperand(I);
2685 assert(!SrcOp.isFPImm());
2686 if (SrcOp.isImm()) {
2687 APInt Imm(64, SrcOp.getImm());
2688 Imm.ashrInPlace(Part * 32);
2689 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2690 } else {
2691 assert(SrcOp.isReg());
2692 Register Src = SrcOp.getReg();
2693 if (Src.isPhysical())
2694 MovDPP.addReg(RI.getSubReg(Src, Sub));
2695 else
2696 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2697 }
2698 }
2699
2700 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2701 MovDPP.addImm(MO.getImm());
2702
2703 Split[Part] = MovDPP;
2704 ++Part;
2705 }
2706
2707 if (Dst.isVirtual())
2708 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2709 .addReg(Split[0]->getOperand(0).getReg())
2710 .addImm(AMDGPU::sub0)
2711 .addReg(Split[1]->getOperand(0).getReg())
2712 .addImm(AMDGPU::sub1);
2713
2714 MI.eraseFromParent();
2715 return std::pair(Split[0], Split[1]);
2716}
2717
2718std::optional<DestSourcePair>
2720 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2721 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2722
2723 return std::nullopt;
2724}
2725
2727 AMDGPU::OpName Src0OpName,
2728 MachineOperand &Src1,
2729 AMDGPU::OpName Src1OpName) const {
2730 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2731 if (!Src0Mods)
2732 return false;
2733
2734 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2735 assert(Src1Mods &&
2736 "All commutable instructions have both src0 and src1 modifiers");
2737
2738 int Src0ModsVal = Src0Mods->getImm();
2739 int Src1ModsVal = Src1Mods->getImm();
2740
2741 Src1Mods->setImm(Src0ModsVal);
2742 Src0Mods->setImm(Src1ModsVal);
2743 return true;
2744}
2745
2747 MachineOperand &RegOp,
2748 MachineOperand &NonRegOp) {
2749 Register Reg = RegOp.getReg();
2750 unsigned SubReg = RegOp.getSubReg();
2751 bool IsKill = RegOp.isKill();
2752 bool IsDead = RegOp.isDead();
2753 bool IsUndef = RegOp.isUndef();
2754 bool IsDebug = RegOp.isDebug();
2755
2756 if (NonRegOp.isImm())
2757 RegOp.ChangeToImmediate(NonRegOp.getImm());
2758 else if (NonRegOp.isFI())
2759 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2760 else if (NonRegOp.isGlobal()) {
2761 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2762 NonRegOp.getTargetFlags());
2763 } else
2764 return nullptr;
2765
2766 // Make sure we don't reinterpret a subreg index in the target flags.
2767 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2768
2769 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2770 NonRegOp.setSubReg(SubReg);
2771
2772 return &MI;
2773}
2774
2776 MachineOperand &NonRegOp1,
2777 MachineOperand &NonRegOp2) {
2778 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2779 int64_t NonRegVal = NonRegOp1.getImm();
2780
2781 NonRegOp1.setImm(NonRegOp2.getImm());
2782 NonRegOp2.setImm(NonRegVal);
2783 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2784 NonRegOp2.setTargetFlags(TargetFlags);
2785 return &MI;
2786}
2787
2788bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2789 unsigned OpIdx1) const {
2790 const MCInstrDesc &InstDesc = MI.getDesc();
2791 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2792 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2793
2794 unsigned Opc = MI.getOpcode();
2795 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2796
2797 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2798 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2799
 2800 // Make sure the swap doesn't breach the constant bus or literal limits.
 2801 // It may move a literal to a position other than src0, which is not allowed
 2802 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
 2803 // FIXME: After gfx9, a literal can be in a place other than Src0.
2804 if (isVALU(MI)) {
2805 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2806 !isInlineConstant(MO0, OpInfo1))
2807 return false;
2808 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2809 !isInlineConstant(MO1, OpInfo0))
2810 return false;
2811 }
2812
2813 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2814 if (OpInfo1.RegClass == -1)
2815 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2816 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2817 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2818 }
2819 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2820 if (OpInfo0.RegClass == -1)
2821 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2822 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2823 isLegalRegOperand(MI, OpIdx0, MO1);
2824 }
2825
 2826 // No need to check 64-bit literals, since swapping does not bring new
 2827 // 64-bit literals into the current instruction to fold to 32-bit.
2828
2829 return isImmOperandLegal(MI, OpIdx1, MO0);
2830}
2831
2833 unsigned Src0Idx,
2834 unsigned Src1Idx) const {
2835 assert(!NewMI && "this should never be used");
2836
2837 unsigned Opc = MI.getOpcode();
2838 int CommutedOpcode = commuteOpcode(Opc);
2839 if (CommutedOpcode == -1)
2840 return nullptr;
2841
2842 if (Src0Idx > Src1Idx)
2843 std::swap(Src0Idx, Src1Idx);
2844
2845 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2846 static_cast<int>(Src0Idx) &&
2847 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2848 static_cast<int>(Src1Idx) &&
2849 "inconsistency with findCommutedOpIndices");
2850
2851 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2852 return nullptr;
2853
2854 MachineInstr *CommutedMI = nullptr;
2855 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2856 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2857 if (Src0.isReg() && Src1.isReg()) {
2858 // Be sure to copy the source modifiers to the right place.
2859 CommutedMI =
2860 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2861 } else if (Src0.isReg() && !Src1.isReg()) {
2862 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2863 } else if (!Src0.isReg() && Src1.isReg()) {
2864 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2865 } else if (Src0.isImm() && Src1.isImm()) {
2866 CommutedMI = swapImmOperands(MI, Src0, Src1);
2867 } else {
 2868 // FIXME: Found two non-register operands to commute. This does happen.
2869 return nullptr;
2870 }
2871
2872 if (CommutedMI) {
2873 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2874 Src1, AMDGPU::OpName::src1_modifiers);
2875
2876 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2877 AMDGPU::OpName::src1_sel);
2878
2879 CommutedMI->setDesc(get(CommutedOpcode));
2880 }
2881
2882 return CommutedMI;
2883}
2884
2885// This needs to be implemented because the source modifiers may be inserted
2886// between the true commutable operands, and the base
2887// TargetInstrInfo::commuteInstruction uses it.
2889 unsigned &SrcOpIdx0,
2890 unsigned &SrcOpIdx1) const {
2891 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2892}
2893
2895 unsigned &SrcOpIdx0,
2896 unsigned &SrcOpIdx1) const {
2897 if (!Desc.isCommutable())
2898 return false;
2899
2900 unsigned Opc = Desc.getOpcode();
2901 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2902 if (Src0Idx == -1)
2903 return false;
2904
2905 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2906 if (Src1Idx == -1)
2907 return false;
2908
2909 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2910}
2911
2913 int64_t BrOffset) const {
2914 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2915 // because its dest block is unanalyzable.
2916 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2917
2918 // Convert to dwords.
2919 BrOffset /= 4;
2920
2921 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2922 // from the next instruction.
2923 BrOffset -= 1;
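// E.g. with 16 usable offset bits this allows a signed range of roughly
// +/-2^15 dwords, i.e. about +/-128 KiB between the branch and its target.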
2924
2925 return isIntN(BranchOffsetBits, BrOffset);
2926}
2927
2930 return MI.getOperand(0).getMBB();
2931}
2932
2934 for (const MachineInstr &MI : MBB->terminators()) {
2935 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2936 MI.getOpcode() == AMDGPU::SI_LOOP)
2937 return true;
2938 }
2939 return false;
2940}
2941
2943 MachineBasicBlock &DestBB,
2944 MachineBasicBlock &RestoreBB,
2945 const DebugLoc &DL, int64_t BrOffset,
2946 RegScavenger *RS) const {
2947 assert(MBB.empty() &&
2948 "new block should be inserted for expanding unconditional branch");
2949 assert(MBB.pred_size() == 1);
2950 assert(RestoreBB.empty() &&
2951 "restore block should be inserted for restoring clobbered registers");
2952
2953 MachineFunction *MF = MBB.getParent();
2956 auto I = MBB.end();
2957 auto &MCCtx = MF->getContext();
2958
2959 if (ST.useAddPC64Inst()) {
2960 MCSymbol *Offset =
2961 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
2962 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
2964 MCSymbol *PostAddPCLabel =
2965 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
2966 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
2967 auto *OffsetExpr = MCBinaryExpr::createSub(
2968 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2969 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
2970 Offset->setVariableValue(OffsetExpr);
2971 return;
2972 }
2973
2974 assert(RS && "RegScavenger required for long branching");
2975
2976 // FIXME: Virtual register workaround for RegScavenger not working with empty
2977 // blocks.
2978 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2979
 2980 // Note: as this is used after the hazard recognizer, we need to apply some
 2981 // hazard workarounds directly.
2982 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2983 ST.hasVALUReadSGPRHazard();
2984 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2985 if (FlushSGPRWrites)
2986 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2988 };
2989
 2990 // We need to compute the offset relative to the instruction immediately after
 2991 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2992 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2993 ApplyHazardWorkarounds();
2994
2995 MCSymbol *PostGetPCLabel =
2996 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2997 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2998
2999 MCSymbol *OffsetLo =
3000 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
3001 MCSymbol *OffsetHi =
3002 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
3003 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
3004 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
3005 .addReg(PCReg, {}, AMDGPU::sub0)
3006 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
3007 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
3008 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
3009 .addReg(PCReg, {}, AMDGPU::sub1)
3010 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
3011 ApplyHazardWorkarounds();
3012
3013 // Insert the indirect branch after the other terminator.
3014 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3015 .addReg(PCReg);
3016
3017 // If a spill is needed for the pc register pair, we need to insert a spill
3018 // restore block right before the destination block, and insert a short branch
3019 // into the old destination block's fallthrough predecessor.
3020 // e.g.:
3021 //
3022 // s_cbranch_scc0 skip_long_branch:
3023 //
3024 // long_branch_bb:
3025 // spill s[8:9]
3026 // s_getpc_b64 s[8:9]
3027 // s_add_u32 s8, s8, restore_bb
3028 // s_addc_u32 s9, s9, 0
3029 // s_setpc_b64 s[8:9]
3030 //
3031 // skip_long_branch:
3032 // foo;
3033 //
3034 // .....
3035 //
3036 // dest_bb_fallthrough_predecessor:
3037 // bar;
3038 // s_branch dest_bb
3039 //
3040 // restore_bb:
3041 // restore s[8:9]
3042 // fallthrough dest_bb
 3043 //
3044 // dest_bb:
3045 // buzz;
3046
3047 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3048 Register Scav;
3049
 3050 // If we've previously reserved a register for long branches,
 3051 // avoid running the scavenger and just use that register.
3052 if (LongBranchReservedReg) {
3053 RS->enterBasicBlock(MBB);
3054 Scav = LongBranchReservedReg;
3055 } else {
3056 RS->enterBasicBlockEnd(MBB);
3057 Scav = RS->scavengeRegisterBackwards(
3058 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3059 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3060 }
3061 if (Scav) {
3062 RS->setRegUsed(Scav);
3063 MRI.replaceRegWith(PCReg, Scav);
3064 MRI.clearVirtRegs();
3065 } else {
 3066 // As spilling an SGPR needs a VGPR, we reuse the slot of the temporary VGPR
 3067 // for the SGPR spill.
3068 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3069 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3070 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3071 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3072 MRI.clearVirtRegs();
3073 }
3074
3075 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
 3076 // Now the distance can be defined.
3078 MCSymbolRefExpr::create(DestLabel, MCCtx),
3079 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3080 // Add offset assignments.
3081 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3082 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3083 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3084 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3085}
3086
3087unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3088 switch (Cond) {
3089 case SIInstrInfo::SCC_TRUE:
3090 return AMDGPU::S_CBRANCH_SCC1;
3091 case SIInstrInfo::SCC_FALSE:
3092 return AMDGPU::S_CBRANCH_SCC0;
3093 case SIInstrInfo::VCCNZ:
3094 return AMDGPU::S_CBRANCH_VCCNZ;
3095 case SIInstrInfo::VCCZ:
3096 return AMDGPU::S_CBRANCH_VCCZ;
3097 case SIInstrInfo::EXECNZ:
3098 return AMDGPU::S_CBRANCH_EXECNZ;
3099 case SIInstrInfo::EXECZ:
3100 return AMDGPU::S_CBRANCH_EXECZ;
3101 default:
3102 llvm_unreachable("invalid branch predicate");
3103 }
3104}
3105
3106SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3107 switch (Opcode) {
3108 case AMDGPU::S_CBRANCH_SCC0:
3109 return SCC_FALSE;
3110 case AMDGPU::S_CBRANCH_SCC1:
3111 return SCC_TRUE;
3112 case AMDGPU::S_CBRANCH_VCCNZ:
3113 return VCCNZ;
3114 case AMDGPU::S_CBRANCH_VCCZ:
3115 return VCCZ;
3116 case AMDGPU::S_CBRANCH_EXECNZ:
3117 return EXECNZ;
3118 case AMDGPU::S_CBRANCH_EXECZ:
3119 return EXECZ;
3120 default:
3121 return INVALID_BR;
3122 }
3123}
3124
3128 MachineBasicBlock *&FBB,
3130 bool AllowModify) const {
3131 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3132 // Unconditional Branch
3133 TBB = I->getOperand(0).getMBB();
3134 return false;
3135 }
3136
3137 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3138 if (Pred == INVALID_BR)
3139 return true;
3140
3141 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3142 Cond.push_back(MachineOperand::CreateImm(Pred));
3143 Cond.push_back(I->getOperand(1)); // Save the branch register.
3144
3145 ++I;
3146
3147 if (I == MBB.end()) {
3148 // Conditional branch followed by fall-through.
3149 TBB = CondBB;
3150 return false;
3151 }
3152
3153 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3154 TBB = CondBB;
3155 FBB = I->getOperand(0).getMBB();
3156 return false;
3157 }
3158
3159 return true;
3160}
3161
3163 MachineBasicBlock *&FBB,
3165 bool AllowModify) const {
3166 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3167 auto E = MBB.end();
3168 if (I == E)
3169 return false;
3170
 3171 // Skip over the instructions that are artificial terminators for special
 3172 // exec management.
3173 while (I != E && !I->isBranch() && !I->isReturn()) {
3174 switch (I->getOpcode()) {
3175 case AMDGPU::S_MOV_B64_term:
3176 case AMDGPU::S_XOR_B64_term:
3177 case AMDGPU::S_OR_B64_term:
3178 case AMDGPU::S_ANDN2_B64_term:
3179 case AMDGPU::S_AND_B64_term:
3180 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3181 case AMDGPU::S_MOV_B32_term:
3182 case AMDGPU::S_XOR_B32_term:
3183 case AMDGPU::S_OR_B32_term:
3184 case AMDGPU::S_ANDN2_B32_term:
3185 case AMDGPU::S_AND_B32_term:
3186 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3187 break;
3188 case AMDGPU::SI_IF:
3189 case AMDGPU::SI_ELSE:
3190 case AMDGPU::SI_KILL_I1_TERMINATOR:
3191 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3192 // FIXME: It's messy that these need to be considered here at all.
3193 return true;
3194 default:
3195 llvm_unreachable("unexpected non-branch terminator inst");
3196 }
3197
3198 ++I;
3199 }
3200
3201 if (I == E)
3202 return false;
3203
3204 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3205}
3206
3208 int *BytesRemoved) const {
3209 unsigned Count = 0;
3210 unsigned RemovedSize = 0;
3211 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3212 // Skip over artificial terminators when removing instructions.
3213 if (MI.isBranch() || MI.isReturn()) {
3214 RemovedSize += getInstSizeInBytes(MI);
3215 MI.eraseFromParent();
3216 ++Count;
3217 }
3218 }
3219
3220 if (BytesRemoved)
3221 *BytesRemoved = RemovedSize;
3222
3223 return Count;
3224}
3225
3226// Copy the flags onto the implicit condition register operand.
3228 const MachineOperand &OrigCond) {
3229 CondReg.setIsUndef(OrigCond.isUndef());
3230 CondReg.setIsKill(OrigCond.isKill());
3231}
3232
3235 MachineBasicBlock *FBB,
3237 const DebugLoc &DL,
3238 int *BytesAdded) const {
3239 if (!FBB && Cond.empty()) {
3240 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3241 .addMBB(TBB);
3242 if (BytesAdded)
3243 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3244 return 1;
3245 }
3246
3247 assert(TBB && Cond[0].isImm());
3248
3249 unsigned Opcode
3250 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3251
3252 if (!FBB) {
3253 MachineInstr *CondBr =
3254 BuildMI(&MBB, DL, get(Opcode))
3255 .addMBB(TBB);
3256
3257 // Copy the flags onto the implicit condition register operand.
3258 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3259 fixImplicitOperands(*CondBr);
3260
3261 if (BytesAdded)
3262 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3263 return 1;
3264 }
3265
3266 assert(TBB && FBB);
3267
3268 MachineInstr *CondBr =
3269 BuildMI(&MBB, DL, get(Opcode))
3270 .addMBB(TBB);
3271 fixImplicitOperands(*CondBr);
3272 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3273 .addMBB(FBB);
3274
3275 MachineOperand &CondReg = CondBr->getOperand(1);
3276 CondReg.setIsUndef(Cond[1].isUndef());
3277 CondReg.setIsKill(Cond[1].isKill());
3278
3279 if (BytesAdded)
3280 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3281
3282 return 2;
3283}
3284
3287 if (Cond.size() != 2) {
3288 return true;
3289 }
3290
3291 if (Cond[0].isImm()) {
3292 Cond[0].setImm(-Cond[0].getImm());
3293 return false;
3294 }
3295
3296 return true;
3297}
3298
3301 Register DstReg, Register TrueReg,
3302 Register FalseReg, int &CondCycles,
3303 int &TrueCycles, int &FalseCycles) const {
3304 switch (Cond[0].getImm()) {
3305 case VCCNZ:
3306 case VCCZ: {
3307 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3308 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3309 if (MRI.getRegClass(FalseReg) != RC)
3310 return false;
3311
3312 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3313 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3314
3315 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3316 return RI.hasVGPRs(RC) && NumInsts <= 6;
3317 }
3318 case SCC_TRUE:
3319 case SCC_FALSE: {
3320 // FIXME: We could insert for VGPRs if we could replace the original compare
3321 // with a vector one.
3322 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3323 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3324 if (MRI.getRegClass(FalseReg) != RC)
3325 return false;
3326
3327 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3328
 3329 // Sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
3330 if (NumInsts % 2 == 0)
3331 NumInsts /= 2;
3332
3333 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3334 return RI.isSGPRClass(RC);
3335 }
3336 default:
3337 return false;
3338 }
3339}
3340
3344 Register TrueReg, Register FalseReg) const {
3345 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3346 if (Pred == VCCZ || Pred == SCC_FALSE) {
3347 Pred = static_cast<BranchPredicate>(-Pred);
3348 std::swap(TrueReg, FalseReg);
3349 }
3350
3351 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3352 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3353 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3354
3355 if (DstSize == 32) {
3357 if (Pred == SCC_TRUE) {
3358 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3359 .addReg(TrueReg)
3360 .addReg(FalseReg);
3361 } else {
3362 // Instruction's operands are backwards from what is expected.
3363 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3364 .addReg(FalseReg)
3365 .addReg(TrueReg);
3366 }
3367
3368 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3369 return;
3370 }
3371
3372 if (DstSize == 64 && Pred == SCC_TRUE) {
3374 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3375 .addReg(TrueReg)
3376 .addReg(FalseReg);
3377
3378 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3379 return;
3380 }
3381
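// Wider selects are decomposed into per-element conditional moves stitched
// together with a REG_SEQUENCE: V_CNDMASK_B32 elements for VGPR results, or
// 32/64-bit S_CSELECT elements for SCC-based SGPR results. E.g. a 128-bit
// VGPR select becomes four V_CNDMASK_B32 plus one REG_SEQUENCE.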
3382 static const int16_t Sub0_15[] = {
3383 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3384 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3385 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3386 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3387 };
3388
3389 static const int16_t Sub0_15_64[] = {
3390 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3391 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3392 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3393 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3394 };
3395
3396 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3397 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3398 const int16_t *SubIndices = Sub0_15;
3399 int NElts = DstSize / 32;
3400
3401 // 64-bit select is only available for SALU.
3402 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3403 if (Pred == SCC_TRUE) {
3404 if (NElts % 2) {
3405 SelOp = AMDGPU::S_CSELECT_B32;
3406 EltRC = &AMDGPU::SGPR_32RegClass;
3407 } else {
3408 SelOp = AMDGPU::S_CSELECT_B64;
3409 EltRC = &AMDGPU::SGPR_64RegClass;
3410 SubIndices = Sub0_15_64;
3411 NElts /= 2;
3412 }
3413 }
3414
3416 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3417
3418 I = MIB->getIterator();
3419
3421 for (int Idx = 0; Idx != NElts; ++Idx) {
3422 Register DstElt = MRI.createVirtualRegister(EltRC);
3423 Regs.push_back(DstElt);
3424
3425 unsigned SubIdx = SubIndices[Idx];
3426
3428 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3429 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3430 .addReg(FalseReg, {}, SubIdx)
3431 .addReg(TrueReg, {}, SubIdx);
3432 } else {
3433 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3434 .addReg(TrueReg, {}, SubIdx)
3435 .addReg(FalseReg, {}, SubIdx);
3436 }
3437
3438 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3440
3441 MIB.addReg(DstElt)
3442 .addImm(SubIdx);
3443 }
3444}
3445
3447 switch (MI.getOpcode()) {
3448 case AMDGPU::V_MOV_B16_t16_e32:
3449 case AMDGPU::V_MOV_B16_t16_e64:
3450 case AMDGPU::V_MOV_B32_e32:
3451 case AMDGPU::V_MOV_B32_e64:
3452 case AMDGPU::V_MOV_B64_PSEUDO:
3453 case AMDGPU::V_MOV_B64_e32:
3454 case AMDGPU::V_MOV_B64_e64:
3455 case AMDGPU::S_MOV_B32:
3456 case AMDGPU::S_MOV_B64:
3457 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3458 case AMDGPU::COPY:
3459 case AMDGPU::WWM_COPY:
3460 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3461 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3462 case AMDGPU::V_ACCVGPR_MOV_B32:
3463 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3464 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3465 return true;
3466 default:
3467 return false;
3468 }
3469}
3470
3472 switch (MI.getOpcode()) {
3473 case AMDGPU::V_MOV_B16_t16_e32:
3474 case AMDGPU::V_MOV_B16_t16_e64:
3475 return 2;
3476 case AMDGPU::V_MOV_B32_e32:
3477 case AMDGPU::V_MOV_B32_e64:
3478 case AMDGPU::V_MOV_B64_PSEUDO:
3479 case AMDGPU::V_MOV_B64_e32:
3480 case AMDGPU::V_MOV_B64_e64:
3481 case AMDGPU::S_MOV_B32:
3482 case AMDGPU::S_MOV_B64:
3483 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3484 case AMDGPU::COPY:
3485 case AMDGPU::WWM_COPY:
3486 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3487 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3488 case AMDGPU::V_ACCVGPR_MOV_B32:
3489 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3490 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3491 return 1;
3492 default:
3493 llvm_unreachable("MI is not a foldable copy");
3494 }
3495}
3496
3497static constexpr AMDGPU::OpName ModifierOpNames[] = {
3498 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3499 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3500 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3501
3502 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3503 unsigned Opc = MI.getOpcode();
3504 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3505 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3506 if (Idx >= 0)
3507 MI.removeOperand(Idx);
3508 }
3509}
3510
3512 const MCInstrDesc &NewDesc) const {
3513 MI.setDesc(NewDesc);
3514
3515 // Remove any leftover implicit operands from mutating the instruction. e.g.
3516 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3517 // anymore.
3518 const MCInstrDesc &Desc = MI.getDesc();
3519 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3520 Desc.implicit_defs().size();
3521
3522 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3523 MI.removeOperand(I);
3524}
3525
3526std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3527 unsigned SubRegIndex) {
3528 switch (SubRegIndex) {
3529 case AMDGPU::NoSubRegister:
3530 return Imm;
3531 case AMDGPU::sub0:
3532 return SignExtend64<32>(Imm);
3533 case AMDGPU::sub1:
3534 return SignExtend64<32>(Imm >> 32);
3535 case AMDGPU::lo16:
3536 return SignExtend64<16>(Imm);
3537 case AMDGPU::hi16:
3538 return SignExtend64<16>(Imm >> 16);
3539 case AMDGPU::sub1_lo16:
3540 return SignExtend64<16>(Imm >> 32);
3541 case AMDGPU::sub1_hi16:
3542 return SignExtend64<16>(Imm >> 48);
3543 default:
3544 return std::nullopt;
3545 }
3546
3547 llvm_unreachable("covered subregister switch");
3548}
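// Example: for Imm = 0xFFFFFFFF00000041, sub0 extracts 65, sub1 extracts -1
// (the high word, sign-extended), and lo16 extracts 65.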
3549
3550static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3551 switch (Opc) {
3552 case AMDGPU::V_MAC_F16_e32:
3553 case AMDGPU::V_MAC_F16_e64:
3554 case AMDGPU::V_MAD_F16_e64:
3555 return AMDGPU::V_MADAK_F16;
3556 case AMDGPU::V_MAC_F32_e32:
3557 case AMDGPU::V_MAC_F32_e64:
3558 case AMDGPU::V_MAD_F32_e64:
3559 return AMDGPU::V_MADAK_F32;
3560 case AMDGPU::V_FMAC_F32_e32:
3561 case AMDGPU::V_FMAC_F32_e64:
3562 case AMDGPU::V_FMA_F32_e64:
3563 return AMDGPU::V_FMAAK_F32;
3564 case AMDGPU::V_FMAC_F16_e32:
3565 case AMDGPU::V_FMAC_F16_e64:
3566 case AMDGPU::V_FMAC_F16_t16_e64:
3567 case AMDGPU::V_FMAC_F16_fake16_e64:
3568 case AMDGPU::V_FMAC_F16_t16_e32:
3569 case AMDGPU::V_FMAC_F16_fake16_e32:
3570 case AMDGPU::V_FMA_F16_e64:
3571 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3572 ? AMDGPU::V_FMAAK_F16_t16
3573 : AMDGPU::V_FMAAK_F16_fake16
3574 : AMDGPU::V_FMAAK_F16;
3575 case AMDGPU::V_FMAC_F64_e32:
3576 case AMDGPU::V_FMAC_F64_e64:
3577 case AMDGPU::V_FMA_F64_e64:
3578 return AMDGPU::V_FMAAK_F64;
3579 default:
3580 llvm_unreachable("invalid instruction");
3581 }
3582}
3583
3584static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3585 switch (Opc) {
3586 case AMDGPU::V_MAC_F16_e32:
3587 case AMDGPU::V_MAC_F16_e64:
3588 case AMDGPU::V_MAD_F16_e64:
3589 return AMDGPU::V_MADMK_F16;
3590 case AMDGPU::V_MAC_F32_e32:
3591 case AMDGPU::V_MAC_F32_e64:
3592 case AMDGPU::V_MAD_F32_e64:
3593 return AMDGPU::V_MADMK_F32;
3594 case AMDGPU::V_FMAC_F32_e32:
3595 case AMDGPU::V_FMAC_F32_e64:
3596 case AMDGPU::V_FMA_F32_e64:
3597 return AMDGPU::V_FMAMK_F32;
3598 case AMDGPU::V_FMAC_F16_e32:
3599 case AMDGPU::V_FMAC_F16_e64:
3600 case AMDGPU::V_FMAC_F16_t16_e64:
3601 case AMDGPU::V_FMAC_F16_fake16_e64:
3602 case AMDGPU::V_FMAC_F16_t16_e32:
3603 case AMDGPU::V_FMAC_F16_fake16_e32:
3604 case AMDGPU::V_FMA_F16_e64:
3605 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3606 ? AMDGPU::V_FMAMK_F16_t16
3607 : AMDGPU::V_FMAMK_F16_fake16
3608 : AMDGPU::V_FMAMK_F16;
3609 case AMDGPU::V_FMAC_F64_e32:
3610 case AMDGPU::V_FMAC_F64_e64:
3611 case AMDGPU::V_FMA_F64_e64:
3612 return AMDGPU::V_FMAMK_F64;
3613 default:
3614 llvm_unreachable("invalid instruction");
3615 }
3616}
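// Note: the "MK" forms take the literal as the second multiplicand
// (d = s0 * K + s1), while the "AK" forms take it as the addend
// (d = s0 * s1 + K). foldImmediate below picks whichever form puts the
// literal in the position the folded constant actually feeds.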
3617
3618 bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3619 Register Reg, MachineRegisterInfo *MRI) const {
3620 int64_t Imm;
3621 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3622 return false;
3623
3624 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3625
3626 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3627
3628 unsigned Opc = UseMI.getOpcode();
3629 if (Opc == AMDGPU::COPY) {
3630 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3631
3632 Register DstReg = UseMI.getOperand(0).getReg();
3633 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3634
3635 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3636
3637 if (HasMultipleUses) {
3638 // TODO: This should fold in more cases with multiple use, but we need to
3639 // more carefully consider what those uses are.
3640 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3641
3642 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3643 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3644 return false;
3645
3646 // Most of the time folding a 32-bit inline constant is free (though this
3647 // might not be true if we can't later fold it into a real user).
3648 //
3649 // FIXME: This isInlineConstant check is imprecise if
3650 // getConstValDefinedInReg handled the tricky non-mov cases.
3651 if (ImmDefSize == 32 &&
3653 return false;
3654 }
3655
3656 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3657 RI.getSubRegIdxSize(UseSubReg) == 16;
3658
3659 if (Is16Bit) {
3660 if (RI.hasVGPRs(DstRC))
3661 return false; // Do not clobber vgpr_hi16
3662
3663 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3664 return false;
3665 }
3666
3667 MachineFunction *MF = UseMI.getMF();
3668
3669 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3670 MCRegister MovDstPhysReg =
3671 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3672
3673 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3674
3675 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3676 for (unsigned MovOp :
3677 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3678 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3679 const MCInstrDesc &MovDesc = get(MovOp);
3680
3681 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3682 if (Is16Bit) {
3683 // We just need to find a correctly sized register class, so the
3684 // subregister index compatibility doesn't matter since we're statically
3685 // extracting the immediate value.
3686 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3687 if (!MovDstRC)
3688 continue;
3689
3690 if (MovDstPhysReg) {
3691 // FIXME: We probably should not do this. If there is a live value in
3692 // the high half of the register, it will be corrupted.
3693 MovDstPhysReg =
3694 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3695 if (!MovDstPhysReg)
3696 continue;
3697 }
3698 }
3699
3700 // Result class isn't the right size, try the next instruction.
3701 if (MovDstPhysReg) {
3702 if (!MovDstRC->contains(MovDstPhysReg))
3703 return false;
3704 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3705 // TODO: This will be overly conservative in the case of 16-bit virtual
3706 // SGPRs. We could hack up the virtual register uses to use a compatible
3707 // 32-bit class.
3708 continue;
3709 }
3710
3711 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3712
3713 // Ensure the interpreted immediate value is a valid operand in the new
3714 // mov.
3715 //
3716 // FIXME: isImmOperandLegal should have a form that doesn't require an existing
3717 // MachineInstr or MachineOperand
3718 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3719 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3720 break;
3721
3722 NewOpc = MovOp;
3723 break;
3724 }
3725
3726 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3727 return false;
3728
3729 if (Is16Bit) {
3730 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3731 if (MovDstPhysReg)
3732 UseMI.getOperand(0).setReg(MovDstPhysReg);
3733 assert(UseMI.getOperand(1).getReg().isVirtual());
3734 }
3735
3736 const MCInstrDesc &NewMCID = get(NewOpc);
3737 UseMI.setDesc(NewMCID);
3738 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3739 UseMI.addImplicitDefUseOperands(*MF);
3740 return true;
3741 }
3742
3743 if (HasMultipleUses)
3744 return false;
3745
3746 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3747 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3748 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3749 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3750 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3751 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3752 Opc == AMDGPU::V_FMAC_F64_e64) {
3753 // Don't fold if we are using source or output modifiers. The new VOP2
3754 // instructions don't have them.
3755 if (hasAnyModifiersSet(UseMI))
3756 return false;
3757
3758 // If this is a free constant, there's no reason to do this.
3759 // TODO: We could fold this here instead of letting SIFoldOperands do it
3760 // later.
3761 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3762
3763 // Any src operand can be used for the legality check.
3764 if (isInlineConstant(UseMI, Src0Idx, Imm))
3765 return false;
3766
3767 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3768
3769 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3770 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3771
3772 auto CopyRegOperandToNarrowerRC =
3773 [MRI, this](MachineInstr &MI, unsigned OpNo,
3774 const TargetRegisterClass *NewRC) -> void {
3775 if (!MI.getOperand(OpNo).isReg())
3776 return;
3777 Register Reg = MI.getOperand(OpNo).getReg();
3778 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3779 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3780 return;
3781 Register Tmp = MRI->createVirtualRegister(NewRC);
3782 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3783 get(AMDGPU::COPY), Tmp)
3784 .addReg(Reg);
3785 MI.getOperand(OpNo).setReg(Tmp);
3786 MI.getOperand(OpNo).setIsKill();
3787 };
3788
3789 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3790 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3791 (Src1->isReg() && Src1->getReg() == Reg)) {
3792 MachineOperand *RegSrc =
3793 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3794 if (!RegSrc->isReg())
3795 return false;
3796 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3797 ST.getConstantBusLimit(Opc) < 2)
3798 return false;
3799
3800 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3801 return false;
3802
3803 // If src2 is also a literal constant then we have to choose which one to
3804 // fold. In general it is better to choose madak so that the other literal
3805 // can be materialized in an sgpr instead of a vgpr:
3806 // s_mov_b32 s0, literal
3807 // v_madak_f32 v0, s0, v0, literal
3808 // Instead of:
3809 // v_mov_b32 v1, literal
3810 // v_madmk_f32 v0, v0, literal, v1
3811 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3812 if (Def && Def->isMoveImmediate() &&
3813 !isInlineConstant(Def->getOperand(1)))
3814 return false;
3815
3816 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3817 if (pseudoToMCOpcode(NewOpc) == -1)
3818 return false;
3819
3820 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3821 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3822
3823 // FIXME: This would be a lot easier if we could return a new instruction
3824 // instead of having to modify in place.
3825
3826 Register SrcReg = RegSrc->getReg();
3827 unsigned SrcSubReg = RegSrc->getSubReg();
3828 Src0->setReg(SrcReg);
3829 Src0->setSubReg(SrcSubReg);
3830 Src0->setIsKill(RegSrc->isKill());
3831
3832 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3833 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3834 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3835 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3836 UseMI.untieRegOperand(
3837 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3838
3839 Src1->ChangeToImmediate(*SubRegImm);
3840
3841 removeModOperands(UseMI);
3842 UseMI.setDesc(get(NewOpc));
3843
3844 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3845 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3846 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3847 Register Tmp = MRI->createVirtualRegister(NewRC);
3848 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3849 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3850 UseMI.getOperand(0).getReg())
3851 .addReg(Tmp, RegState::Kill);
3852 UseMI.getOperand(0).setReg(Tmp);
3853 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3854 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3855 }
3856
3857 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3858 if (DeleteDef)
3859 DefMI.eraseFromParent();
3860
3861 return true;
3862 }
3863
3864 // Added part is the constant: Use v_madak_{f16, f32}.
3865 if (Src2->isReg() && Src2->getReg() == Reg) {
3866 if (ST.getConstantBusLimit(Opc) < 2) {
3867 // Not allowed to use constant bus for another operand.
3868 // We can however allow an inline immediate as src0.
3869 bool Src0Inlined = false;
3870 if (Src0->isReg()) {
3871 // Try to inline the constant if possible.
3872 // If the def is a move-immediate and this is its only use,
3873 // we are saving a VGPR here.
3874 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3875 if (Def && Def->isMoveImmediate() &&
3876 isInlineConstant(Def->getOperand(1)) &&
3877 MRI->hasOneNonDBGUse(Src0->getReg())) {
3878 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3879 Src0Inlined = true;
3880 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3881 RI.isSGPRReg(*MRI, Src0->getReg())) {
3882 return false;
3883 }
3884 // VGPR is okay as Src0 - fallthrough
3885 }
3886
3887 if (Src1->isReg() && !Src0Inlined) {
3888 // We have one slot for an inlinable constant so far - try to fill it
3889 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3890 if (Def && Def->isMoveImmediate() &&
3891 isInlineConstant(Def->getOperand(1)) &&
3892 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3893 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3894 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3895 return false;
3896 // VGPR is okay as Src1 - fallthrough
3897 }
3898 }
3899
3900 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3901 if (pseudoToMCOpcode(NewOpc) == -1)
3902 return false;
3903
3904 // FIXME: This would be a lot easier if we could return a new instruction
3905 // instead of having to modify in place.
3906
3907 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3908 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3909 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3910 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3911 UseMI.untieRegOperand(
3912 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3913
3914 const std::optional<int64_t> SubRegImm =
3915 extractSubregFromImm(Imm, Src2->getSubReg());
3916
3917 // ChangingToImmediate adds Src2 back to the instruction.
3918 Src2->ChangeToImmediate(*SubRegImm);
3919
3920 // These come before src2.
3921 removeModOperands(UseMI);
3922 UseMI.setDesc(get(NewOpc));
3923
3924 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3925 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3926 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3927 Register Tmp = MRI->createVirtualRegister(NewRC);
3928 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3929 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3930 UseMI.getOperand(0).getReg())
3931 .addReg(Tmp, RegState::Kill);
3932 UseMI.getOperand(0).setReg(Tmp);
3933 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3934 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3935 }
3936
3937 // It might happen that UseMI was commuted and we now have an SGPR as
3938 // src1. If so, the literal constant and the SGPR together would violate
3939 // the constant bus restriction, so re-legalize the operands.
3940 legalizeOperands(UseMI);
3941
3942 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3943 if (DeleteDef)
3944 DefMI.eraseFromParent();
3945
3946 return true;
3947 }
3948 }
3949
3950 return false;
3951}
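// A rough illustration of the v_fmaak fold above (f32 case, registers and
// modifier operands simplified; 1024.0 is assumed not to be an inline
// constant):
//   %k = V_MOV_B32_e32 1024.0
//   %d = V_FMA_F32_e64 %a, %b, %k
// becomes
//   %d = V_FMAAK_F32 %a, %b, 1024.0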
3952
3953static bool
3954 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3955 ArrayRef<const MachineOperand *> BaseOps2) {
3956 if (BaseOps1.size() != BaseOps2.size())
3957 return false;
3958 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3959 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3960 return false;
3961 }
3962 return true;
3963}
3964
3965static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3966 LocationSize WidthB, int OffsetB) {
3967 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3968 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3969 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3970 return LowWidth.hasValue() &&
3971 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3972}
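// For example, a 4-byte access at offset 0 and an 8-byte access at offset 4
// do not overlap (0 + 4 <= 4), while 4-byte accesses at offsets 0 and 2 do.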
3973
3974bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3975 const MachineInstr &MIb) const {
3976 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3977 int64_t Offset0, Offset1;
3978 LocationSize Dummy0 = LocationSize::precise(0);
3979 LocationSize Dummy1 = LocationSize::precise(0);
3980 bool Offset0IsScalable, Offset1IsScalable;
3981 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3982 Dummy0, &RI) ||
3983 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3984 Dummy1, &RI))
3985 return false;
3986
3987 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3988 return false;
3989
3990 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3991 // FIXME: Handle ds_read2 / ds_write2.
3992 return false;
3993 }
3994 LocationSize Width0 = MIa.memoperands().front()->getSize();
3995 LocationSize Width1 = MIb.memoperands().front()->getSize();
3996 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3997}
3998
3999 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
4000 const MachineInstr &MIb) const {
4001 assert(MIa.mayLoadOrStore() &&
4002 "MIa must load from or modify a memory location");
4003 assert(MIb.mayLoadOrStore() &&
4004 "MIb must load from or modify a memory location");
4005
4007 return false;
4008
4009 // XXX - Can we relax this between address spaces?
4010 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4011 return false;
4012
4013 if (isLDSDMA(MIa) || isLDSDMA(MIb))
4014 return false;
4015
4016 if (MIa.isBundle() || MIb.isBundle())
4017 return false;
4018
4019 // TODO: Should we check the address space from the MachineMemOperand? That
4020 // would allow us to distinguish objects we know don't alias based on the
4021 // underlying address space, even if it was lowered to a different one,
4022 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4023 // buffer.
4024 if (isDS(MIa)) {
4025 if (isDS(MIb))
4026 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4027
4028 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4029 }
4030
4031 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4032 if (isMUBUF(MIb) || isMTBUF(MIb))
4033 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4034
4035 if (isFLAT(MIb))
4036 return isFLATScratch(MIb);
4037
4038 return !isSMRD(MIb);
4039 }
4040
4041 if (isSMRD(MIa)) {
4042 if (isSMRD(MIb))
4043 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4044
4045 if (isFLAT(MIb))
4046 return isFLATScratch(MIb);
4047
4048 return !isMUBUF(MIb) && !isMTBUF(MIb);
4049 }
4050
4051 if (isFLAT(MIa)) {
4052 if (isFLAT(MIb)) {
4053 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4054 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4055 return true;
4056
4057 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4058 }
4059
4060 return false;
4061 }
4062
4063 return false;
4064}
4065
4066 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4067 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4068 if (Reg.isPhysical())
4069 return false;
4070 auto *Def = MRI.getUniqueVRegDef(Reg);
4071 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4072 Imm = Def->getOperand(1).getImm();
4073 if (DefMI)
4074 *DefMI = Def;
4075 return true;
4076 }
4077 return false;
4078}
4079
4080static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4081 MachineInstr **DefMI = nullptr) {
4082 if (!MO->isReg())
4083 return false;
4084 const MachineFunction *MF = MO->getParent()->getMF();
4085 const MachineRegisterInfo &MRI = MF->getRegInfo();
4086 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4087}
4088
4089 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4090 MachineInstr &NewMI) {
4091 if (LV) {
4092 unsigned NumOps = MI.getNumOperands();
4093 for (unsigned I = 1; I < NumOps; ++I) {
4094 MachineOperand &Op = MI.getOperand(I);
4095 if (Op.isReg() && Op.isKill())
4096 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4097 }
4098 }
4099}
4100
4101static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4102 switch (Opc) {
4103 case AMDGPU::V_MAC_F16_e32:
4104 case AMDGPU::V_MAC_F16_e64:
4105 return AMDGPU::V_MAD_F16_e64;
4106 case AMDGPU::V_MAC_F32_e32:
4107 case AMDGPU::V_MAC_F32_e64:
4108 return AMDGPU::V_MAD_F32_e64;
4109 case AMDGPU::V_MAC_LEGACY_F32_e32:
4110 case AMDGPU::V_MAC_LEGACY_F32_e64:
4111 return AMDGPU::V_MAD_LEGACY_F32_e64;
4112 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4113 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4114 return AMDGPU::V_FMA_LEGACY_F32_e64;
4115 case AMDGPU::V_FMAC_F16_e32:
4116 case AMDGPU::V_FMAC_F16_e64:
4117 case AMDGPU::V_FMAC_F16_t16_e64:
4118 case AMDGPU::V_FMAC_F16_fake16_e64:
4119 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4120 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4121 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4122 : AMDGPU::V_FMA_F16_gfx9_e64;
4123 case AMDGPU::V_FMAC_F32_e32:
4124 case AMDGPU::V_FMAC_F32_e64:
4125 return AMDGPU::V_FMA_F32_e64;
4126 case AMDGPU::V_FMAC_F64_e32:
4127 case AMDGPU::V_FMAC_F64_e64:
4128 return AMDGPU::V_FMA_F64_e64;
4129 default:
4130 llvm_unreachable("invalid instruction");
4131 }
4132}
4133
4134/// Helper struct for the implementation of 3-address conversion to communicate
4135 /// updates made to instruction operands.
4136 struct ThreeAddressUpdates {
4137 /// Other instruction whose def is no longer used by the converted
4138 /// instruction.
4139 MachineInstr *RemoveMIUse = nullptr;
4140 };
4141
4142 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
4143 LiveVariables *LV,
4144 LiveIntervals *LIS) const {
4145 MachineBasicBlock &MBB = *MI.getParent();
4146 MachineInstr *CandidateMI = &MI;
4147
4148 if (MI.isBundle()) {
4149 // This is a temporary placeholder for bundle handling that enables us to
4150 // exercise the relevant code paths in the two-address instruction pass.
4151 if (MI.getBundleSize() != 1)
4152 return nullptr;
4153 CandidateMI = MI.getNextNode();
4154 }
4155
4157 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4158 if (!NewMI)
4159 return nullptr;
4160
4161 if (MI.isBundle()) {
4162 CandidateMI->eraseFromBundle();
4163
4164 for (MachineOperand &MO : MI.all_defs()) {
4165 if (MO.isTied())
4166 MI.untieRegOperand(MO.getOperandNo());
4167 }
4168 } else {
4169 updateLiveVariables(LV, MI, *NewMI);
4170 if (LIS) {
4171 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4172 // SlotIndex of defs needs to be updated when converting to early-clobber
4173 MachineOperand &Def = NewMI->getOperand(0);
4174 if (Def.isEarlyClobber() && Def.isReg() &&
4175 LIS->hasInterval(Def.getReg())) {
4176 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4177 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4178 auto &LI = LIS->getInterval(Def.getReg());
4179 auto UpdateDefIndex = [&](LiveRange &LR) {
4180 auto *S = LR.find(OldIndex);
4181 if (S != LR.end() && S->start == OldIndex) {
4182 assert(S->valno && S->valno->def == OldIndex);
4183 S->start = NewIndex;
4184 S->valno->def = NewIndex;
4185 }
4186 };
4187 UpdateDefIndex(LI);
4188 for (auto &SR : LI.subranges())
4189 UpdateDefIndex(SR);
4190 }
4191 }
4192 }
4193
4194 if (U.RemoveMIUse) {
4195 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4196 // The only user is the instruction which will be killed.
4197 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4198
4199 if (MRI.hasOneNonDBGUse(DefReg)) {
4200 // We cannot just remove the DefMI here; the calling pass will crash.
4201 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4202 U.RemoveMIUse->getOperand(0).setIsDead(true);
4203 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4204 U.RemoveMIUse->removeOperand(I);
4205 if (LV)
4206 LV->getVarInfo(DefReg).AliveBlocks.clear();
4207 }
4208
4209 if (MI.isBundle()) {
4210 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4211 if (!VRI.Reads && !VRI.Writes) {
4212 for (MachineOperand &MO : MI.all_uses()) {
4213 if (MO.isReg() && MO.getReg() == DefReg) {
4214 assert(MO.getSubReg() == 0 &&
4215 "tied sub-registers in bundles currently not supported");
4216 MI.removeOperand(MO.getOperandNo());
4217 break;
4218 }
4219 }
4220
4221 if (LIS)
4222 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4223 }
4224 } else if (LIS) {
4225 LiveInterval &DefLI = LIS->getInterval(DefReg);
4226
4227 // We cannot delete the original instruction here, so hack out the use
4228 // in the original instruction with a dummy register so we can use
4229 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4230 // not have the complexity of deleting a use to consider here.
4231 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4232 for (MachineOperand &MIOp : MI.uses()) {
4233 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4234 MIOp.setIsUndef(true);
4235 MIOp.setReg(DummyReg);
4236 }
4237 }
4238
4239 if (MI.isBundle()) {
4240 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4241 if (!VRI.Reads && !VRI.Writes) {
4242 for (MachineOperand &MIOp : MI.uses()) {
4243 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4244 MIOp.setIsUndef(true);
4245 MIOp.setReg(DummyReg);
4246 }
4247 }
4248 }
4249
4250 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4251 false, /*isUndef=*/true));
4252 }
4253
4254 LIS->shrinkToUses(&DefLI);
4255 }
4256 }
4257
4258 return MI.isBundle() ? &MI : NewMI;
4259}
4260
4261 MachineInstr *
4262 SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4263 ThreeAddressUpdates &U) const {
4264 MachineBasicBlock &MBB = *MI.getParent();
4265 unsigned Opc = MI.getOpcode();
4266
4267 // Handle MFMA.
4268 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4269 if (NewMFMAOpc != -1) {
4270 MachineInstrBuilder MIB =
4271 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4272 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4273 MIB.add(MI.getOperand(I));
4274 return MIB;
4275 }
4276
4277 if (SIInstrInfo::isWMMA(MI)) {
4278 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4279 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4280 .setMIFlags(MI.getFlags());
4281 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4282 MIB->addOperand(MI.getOperand(I));
4283 return MIB;
4284 }
4285
4286 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4287 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4288 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4289 "present pre-RA");
4290
4291 // Handle MAC/FMAC.
4292 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4293 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4294 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4295 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4296 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4297 bool Src0Literal = false;
4298
4299 switch (Opc) {
4300 default:
4301 return nullptr;
4302 case AMDGPU::V_MAC_F16_e64:
4303 case AMDGPU::V_FMAC_F16_e64:
4304 case AMDGPU::V_FMAC_F16_t16_e64:
4305 case AMDGPU::V_FMAC_F16_fake16_e64:
4306 case AMDGPU::V_MAC_F32_e64:
4307 case AMDGPU::V_MAC_LEGACY_F32_e64:
4308 case AMDGPU::V_FMAC_F32_e64:
4309 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4310 case AMDGPU::V_FMAC_F64_e64:
4311 break;
4312 case AMDGPU::V_MAC_F16_e32:
4313 case AMDGPU::V_FMAC_F16_e32:
4314 case AMDGPU::V_MAC_F32_e32:
4315 case AMDGPU::V_MAC_LEGACY_F32_e32:
4316 case AMDGPU::V_FMAC_F32_e32:
4317 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4318 case AMDGPU::V_FMAC_F64_e32: {
4319 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4320 AMDGPU::OpName::src0);
4321 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4322 if (!Src0->isReg() && !Src0->isImm())
4323 return nullptr;
4324
4325 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4326 Src0Literal = true;
4327
4328 break;
4329 }
4330 }
4331
4332 MachineInstrBuilder MIB;
4333 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4334 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4335 const MachineOperand *Src0Mods =
4336 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4337 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4338 const MachineOperand *Src1Mods =
4339 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4340 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4341 const MachineOperand *Src2Mods =
4342 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4343 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4344 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4345 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4346
4347 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4348 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4349 // If we have an SGPR input, we will violate the constant bus restriction.
4350 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4351 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4352 MachineInstr *DefMI;
4353
4354 int64_t Imm;
4355 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4356 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4357 if (pseudoToMCOpcode(NewOpc) != -1) {
4358 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4359 .add(*Dst)
4360 .add(*Src0)
4361 .add(*Src1)
4362 .addImm(Imm)
4363 .setMIFlags(MI.getFlags());
4364 U.RemoveMIUse = DefMI;
4365 return MIB;
4366 }
4367 }
4368 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4369 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4370 if (pseudoToMCOpcode(NewOpc) != -1) {
4371 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4372 .add(*Dst)
4373 .add(*Src0)
4374 .addImm(Imm)
4375 .add(*Src2)
4376 .setMIFlags(MI.getFlags());
4377 U.RemoveMIUse = DefMI;
4378 return MIB;
4379 }
4380 }
4381 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4382 if (Src0Literal) {
4383 Imm = Src0->getImm();
4384 DefMI = nullptr;
4385 }
4386 if (pseudoToMCOpcode(NewOpc) != -1 &&
4387 isOperandLegal(
4388 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4389 Src1)) {
4390 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4391 .add(*Dst)
4392 .add(*Src1)
4393 .addImm(Imm)
4394 .add(*Src2)
4395 .setMIFlags(MI.getFlags());
4396 U.RemoveMIUse = DefMI;
4397 return MIB;
4398 }
4399 }
4400 }
4401
4402 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4403 // if VOP3 does not allow a literal operand.
4404 if (Src0Literal && !ST.hasVOP3Literal())
4405 return nullptr;
4406
4407 unsigned NewOpc = getNewFMAInst(ST, Opc);
4408
4409 if (pseudoToMCOpcode(NewOpc) == -1)
4410 return nullptr;
4411
4412 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4413 .add(*Dst)
4414 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4415 .add(*Src0)
4416 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4417 .add(*Src1)
4418 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4419 .add(*Src2)
4420 .addImm(Clamp ? Clamp->getImm() : 0)
4421 .addImm(Omod ? Omod->getImm() : 0)
4422 .setMIFlags(MI.getFlags());
4423 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4424 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4425 return MIB;
4426}
4427
4428// It's not generally safe to move VALU instructions across these since it will
4429// start using the register as a base index rather than directly.
4430// XXX - Why isn't hasSideEffects sufficient for these?
4432 switch (MI.getOpcode()) {
4433 case AMDGPU::S_SET_GPR_IDX_ON:
4434 case AMDGPU::S_SET_GPR_IDX_MODE:
4435 case AMDGPU::S_SET_GPR_IDX_OFF:
4436 return true;
4437 default:
4438 return false;
4439 }
4440}
4441
4442 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4443 const MachineBasicBlock *MBB,
4444 const MachineFunction &MF) const {
4445 // Skipping the check for SP writes in the base implementation. The reason it
4446 // was added was apparently due to compile time concerns.
4447 //
4448 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4449 // but is probably avoidable.
4450
4451 // Copied from base implementation.
4452 // Terminators and labels can't be scheduled around.
4453 if (MI.isTerminator() || MI.isPosition())
4454 return true;
4455
4456 // INLINEASM_BR can jump to another block
4457 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4458 return true;
4459
4460 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4461 return true;
4462
4463 // Target-independent instructions do not have an implicit-use of EXEC, even
4464 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4465 // boundaries prevents incorrect movements of such instructions.
4466 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4467 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4468 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4469 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4470 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4471 changesVGPRIndexingMode(MI);
4472}
4473
4474 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4475 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4476 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4477 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4478}
4479
4481 // Instructions that access scratch use FLAT encoding or BUF encodings.
4482 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4483 return false;
4484
4485 // SCRATCH instructions always access scratch.
4486 if (isFLATScratch(MI))
4487 return true;
4488
4489 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4490 // via the aperture.
4491 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4492 return false;
4493
4494 // If there are no memory operands then conservatively assume the flat
4495 // operation may access scratch.
4496 if (MI.memoperands_empty())
4497 return true;
4498
4499 // See if any memory operand specifies an address space that involves scratch.
4500 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4501 unsigned AS = Memop->getAddrSpace();
4502 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4503 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4504 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4505 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4506 }
4507 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4508 });
4509}
4510
4511 bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4512 assert(isFLAT(MI));
4513
4514 // All flat instructions use the VMEM counter except prefetch.
4515 if (!usesVM_CNT(MI))
4516 return false;
4517
4518 // If there are no memory operands then conservatively assume the flat
4519 // operation may access VMEM.
4520 if (MI.memoperands_empty())
4521 return true;
4522
4523 // See if any memory operand specifies an address space that involves VMEM.
4524 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4525 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4526 // (GDS) address space is not supported by flat operations. Therefore, simply
4527 // return true unless only the LDS address space is found.
4528 for (const MachineMemOperand *Memop : MI.memoperands()) {
4529 unsigned AS = Memop->getAddrSpace();
4531 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4532 return true;
4533 }
4534
4535 return false;
4536}
4537
4538 bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4539 assert(isFLAT(MI));
4540
4541 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4542 if (!usesLGKM_CNT(MI))
4543 return false;
4544
4545 // If in tgsplit mode then there can be no use of LDS.
4546 if (ST.isTgSplitEnabled())
4547 return false;
4548
4549 // If there are no memory operands then conservatively assume the flat
4550 // operation may access LDS.
4551 if (MI.memoperands_empty())
4552 return true;
4553
4554 // See if any memory operand specifies an address space that involves LDS.
4555 for (const MachineMemOperand *Memop : MI.memoperands()) {
4556 unsigned AS = Memop->getAddrSpace();
4557 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4558 return true;
4559 }
4560
4561 return false;
4562}
4563
4564 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4565 // Skip the full operand and register alias search modifiesRegister
4566 // does. There's only a handful of instructions that touch this, it's only an
4567 // implicit def, and doesn't alias any other registers.
4568 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4569}
4570
4571 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4572 unsigned Opcode = MI.getOpcode();
4573
4574 if (MI.mayStore() && isSMRD(MI))
4575 return true; // scalar store or atomic
4576
4577 // This will terminate the function when other lanes may need to continue.
4578 if (MI.isReturn())
4579 return true;
4580
4581 // These instructions cause shader I/O that may cause hardware lockups
4582 // when executed with an empty EXEC mask.
4583 //
4584 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4585 // EXEC = 0, but checking for that case here seems not worth it
4586 // given the typical code patterns.
4587 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4588 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4589 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4590 return true;
4591
4592 if (MI.isCall() || MI.isInlineAsm())
4593 return true; // conservative assumption
4594
4595 // Assume that barrier interactions are only intended with active lanes.
4596 if (isBarrier(Opcode))
4597 return true;
4598
4599 // A mode change is a scalar operation that influences vector instructions.
4600 if (modifiesModeRegister(MI))
4601 return true;
4602
4603 // These are like SALU instructions in terms of effects, so it's questionable
4604 // whether we should return true for those.
4605 //
4606 // However, executing them with EXEC = 0 causes them to operate on undefined
4607 // data, which we avoid by returning true here.
4608 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4609 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4610 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4611 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4612 return true;
4613
4614 return false;
4615}
4616
4617 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4618 const MachineInstr &MI) const {
4619 if (MI.isMetaInstruction())
4620 return false;
4621
4622 // This won't read exec if this is an SGPR->SGPR copy.
4623 if (MI.isCopyLike()) {
4624 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4625 return true;
4626
4627 // Make sure this isn't copying exec as a normal operand
4628 return MI.readsRegister(AMDGPU::EXEC, &RI);
4629 }
4630
4631 // Make a conservative assumption about the callee.
4632 if (MI.isCall())
4633 return true;
4634
4635 // Be conservative with any unhandled generic opcodes.
4636 if (!isTargetSpecificOpcode(MI.getOpcode()))
4637 return true;
4638
4639 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4640}
4641
4642bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4643 switch (Imm.getBitWidth()) {
4644 case 1: // This likely will be a condition code mask.
4645 return true;
4646
4647 case 32:
4648 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4649 ST.hasInv2PiInlineImm());
4650 case 64:
4651 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4652 ST.hasInv2PiInlineImm());
4653 case 16:
4654 return ST.has16BitInsts() &&
4655 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4656 ST.hasInv2PiInlineImm());
4657 default:
4658 llvm_unreachable("invalid bitwidth");
4659 }
4660}
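// For 32-bit operands this accepts the integers -16..64 plus a small set of
// floating-point bit patterns (+-0.5, +-1.0, +-2.0, +-4.0, and 1/(2*pi) on
// subtargets that support it); anything else must be encoded as a literal.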
4661
4662 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4663 APInt IntImm = Imm.bitcastToAPInt();
4664 int64_t IntImmVal = IntImm.getSExtValue();
4665 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4666 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4667 default:
4668 llvm_unreachable("invalid fltSemantics");
4669 case APFloat::S_IEEEsingle:
4670 case APFloat::S_IEEEdouble:
4671 return isInlineConstant(IntImm);
4672 case APFloat::S_BFloat:
4673 return ST.has16BitInsts() &&
4674 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4675 case APFloat::S_IEEEhalf:
4676 return ST.has16BitInsts() &&
4677 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4678 }
4679}
4680
4681bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4682 // MachineOperand provides no way to tell the true operand size, since it only
4683 // records a 64-bit value. We need to know the size to determine if a 32-bit
4684 // floating point immediate bit pattern is legal for an integer immediate. It
4685 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4686 switch (OperandType) {
4696 int32_t Trunc = static_cast<int32_t>(Imm);
4697 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4698 }
4704 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4707 // We would expect inline immediates to not be concerned with an integer/fp
4708 // distinction. However, in the case of 16-bit integer operations, the
4709 // "floating point" values appear to not work. It seems to read the low 16 bits
4710 // of 32-bit immediates, which happens to always work for the integer
4711 // values.
4712 //
4713 // See llvm bugzilla 46302.
4714 //
4715 // TODO: Theoretically we could use op-sel to use the high bits of the
4716 // 32-bit FP values.
4725 return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
4730 return false;
4733 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4734 // A few special case instructions have 16-bit operands on subtargets
4735 // where 16-bit instructions are not legal.
4736 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4737 // constants in these cases
4738 int16_t Trunc = static_cast<int16_t>(Imm);
4739 return ST.has16BitInsts() &&
4740 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4741 }
4742
4743 return false;
4744 }
4747 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4748 int16_t Trunc = static_cast<int16_t>(Imm);
4749 return ST.has16BitInsts() &&
4750 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4751 }
4752 return false;
4753 }
4757 return false;
4759 return isLegalAV64PseudoImm(Imm);
4762 // Always embedded in the instruction for free.
4763 return true;
4773 // Just ignore anything else.
4774 return true;
4775 default:
4776 llvm_unreachable("invalid operand type");
4777 }
4778}
4779
4780static bool compareMachineOp(const MachineOperand &Op0,
4781 const MachineOperand &Op1) {
4782 if (Op0.getType() != Op1.getType())
4783 return false;
4784
4785 switch (Op0.getType()) {
4786 case MachineOperand::MO_Register:
4787 return Op0.getReg() == Op1.getReg();
4788 case MachineOperand::MO_Immediate:
4789 return Op0.getImm() == Op1.getImm();
4790 default:
4791 llvm_unreachable("Didn't expect to be comparing these operand types");
4792 }
4793}
4794
4795 bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4796 const MCOperandInfo &OpInfo) const {
4797 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4798 return true;
4799
4800 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4801 return false;
4802
4803 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4804 return true;
4805
4806 return ST.hasVOP3Literal();
4807}
4808
4809bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4810 int64_t ImmVal) const {
4811 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4812 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4813 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4814 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4815 AMDGPU::OpName::src2))
4816 return false;
4817 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4818 }
4819
4820 return isLiteralOperandLegal(InstDesc, OpInfo);
4821}
4822
4823bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4824 const MachineOperand &MO) const {
4825 if (MO.isImm())
4826 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4827
4828 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4829 "unexpected imm-like operand kind");
4830 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4831 return isLiteralOperandLegal(InstDesc, OpInfo);
4832}
4833
4834 bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4835 // 2 32-bit inline constants packed into one.
4836 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4837 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4838}
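// For example, 0x0000004000000040 (the inline integer 64 in each 32-bit
// half) is accepted, while a value whose halves are not both inline
// constants is rejected.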
4839
4840bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4841 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4842 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4843 return false;
4844
4845 int Op32 = AMDGPU::getVOPe32(Opcode);
4846 if (Op32 == -1)
4847 return false;
4848
4849 return pseudoToMCOpcode(Op32) != -1;
4850}
4851
4852bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4853 // The src0_modifier operand is present on all instructions
4854 // that have modifiers.
4855
4856 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4857}
4858
4859 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4860 AMDGPU::OpName OpName) const {
4861 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4862 return Mods && Mods->getImm();
4863}
4864
4865 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4866 return any_of(ModifierOpNames,
4867 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4868}
4869
4870 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4871 const MachineRegisterInfo &MRI) const {
4872 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4873 // Can't shrink instruction with three operands.
4874 if (Src2) {
4875 switch (MI.getOpcode()) {
4876 default: return false;
4877
4878 case AMDGPU::V_ADDC_U32_e64:
4879 case AMDGPU::V_SUBB_U32_e64:
4880 case AMDGPU::V_SUBBREV_U32_e64: {
4881 const MachineOperand *Src1
4882 = getNamedOperand(MI, AMDGPU::OpName::src1);
4883 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4884 return false;
4885 // Additional verification is needed for sdst/src2.
4886 return true;
4887 }
4888 case AMDGPU::V_MAC_F16_e64:
4889 case AMDGPU::V_MAC_F32_e64:
4890 case AMDGPU::V_MAC_LEGACY_F32_e64:
4891 case AMDGPU::V_FMAC_F16_e64:
4892 case AMDGPU::V_FMAC_F16_t16_e64:
4893 case AMDGPU::V_FMAC_F16_fake16_e64:
4894 case AMDGPU::V_FMAC_F32_e64:
4895 case AMDGPU::V_FMAC_F64_e64:
4896 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4897 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4898 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4899 return false;
4900 break;
4901
4902 case AMDGPU::V_CNDMASK_B32_e64:
4903 break;
4904 }
4905 }
4906
4907 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4908 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4909 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4910 return false;
4911
4912 // We don't need to check src0, all input types are legal, so just make sure
4913 // src0 isn't using any modifiers.
4914 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4915 return false;
4916
4917 // Can it be shrunk to a valid 32 bit opcode?
4918 if (!hasVALU32BitEncoding(MI.getOpcode()))
4919 return false;
4920
4921 // Check output modifiers
4922 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4923 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4924 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4925 // TODO: Can we avoid checking bound_ctrl/fi here?
4926 // They are only used by permlane*_swap special case.
4927 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4928 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4929}
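// The VOP2 encodings targeted here only accept a VGPR in src1 (with the
// carry/select cases reading VCC implicitly), which is why the register
// class checks on src1/src2 above are required before shrinking.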
4930
4931// Set VCC operand with all flags from \p Orig, except for setting it as
4932// implicit.
4933 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4934 const MachineOperand &Orig) {
4935
4936 for (MachineOperand &Use : MI.implicit_operands()) {
4937 if (Use.isUse() &&
4938 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4939 Use.setIsUndef(Orig.isUndef());
4940 Use.setIsKill(Orig.isKill());
4941 return;
4942 }
4943 }
4944}
4945
4946 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4947 unsigned Op32) const {
4948 MachineBasicBlock *MBB = MI.getParent();
4949
4950 const MCInstrDesc &Op32Desc = get(Op32);
4951 MachineInstrBuilder Inst32 =
4952 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4953 .setMIFlags(MI.getFlags());
4954
4955 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4956 // For VOPC instructions, this is replaced by an implicit def of vcc.
4957
4958 // We assume the defs of the shrunk opcode are in the same order, and the
4959 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4960 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4961 Inst32.add(MI.getOperand(I));
4962
4963 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4964
4965 int Idx = MI.getNumExplicitDefs();
4966 for (const MachineOperand &Use : MI.explicit_uses()) {
4967 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4968 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4969 continue;
4970
4971 if (&Use == Src2) {
4972 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4973 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4974 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4975 // of vcc was already added during the initial BuildMI, but we
4976 // 1) may need to change vcc to vcc_lo to preserve the original register
4977 // 2) have to preserve the original flags.
4978 copyFlagsToImplicitVCC(*Inst32, *Src2);
4979 continue;
4980 }
4981 }
4982
4983 Inst32.add(Use);
4984 }
4985
4986 // FIXME: Losing implicit operands
4987 fixImplicitOperands(*Inst32);
4988 return Inst32;
4989}
4990
4991 bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
4992 // Null is free
4993 Register Reg = RegOp.getReg();
4994 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
4995 return false;
4996
4997 // SGPRs use the constant bus
4998
4999 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5000 // physical register operands should also count, except for exec.
5001 if (RegOp.isImplicit())
5002 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5003
5004 // SGPRs use the constant bus
5005 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5006 AMDGPU::SReg_64RegClass.contains(Reg);
5007}
5008
5010 const MachineRegisterInfo &MRI) const {
5011 Register Reg = RegOp.getReg();
5012 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5013 : physRegUsesConstantBus(RegOp);
5014}
5015
5016 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
5017 const MachineOperand &MO,
5018 const MCOperandInfo &OpInfo) const {
5019 // Literal constants use the constant bus.
5020 if (!MO.isReg())
5021 return !isInlineConstant(MO, OpInfo);
5022
5023 Register Reg = MO.getReg();
5024 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5025 : physRegUsesConstantBus(MO);
5026}
5027
5028 Register SIInstrInfo::findImplicitSGPRRead(const MachineInstr &MI) const {
5029 for (const MachineOperand &MO : MI.implicit_operands()) {
5030 // We only care about reads.
5031 if (MO.isDef())
5032 continue;
5033
5034 switch (MO.getReg()) {
5035 case AMDGPU::VCC:
5036 case AMDGPU::VCC_LO:
5037 case AMDGPU::VCC_HI:
5038 case AMDGPU::M0:
5039 case AMDGPU::FLAT_SCR:
5040 return MO.getReg();
5041
5042 default:
5043 break;
5044 }
5045 }
5046
5047 return Register();
5048}
5049
5050static bool shouldReadExec(const MachineInstr &MI) {
5051 if (SIInstrInfo::isVALU(MI)) {
5052 switch (MI.getOpcode()) {
5053 case AMDGPU::V_READLANE_B32:
5054 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5055 case AMDGPU::V_WRITELANE_B32:
5056 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5057 return false;
5058 }
5059
5060 return true;
5061 }
5062
5063 if (MI.isPreISelOpcode() ||
5064 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5065 SIInstrInfo::isSALU(MI) ||
5066 SIInstrInfo::isSMRD(MI))
5067 return false;
5068
5069 return true;
5070}
5071
5072static bool isRegOrFI(const MachineOperand &MO) {
5073 return MO.isReg() || MO.isFI();
5074}
5075
5076static bool isSubRegOf(const SIRegisterInfo &TRI,
5077 const MachineOperand &SuperVec,
5078 const MachineOperand &SubReg) {
5079 if (SubReg.getReg().isPhysical())
5080 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5081
5082 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5083 SubReg.getReg() == SuperVec.getReg();
5084}
5085
5086// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5087bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5088 const MachineRegisterInfo &MRI,
5089 StringRef &ErrInfo) const {
5090 Register DstReg = MI.getOperand(0).getReg();
5091 Register SrcReg = MI.getOperand(1).getReg();
5092 // This is a check for copy from vector register to SGPR
5093 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5094 ErrInfo = "illegal copy from vector register to SGPR";
5095 return false;
5096 }
5097 return true;
5098}
5099
5100 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5101 StringRef &ErrInfo) const {
5102 uint16_t Opcode = MI.getOpcode();
5103 const MachineFunction *MF = MI.getMF();
5104 const MachineRegisterInfo &MRI = MF->getRegInfo();
5105
5106 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5107 // Find a better property to recognize the point where instruction selection
5108 // is just done.
5109 // We can only enforce this check after SIFixSGPRCopies pass so that the
5110 // illegal copies are legalized and thereafter we don't expect a pass
5111 // inserting similar copies.
5112 if (!MRI.isSSA() && MI.isCopy())
5113 return verifyCopy(MI, MRI, ErrInfo);
5114
5115 if (SIInstrInfo::isGenericOpcode(Opcode))
5116 return true;
5117
5118 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5119 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5120 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5121 int Src3Idx = -1;
5122 if (Src0Idx == -1) {
5123 // VOPD V_DUAL_* instructions use different operand names.
5124 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5125 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5126 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5127 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5128 }
5129
5130 // Make sure the number of operands is correct.
5131 const MCInstrDesc &Desc = get(Opcode);
5132 if (!Desc.isVariadic() &&
5133 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5134 ErrInfo = "Instruction has wrong number of operands.";
5135 return false;
5136 }
5137
5138 if (MI.isInlineAsm()) {
5139 // Verify register classes for inlineasm constraints.
5140 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5141 I != E; ++I) {
5142 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5143 if (!RC)
5144 continue;
5145
5146 const MachineOperand &Op = MI.getOperand(I);
5147 if (!Op.isReg())
5148 continue;
5149
5150 Register Reg = Op.getReg();
5151 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5152 ErrInfo = "inlineasm operand has incorrect register class.";
5153 return false;
5154 }
5155 }
5156
5157 return true;
5158 }
5159
5160 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5161 ErrInfo = "missing memory operand from image instruction.";
5162 return false;
5163 }
5164
5165 // Make sure the register classes are correct.
5166 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5167 const MachineOperand &MO = MI.getOperand(i);
5168 if (MO.isFPImm()) {
5169 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5170 "all fp values to integers.";
5171 return false;
5172 }
5173
5174 const MCOperandInfo &OpInfo = Desc.operands()[i];
5175 int16_t RegClass = getOpRegClassID(OpInfo);
5176
5177 switch (OpInfo.OperandType) {
5179 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5180 ErrInfo = "Illegal immediate value for operand.";
5181 return false;
5182 }
5183 break;
5197 break;
5199 break;
5200 break;
5214 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5215 ErrInfo = "Illegal immediate value for operand.";
5216 return false;
5217 }
5218 break;
5219 }
5221 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5222 ErrInfo = "Expected inline constant for operand.";
5223 return false;
5224 }
5225 break;
5229 break;
5234 // Check if this operand is an immediate.
5235 // FrameIndex operands will be replaced by immediates, so they are
5236 // allowed.
5237 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5238 ErrInfo = "Expected immediate, but got non-immediate";
5239 return false;
5240 }
5241 break;
5245 break;
5246 default:
5247 if (OpInfo.isGenericType())
5248 continue;
5249 break;
5250 }
5251
5252 if (!MO.isReg())
5253 continue;
5254 Register Reg = MO.getReg();
5255 if (!Reg)
5256 continue;
5257
5258 // FIXME: Ideally we would have separate instruction definitions with the
5259 // aligned register constraint.
5260 // FIXME: We do not verify inline asm operands, but custom inline asm
5261 // verification is broken anyway
5262 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
5263 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5264 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5265 if (const TargetRegisterClass *SubRC =
5266 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5267 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5268 if (RC)
5269 RC = SubRC;
5270 }
5271 }
5272
5273 // Check that this is the aligned version of the class.
5274 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5275 ErrInfo = "Subtarget requires even aligned vector registers";
5276 return false;
5277 }
5278 }
5279
5280 if (RegClass != -1) {
5281 if (Reg.isVirtual())
5282 continue;
5283
5284 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5285 if (!RC->contains(Reg)) {
5286 ErrInfo = "Operand has incorrect register class.";
5287 return false;
5288 }
5289 }
5290 }
5291
5292 // Verify SDWA
5293 if (isSDWA(MI)) {
5294 if (!ST.hasSDWA()) {
5295 ErrInfo = "SDWA is not supported on this target";
5296 return false;
5297 }
5298
5299 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5300 AMDGPU::OpName::dst_sel}) {
5301 const MachineOperand *MO = getNamedOperand(MI, Op);
5302 if (!MO)
5303 continue;
5304 int64_t Imm = MO->getImm();
5305 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5306 ErrInfo = "Invalid SDWA selection";
5307 return false;
5308 }
5309 }
5310
5311 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5312
5313 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5314 if (OpIdx == -1)
5315 continue;
5316 const MachineOperand &MO = MI.getOperand(OpIdx);
5317
5318 if (!ST.hasSDWAScalar()) {
5319 // Only VGPRs on VI
5320 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5321 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5322 return false;
5323 }
5324 } else {
5325 // No immediates on GFX9
5326 if (!MO.isReg()) {
5327 ErrInfo =
5328 "Only reg allowed as operands in SDWA instructions on GFX9+";
5329 return false;
5330 }
5331 }
5332 }
5333
5334 if (!ST.hasSDWAOmod()) {
5335 // No omod allowed on VI
5336 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5337 if (OMod != nullptr &&
5338 (!OMod->isImm() || OMod->getImm() != 0)) {
5339 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5340 return false;
5341 }
5342 }
5343
5344 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5345 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5346 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5347 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5348 const MachineOperand *Src0ModsMO =
5349 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5350 unsigned Mods = Src0ModsMO->getImm();
5351 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5352 Mods & SISrcMods::SEXT) {
5353 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5354 return false;
5355 }
5356 }
5357
5358 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5359 if (isVOPC(BasicOpcode)) {
5360 if (!ST.hasSDWASdst() && DstIdx != -1) {
5361 // Only vcc allowed as dst on VI for VOPC
5362 const MachineOperand &Dst = MI.getOperand(DstIdx);
5363 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5364 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5365 return false;
5366 }
5367 } else if (!ST.hasSDWAOutModsVOPC()) {
5368 // No clamp allowed on GFX9 for VOPC
5369 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5370 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5371 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5372 return false;
5373 }
5374
5375 // No omod allowed on GFX9 for VOPC
5376 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5377 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5378 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5379 return false;
5380 }
5381 }
5382 }
5383
5384 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5385 if (DstUnused && DstUnused->isImm() &&
5386 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5387 const MachineOperand &Dst = MI.getOperand(DstIdx);
5388 if (!Dst.isReg() || !Dst.isTied()) {
5389 ErrInfo = "Dst register should have tied register";
5390 return false;
5391 }
5392
5393 const MachineOperand &TiedMO =
5394 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5395 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5396 ErrInfo =
5397 "Dst register should be tied to implicit use of preserved register";
5398 return false;
5399 }
5400 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5401 ErrInfo = "Dst register should use same physical register as preserved";
5402 return false;
5403 }
5404 }
5405 }
5406
5407 // Verify MIMG / VIMAGE / VSAMPLE
5408 if (isImage(Opcode) && !MI.mayStore()) {
5409 // Ensure that the return type used is large enough for all the options
5410 // being used. TFE/LWE require an extra result register.
5411 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5412 if (DMask) {
5413 uint64_t DMaskImm = DMask->getImm();
5414 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5415 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5416 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5417 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5418
5419 // Adjust for packed 16 bit values
5420 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5421 RegCount = divideCeil(RegCount, 2);
5422
5423 // Adjust if using LWE or TFE
5424 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5425 RegCount += 1;
5426
5427 const uint32_t DstIdx =
5428 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5429 const MachineOperand &Dst = MI.getOperand(DstIdx);
5430 if (Dst.isReg()) {
5431 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5432 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5433 if (RegCount > DstSize) {
5434 ErrInfo = "Image instruction returns too many registers for dst "
5435 "register class";
5436 return false;
5437 }
5438 }
5439 }
5440 }
5441
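// A VALU instruction can read only a limited number of SGPRs and at most one
// unique literal through the scalar constant bus; the checks below count those
// uses against the subtarget's constant bus limit.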
5442 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5443 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5444 unsigned ConstantBusCount = 0;
5445 bool UsesLiteral = false;
5446 const MachineOperand *LiteralVal = nullptr;
5447
5448 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5449 if (ImmIdx != -1) {
5450 ++ConstantBusCount;
5451 UsesLiteral = true;
5452 LiteralVal = &MI.getOperand(ImmIdx);
5453 }
5454
5455 SmallVector<Register, 2> SGPRsUsed;
5456 Register SGPRUsed;
5457
5458 // Only look at the true operands. Only a real operand can use the constant
5459 // bus, and we don't want to check pseudo-operands like the source modifier
5460 // flags.
5461 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5462 if (OpIdx == -1)
5463 continue;
5464 const MachineOperand &MO = MI.getOperand(OpIdx);
5465 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5466 if (MO.isReg()) {
5467 SGPRUsed = MO.getReg();
5468 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5469 ++ConstantBusCount;
5470 SGPRsUsed.push_back(SGPRUsed);
5471 }
5472 } else if (!MO.isFI()) { // Treat FI like a register.
5473 if (!UsesLiteral) {
5474 ++ConstantBusCount;
5475 UsesLiteral = true;
5476 LiteralVal = &MO;
5477 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5478 assert(isVOP2(MI) || isVOP3(MI));
5479 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5480 return false;
5481 }
5482 }
5483 }
5484 }
5485
5486 SGPRUsed = findImplicitSGPRRead(MI);
5487 if (SGPRUsed) {
5488 // Implicit uses may safely overlap true operands
5489 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5490 return !RI.regsOverlap(SGPRUsed, SGPR);
5491 })) {
5492 ++ConstantBusCount;
5493 SGPRsUsed.push_back(SGPRUsed);
5494 }
5495 }
5496
5497 // v_writelane_b32 is an exception to the constant bus restriction:
5498 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5499 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5500 Opcode != AMDGPU::V_WRITELANE_B32) {
5501 ErrInfo = "VOP* instruction violates constant bus restriction";
5502 return false;
5503 }
5504
5505 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5506 ErrInfo = "VOP3 instruction uses literal";
5507 return false;
5508 }
5509 }
5510
5511 // Special case for writelane - this can break the multiple constant bus rule,
5512 // but still can't use more than one SGPR register
5513 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5514 unsigned SGPRCount = 0;
5515 Register SGPRUsed;
5516
5517 for (int OpIdx : {Src0Idx, Src1Idx}) {
5518 if (OpIdx == -1)
5519 break;
5520
5521 const MachineOperand &MO = MI.getOperand(OpIdx);
5522
5523 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5524 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5525 if (MO.getReg() != SGPRUsed)
5526 ++SGPRCount;
5527 SGPRUsed = MO.getReg();
5528 }
5529 }
5530 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5531 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5532 return false;
5533 }
5534 }
5535 }
5536
5537 // Verify misc. restrictions on specific instructions.
5538 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5539 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5540 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5541 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5542 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5543 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5544 if (!compareMachineOp(Src0, Src1) &&
5545 !compareMachineOp(Src0, Src2)) {
5546 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5547 return false;
5548 }
5549 }
5550 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5551 SISrcMods::ABS) ||
5552 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5553 SISrcMods::ABS) ||
5554 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5555 SISrcMods::ABS)) {
5556 ErrInfo = "ABS not allowed in VOP3B instructions";
5557 return false;
5558 }
5559 }
5560
5561 if (isSOP2(MI) || isSOPC(MI)) {
5562 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5563 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5564
5565 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5566 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5567 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5568 !Src0.isIdenticalTo(Src1)) {
5569 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5570 return false;
5571 }
5572 }
5573
5574 if (isSOPK(MI)) {
5575 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5576 if (Desc.isBranch()) {
5577 if (!Op->isMBB()) {
5578 ErrInfo = "invalid branch target for SOPK instruction";
5579 return false;
5580 }
5581 } else {
5582 uint64_t Imm = Op->getImm();
5583 if (sopkIsZext(Opcode)) {
5584 if (!isUInt<16>(Imm)) {
5585 ErrInfo = "invalid immediate for SOPK instruction";
5586 return false;
5587 }
5588 } else {
5589 if (!isInt<16>(Imm)) {
5590 ErrInfo = "invalid immediate for SOPK instruction";
5591 return false;
5592 }
5593 }
5594 }
5595 }
5596
5597 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5598 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5599 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5600 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5601 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5602 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5603
5604 const unsigned StaticNumOps =
5605 Desc.getNumOperands() + Desc.implicit_uses().size();
5606 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5607
5608 // Require additional implicit operands. This allows a fixup done by the
5609 // post RA scheduler where the main implicit operand is killed and
5610 // implicit-defs are added for sub-registers that remain live after this
5611 // instruction.
5612 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5613 ErrInfo = "missing implicit register operands";
5614 return false;
5615 }
5616
5617 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5618 if (IsDst) {
5619 if (!Dst->isUse()) {
5620 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5621 return false;
5622 }
5623
5624 unsigned UseOpIdx;
5625 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5626 UseOpIdx != StaticNumOps + 1) {
5627 ErrInfo = "movrel implicit operands should be tied";
5628 return false;
5629 }
5630 }
5631
5632 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5633 const MachineOperand &ImpUse
5634 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5635 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5636 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5637 ErrInfo = "src0 should be subreg of implicit vector use";
5638 return false;
5639 }
5640 }
5641
5642 // Make sure we aren't losing exec uses in the td files. This mostly requires
5643 // being careful when using let Uses to try to add other use registers.
5644 if (shouldReadExec(MI)) {
5645 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5646 ErrInfo = "VALU instruction does not implicitly read exec mask";
5647 return false;
5648 }
5649 }
5650
5651 if (isSMRD(MI)) {
5652 if (MI.mayStore() &&
5653 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5654 // The register offset form of scalar stores may only use m0 as the
5655 // soffset register.
5656 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5657 if (Soff && Soff->getReg() != AMDGPU::M0) {
5658 ErrInfo = "scalar stores must use m0 as offset register";
5659 return false;
5660 }
5661 }
5662 }
5663
5664 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5665 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5666 if (Offset->getImm() != 0) {
5667 ErrInfo = "subtarget does not support offsets in flat instructions";
5668 return false;
5669 }
5670 }
5671
5672 if (isDS(MI) && !ST.hasGDS()) {
5673 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5674 if (GDSOp && GDSOp->getImm() != 0) {
5675 ErrInfo = "GDS is not supported on this subtarget";
5676 return false;
5677 }
5678 }
5679
5680 if (isImage(MI)) {
5681 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5682 if (DimOp) {
5683 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5684 AMDGPU::OpName::vaddr0);
5685 AMDGPU::OpName RSrcOpName =
5686 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5687 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5688 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5689 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5690 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5691 const AMDGPU::MIMGDimInfo *Dim =
5692 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5693
5694 if (!Dim) {
5695 ErrInfo = "dim is out of range";
5696 return false;
5697 }
5698
5699 bool IsA16 = false;
5700 if (ST.hasR128A16()) {
5701 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5702 IsA16 = R128A16->getImm() != 0;
5703 } else if (ST.hasA16()) {
5704 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5705 IsA16 = A16->getImm() != 0;
5706 }
5707
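// The address is in NSA (non-sequential address) form when more than one
// vaddr operand precedes the resource descriptor; otherwise all address
// components are packed into the single vaddr0 tuple.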
5708 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5709
5710 unsigned AddrWords =
5711 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5712
5713 unsigned VAddrWords;
5714 if (IsNSA) {
5715 VAddrWords = RsrcIdx - VAddr0Idx;
5716 if (ST.hasPartialNSAEncoding() &&
5717 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5718 unsigned LastVAddrIdx = RsrcIdx - 1;
5719 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5720 }
5721 } else {
5722 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5723 if (AddrWords > 12)
5724 AddrWords = 16;
5725 }
5726
5727 if (VAddrWords != AddrWords) {
5728 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5729 << " but got " << VAddrWords << "\n");
5730 ErrInfo = "bad vaddr size";
5731 return false;
5732 }
5733 }
5734 }
5735
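// Validate dpp_ctrl against the controls that exist on this generation:
// several encodings are unused, and wavefront shifts/broadcasts were removed
// on GFX10+.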
5736 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5737 if (DppCt) {
5738 using namespace AMDGPU::DPP;
5739
5740 unsigned DC = DppCt->getImm();
5741 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5742 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5743 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5744 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5745 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5746 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5747 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5748 ErrInfo = "Invalid dpp_ctrl value";
5749 return false;
5750 }
5751 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5752 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5753 ErrInfo = "Invalid dpp_ctrl value: "
5754 "wavefront shifts are not supported on GFX10+";
5755 return false;
5756 }
5757 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5758 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5759 ErrInfo = "Invalid dpp_ctrl value: "
5760 "broadcasts are not supported on GFX10+";
5761 return false;
5762 }
5763 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5764 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5765 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5766 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5767 !ST.hasGFX90AInsts()) {
5768 ErrInfo = "Invalid dpp_ctrl value: "
5769 "row_newbroadcast/row_share is not supported before "
5770 "GFX90A/GFX10";
5771 return false;
5772 }
5773 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5774 ErrInfo = "Invalid dpp_ctrl value: "
5775 "row_share and row_xmask are not supported before GFX10";
5776 return false;
5777 }
5778 }
5779
5780 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5782 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5783 ErrInfo = "Invalid dpp_ctrl value: "
5784 "DP ALU dpp only support row_newbcast";
5785 return false;
5786 }
5787 }
5788
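// For loads and stores, vdst/vdata/data1 must consistently be VGPRs or
// AGPRs; per the checks below, AGPR memory operands require gfx90a
// instructions.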
5789 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5790 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5791 AMDGPU::OpName DataName =
5792 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5793 const MachineOperand *Data = getNamedOperand(MI, DataName);
5794 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5795 if (Data && !Data->isReg())
5796 Data = nullptr;
5797
5798 if (ST.hasGFX90AInsts()) {
5799 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5800 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5801 ErrInfo = "Invalid register class: "
5802 "vdata and vdst should be both VGPR or AGPR";
5803 return false;
5804 }
5805 if (Data && Data2 &&
5806 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5807 ErrInfo = "Invalid register class: "
5808 "both data operands should be VGPR or AGPR";
5809 return false;
5810 }
5811 } else {
5812 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5813 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5814 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5815 ErrInfo = "Invalid register class: "
5816 "agpr loads and stores not supported on this GPU";
5817 return false;
5818 }
5819 }
5820 }
5821
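// Some subtargets require multi-dword vector operands to start at an even
// register; check the hardware register index (or the sub-register channel
// for wide virtual registers) of DS_GWS data and image vaddr operands.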
5822 if (ST.needsAlignedVGPRs()) {
5823 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5825 if (!Op)
5826 return true;
5827 Register Reg = Op->getReg();
5828 if (Reg.isPhysical())
5829 return !(RI.getHWRegIndex(Reg) & 1);
5830 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5831 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5832 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5833 };
5834
5835 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5836 Opcode == AMDGPU::DS_GWS_BARRIER) {
5837
5838 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5839 ErrInfo = "Subtarget requires even aligned vector registers "
5840 "for DS_GWS instructions";
5841 return false;
5842 }
5843 }
5844
5845 if (isMIMG(MI)) {
5846 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5847 ErrInfo = "Subtarget requires even aligned vector registers "
5848 "for vaddr operand of image instructions";
5849 return false;
5850 }
5851 }
5852 }
5853
5854 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5855 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5856 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5857 ErrInfo = "Invalid register class: "
5858 "v_accvgpr_write with an SGPR is not supported on this GPU";
5859 return false;
5860 }
5861 }
5862
5863 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5864 const MachineOperand &SrcOp = MI.getOperand(1);
5865 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5866 ErrInfo = "pseudo expects only physical SGPRs";
5867 return false;
5868 }
5869 }
5870
5871 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5872 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5873 if (!ST.hasScaleOffset()) {
5874 ErrInfo = "Subtarget does not support offset scaling";
5875 return false;
5876 }
5877 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5878 ErrInfo = "Instruction does not support offset scaling";
5879 return false;
5880 }
5881 }
5882 }
5883
5884 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5885 // information.
5886 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5887 for (unsigned I = 0; I < 3; ++I) {
5888 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
5889 return false;
5890 }
5891 }
5892
5893 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5894 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5895 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5896 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5897 &AMDGPU::SReg_64RegClass) ||
5898 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5899 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5900 return false;
5901 }
5902 }
5903
5904 return true;
5905}
5906
5907// It is more readable to list mapped opcodes on the same line.
5908// clang-format off
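// Map a scalar (SALU) opcode to the VALU opcode used when the instruction has
// to be moved to the vector pipeline; AMDGPU::INSTRUCTION_LIST_END means there
// is no direct equivalent.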
5909
5910 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5911 switch (MI.getOpcode()) {
5912 default: return AMDGPU::INSTRUCTION_LIST_END;
5913 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5914 case AMDGPU::COPY: return AMDGPU::COPY;
5915 case AMDGPU::PHI: return AMDGPU::PHI;
5916 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5917 case AMDGPU::WQM: return AMDGPU::WQM;
5918 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5919 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5920 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5921 case AMDGPU::S_MOV_B32: {
5922 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5923 return MI.getOperand(1).isReg() ||
5924 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5925 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5926 }
5927 case AMDGPU::S_ADD_I32:
5928 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5929 case AMDGPU::S_ADDC_U32:
5930 return AMDGPU::V_ADDC_U32_e32;
5931 case AMDGPU::S_SUB_I32:
5932 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5933 // FIXME: These are not consistently handled, and selected when the carry is
5934 // used.
5935 case AMDGPU::S_ADD_U32:
5936 return AMDGPU::V_ADD_CO_U32_e32;
5937 case AMDGPU::S_SUB_U32:
5938 return AMDGPU::V_SUB_CO_U32_e32;
5939 case AMDGPU::S_ADD_U64_PSEUDO:
5940 return AMDGPU::V_ADD_U64_PSEUDO;
5941 case AMDGPU::S_SUB_U64_PSEUDO:
5942 return AMDGPU::V_SUB_U64_PSEUDO;
5943 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5944 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5945 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5946 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5947 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5948 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5949 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5950 case AMDGPU::S_XNOR_B32:
5951 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5952 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5953 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5954 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5955 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5956 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5957 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5958 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5959 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5960 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5961 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5962 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5963 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5964 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5965 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5966 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5967 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5968 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5969 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5970 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5971 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5972 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5973 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5974 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5975 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5976 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5977 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5978 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5979 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5980 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5981 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5982 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5983 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5984 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5985 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5986 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5987 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5988 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5989 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5990 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5991 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5992 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5993 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5994 case AMDGPU::S_CVT_F32_F16:
5995 case AMDGPU::S_CVT_HI_F32_F16:
5996 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5997 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5998 case AMDGPU::S_CVT_F16_F32:
5999 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6000 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6001 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6002 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6003 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6004 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6005 case AMDGPU::S_CEIL_F16:
6006 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6007 : AMDGPU::V_CEIL_F16_fake16_e64;
6008 case AMDGPU::S_FLOOR_F16:
6009 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6010 : AMDGPU::V_FLOOR_F16_fake16_e64;
6011 case AMDGPU::S_TRUNC_F16:
6012 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6013 : AMDGPU::V_TRUNC_F16_fake16_e64;
6014 case AMDGPU::S_RNDNE_F16:
6015 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6016 : AMDGPU::V_RNDNE_F16_fake16_e64;
6017 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6018 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6019 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6020 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6021 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6022 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6023 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6024 case AMDGPU::S_ADD_F16:
6025 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6026 : AMDGPU::V_ADD_F16_fake16_e64;
6027 case AMDGPU::S_SUB_F16:
6028 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6029 : AMDGPU::V_SUB_F16_fake16_e64;
6030 case AMDGPU::S_MIN_F16:
6031 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6032 : AMDGPU::V_MIN_F16_fake16_e64;
6033 case AMDGPU::S_MAX_F16:
6034 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6035 : AMDGPU::V_MAX_F16_fake16_e64;
6036 case AMDGPU::S_MINIMUM_F16:
6037 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6038 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6039 case AMDGPU::S_MAXIMUM_F16:
6040 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6041 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6042 case AMDGPU::S_MUL_F16:
6043 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6044 : AMDGPU::V_MUL_F16_fake16_e64;
6045 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6046 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6047 case AMDGPU::S_FMAC_F16:
6048 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6049 : AMDGPU::V_FMAC_F16_fake16_e64;
6050 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6051 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6052 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6053 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6054 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6055 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6056 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6057 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6058 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6059 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6060 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6061 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6062 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6063 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6064 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6065 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6066 case AMDGPU::S_CMP_LT_F16:
6067 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6068 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6069 case AMDGPU::S_CMP_EQ_F16:
6070 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6071 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6072 case AMDGPU::S_CMP_LE_F16:
6073 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6074 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6075 case AMDGPU::S_CMP_GT_F16:
6076 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6077 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6078 case AMDGPU::S_CMP_LG_F16:
6079 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6080 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6081 case AMDGPU::S_CMP_GE_F16:
6082 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6083 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6084 case AMDGPU::S_CMP_O_F16:
6085 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6086 : AMDGPU::V_CMP_O_F16_fake16_e64;
6087 case AMDGPU::S_CMP_U_F16:
6088 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6089 : AMDGPU::V_CMP_U_F16_fake16_e64;
6090 case AMDGPU::S_CMP_NGE_F16:
6091 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6092 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6093 case AMDGPU::S_CMP_NLG_F16:
6094 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6095 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6096 case AMDGPU::S_CMP_NGT_F16:
6097 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6098 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6099 case AMDGPU::S_CMP_NLE_F16:
6100 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6101 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6102 case AMDGPU::S_CMP_NEQ_F16:
6103 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6104 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6105 case AMDGPU::S_CMP_NLT_F16:
6106 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6107 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6108 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6109 case AMDGPU::V_S_EXP_F16_e64:
6110 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6111 : AMDGPU::V_EXP_F16_fake16_e64;
6112 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6113 case AMDGPU::V_S_LOG_F16_e64:
6114 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6115 : AMDGPU::V_LOG_F16_fake16_e64;
6116 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6117 case AMDGPU::V_S_RCP_F16_e64:
6118 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6119 : AMDGPU::V_RCP_F16_fake16_e64;
6120 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6121 case AMDGPU::V_S_RSQ_F16_e64:
6122 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6123 : AMDGPU::V_RSQ_F16_fake16_e64;
6124 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6125 case AMDGPU::V_S_SQRT_F16_e64:
6126 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6127 : AMDGPU::V_SQRT_F16_fake16_e64;
6128 }
6130 "Unexpected scalar opcode without corresponding vector one!");
6131}
6132
6133// clang-format on
6134
6135 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6136 MachineBasicBlock &MBB,
6137 MachineBasicBlock::iterator MBBI,
6138 const DebugLoc &DL, Register Reg,
6139 bool IsSCCLive,
6140 SlotIndexes *Indexes) const {
6141 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6142 const SIInstrInfo *TII = ST.getInstrInfo();
6144 if (IsSCCLive) {
6145 // Insert two move instructions, one to save the original value of EXEC and
6146 // the other to turn on all bits in EXEC. This is required as we can't use
6147 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6148 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6149 .addReg(LMC.ExecReg);
6150 auto FlipExecMI =
6151 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6152 if (Indexes) {
6153 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6154 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6155 }
6156 } else {
6157 auto SaveExec =
6158 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6159 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6160 if (Indexes)
6161 Indexes->insertMachineInstrInMaps(*SaveExec);
6162 }
6163}
6164
6165 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6166 MachineBasicBlock::iterator MBBI,
6167 const DebugLoc &DL, Register Reg,
6168 SlotIndexes *Indexes) const {
6170 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6171 .addReg(Reg, RegState::Kill);
6172 if (Indexes)
6173 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6174}
6175
6179 "Not a whole wave func");
6180 MachineBasicBlock &MBB = *MF.begin();
6181 for (MachineInstr &MI : MBB)
6182 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6183 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6184 return &MI;
6185
6186 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6187}
6188
6189 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6190 unsigned OpNo) const {
6191 const MCInstrDesc &Desc = get(MI.getOpcode());
6192 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6193 Desc.operands()[OpNo].RegClass == -1) {
6194 Register Reg = MI.getOperand(OpNo).getReg();
6195
6196 if (Reg.isVirtual()) {
6197 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6198 return MRI.getRegClass(Reg);
6199 }
6200 return RI.getPhysRegBaseClass(Reg);
6201 }
6202
6203 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6204 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6205}
6206
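// Legalize an operand by materializing its value into a fresh virtual
// register of the VGPR-equivalent class (via a move or copy) and rewriting the
// operand to use that register.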
6209 MachineBasicBlock *MBB = MI.getParent();
6210 MachineOperand &MO = MI.getOperand(OpIdx);
6211 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6212 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6213 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6214 unsigned Size = RI.getRegSizeInBits(*RC);
6215 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6216 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6217 : AMDGPU::V_MOV_B32_e32;
6218 if (MO.isReg())
6219 Opcode = AMDGPU::COPY;
6220 else if (RI.isSGPRClass(RC))
6221 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6222
6223 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6224 Register Reg = MRI.createVirtualRegister(VRC);
6225 DebugLoc DL = MBB->findDebugLoc(I);
6226 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6227 MO.ChangeToRegister(Reg, false);
6228}
6229
6232 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6233 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6234 if (!SuperReg.getReg().isVirtual())
6235 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6236
6237 MachineBasicBlock *MBB = MI->getParent();
6238 const DebugLoc &DL = MI->getDebugLoc();
6239 Register SubReg = MRI.createVirtualRegister(SubRC);
6240
6241 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6242 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6243 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6244 return SubReg;
6245}
6246
6249 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6250 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6251 if (Op.isImm()) {
6252 if (SubIdx == AMDGPU::sub0)
6253 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6254 if (SubIdx == AMDGPU::sub1)
6255 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6256
6257 llvm_unreachable("Unhandled register index for immediate");
6258 }
6259
6260 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6261 SubIdx, SubRC);
6262 return MachineOperand::CreateReg(SubReg, false);
6263}
6264
6265// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6266void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6267 assert(Inst.getNumExplicitOperands() == 3);
6268 MachineOperand Op1 = Inst.getOperand(1);
6269 Inst.removeOperand(1);
6270 Inst.addOperand(Op1);
6271}
6272
6273 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6274 const MCOperandInfo &OpInfo,
6275 const MachineOperand &MO) const {
6276 if (!MO.isReg())
6277 return false;
6278
6279 Register Reg = MO.getReg();
6280
6281 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6282 if (Reg.isPhysical())
6283 return DRC->contains(Reg);
6284
6285 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6286
6287 if (MO.getSubReg()) {
6288 const MachineFunction *MF = MO.getParent()->getMF();
6289 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6290 if (!SuperRC)
6291 return false;
6292 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6293 }
6294
6295 return RI.getCommonSubClass(DRC, RC) != nullptr;
6296}
6297
6298 bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6299 const MachineOperand &MO) const {
6300 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6301 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6302 unsigned Opc = MI.getOpcode();
6303
6304 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6305 // information.
6306 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6307 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6308 constexpr AMDGPU::OpName OpNames[] = {
6309 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6310
6311 for (auto [I, OpName] : enumerate(OpNames)) {
6312 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6313 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6314 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6315 return false;
6316 }
6317 }
6318
6319 if (!isLegalRegOperand(MRI, OpInfo, MO))
6320 return false;
6321
6322 // check Accumulate GPR operand
6323 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6324 if (IsAGPR && !ST.hasMAIInsts())
6325 return false;
6326 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6327 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6328 return false;
6329 // Atomics should have both vdst and vdata either vgpr or agpr.
6330 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6331 const int DataIdx = AMDGPU::getNamedOperandIdx(
6332 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6333 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6334 MI.getOperand(DataIdx).isReg() &&
6335 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6336 return false;
6337 if ((int)OpIdx == DataIdx) {
6338 if (VDstIdx != -1 &&
6339 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6340 return false;
6341 // DS instructions with 2 src operands also must have tied RC.
6342 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6343 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6344 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6345 return false;
6346 }
6347
6348 // Check V_ACCVGPR_WRITE_B32_e64
6349 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6350 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6351 RI.isSGPRReg(MRI, MO.getReg()))
6352 return false;
6353
6354 if (ST.hasFlatScratchHiInB64InstHazard() &&
6355 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6356 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6357 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6358 64)
6359 return false;
6360 }
6361 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6362 return false;
6363 }
6364
6365 return true;
6366}
6367
6368 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6369 const MCOperandInfo &OpInfo,
6370 const MachineOperand &MO) const {
6371 if (MO.isReg())
6372 return isLegalRegOperand(MRI, OpInfo, MO);
6373
6374 // Handle non-register types that are treated like immediates.
6375 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6376 return true;
6377}
6378
6379 bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6380 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6381 const MachineOperand *MO) const {
6382 constexpr unsigned NumOps = 3;
6383 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6384 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6385 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6386 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6387
6388 assert(SrcN < NumOps);
6389
6390 if (!MO) {
6391 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6392 if (SrcIdx == -1)
6393 return true;
6394 MO = &MI.getOperand(SrcIdx);
6395 }
6396
6397 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6398 return true;
6399
6400 int ModsIdx =
6401 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6402 if (ModsIdx == -1)
6403 return true;
6404
6405 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6406 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6407 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6408
6409 return !OpSel && !OpSelHi;
6410}
6411
6412 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6413 const MachineOperand *MO) const {
6414 const MachineFunction &MF = *MI.getMF();
6415 const MachineRegisterInfo &MRI = MF.getRegInfo();
6416 const MCInstrDesc &InstDesc = MI.getDesc();
6417 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6418 int64_t RegClass = getOpRegClassID(OpInfo);
6419 const TargetRegisterClass *DefinedRC =
6420 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6421 if (!MO)
6422 MO = &MI.getOperand(OpIdx);
6423
6424 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6425
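// For VALU users, accepting this operand must not exceed the constant bus or
// literal limits; scan the other operands and count the SGPR and literal uses
// that would share the bus.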
6426 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6427 const MachineOperand *UsedLiteral = nullptr;
6428
6429 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6430 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6431
6432 // TODO: Be more permissive with frame indexes.
6433 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6434 if (!LiteralLimit--)
6435 return false;
6436
6437 UsedLiteral = MO;
6438 }
6439
6440 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6441 if (MO->isReg())
6442 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6443
6444 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6445 if (i == OpIdx)
6446 continue;
6447 const MachineOperand &Op = MI.getOperand(i);
6448 if (Op.isReg()) {
6449 if (Op.isUse()) {
6450 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6451 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6452 if (--ConstantBusLimit <= 0)
6453 return false;
6454 }
6455 }
6456 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6457 !isInlineConstant(Op, InstDesc.operands()[i])) {
6458 // The same literal may be used multiple times.
6459 if (!UsedLiteral)
6460 UsedLiteral = &Op;
6461 else if (UsedLiteral->isIdenticalTo(Op))
6462 continue;
6463
6464 if (!LiteralLimit--)
6465 return false;
6466 if (--ConstantBusLimit <= 0)
6467 return false;
6468 }
6469 }
6470 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6471 // There can be at most one literal operand, but it can be repeated.
6472 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6473 if (i == OpIdx)
6474 continue;
6475 const MachineOperand &Op = MI.getOperand(i);
6476 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6477 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6478 !Op.isIdenticalTo(*MO))
6479 return false;
6480
6481 // Do not fold a non-inlineable and non-register operand into an
6482 // instruction that already has a frame index. The frame index handling
6483 // code cannot handle the case where a frame index co-exists with another
6484 // non-register operand, unless that operand is an inlineable immediate.
6485 if (Op.isFI())
6486 return false;
6487 }
6488 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6489 isF16PseudoScalarTrans(MI.getOpcode())) {
6490 return false;
6491 }
6492
6493 if (MO->isReg()) {
6494 if (!DefinedRC)
6495 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6496 return isLegalRegOperand(MI, OpIdx, *MO);
6497 }
6498
6499 if (MO->isImm()) {
6500 uint64_t Imm = MO->getImm();
6501 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6502 bool Is64BitOp = Is64BitFPOp ||
6503 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6504 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6505 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6506 if (Is64BitOp &&
6507 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6508 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6509 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6510 return false;
6511
6512 // FIXME: We can use sign extended 64-bit literals, but only for signed
6513 // operands. At the moment we do not know if an operand is signed.
6514 // Such operand will be encoded as its low 32 bits and then either
6515 // correctly sign extended or incorrectly zero extended by HW.
6516 // If 64-bit literals are supported and the literal will be encoded
6517 // as full 64 bit we still can use it.
6518 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6519 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6520 return false;
6521 }
6522 }
6523
6524 // Handle non-register types that are treated like immediates.
6525 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6526
6527 if (!DefinedRC) {
6528 // This operand expects an immediate.
6529 return true;
6530 }
6531
6532 return isImmOperandLegal(MI, OpIdx, *MO);
6533}
6534
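// Only meaningful on gfx940/gfx950 subtargets: returns true for
// transcendental, dot-product, MFMA and the packed-math VALU opcodes listed
// below, and false otherwise.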
6535 bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
6536 bool IsGFX950Only = ST.hasGFX950Insts();
6537 bool IsGFX940Only = ST.hasGFX940Insts();
6538
6539 if (!IsGFX950Only && !IsGFX940Only)
6540 return false;
6541
6542 if (!isVALU(MI))
6543 return false;
6544
6545 // V_COS, V_EXP, V_RCP, etc.
6546 if (isTRANS(MI))
6547 return true;
6548
6549 // DOT2, DOT2C, DOT4, etc.
6550 if (isDOT(MI))
6551 return true;
6552
6553 // MFMA, SMFMA
6554 if (isMFMA(MI))
6555 return true;
6556
6557 unsigned Opcode = MI.getOpcode();
6558 switch (Opcode) {
6559 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6560 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6561 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6562 case AMDGPU::V_MQSAD_U32_U8_e64:
6563 case AMDGPU::V_PK_ADD_F16:
6564 case AMDGPU::V_PK_ADD_F32:
6565 case AMDGPU::V_PK_ADD_I16:
6566 case AMDGPU::V_PK_ADD_U16:
6567 case AMDGPU::V_PK_ASHRREV_I16:
6568 case AMDGPU::V_PK_FMA_F16:
6569 case AMDGPU::V_PK_FMA_F32:
6570 case AMDGPU::V_PK_FMAC_F16_e32:
6571 case AMDGPU::V_PK_FMAC_F16_e64:
6572 case AMDGPU::V_PK_LSHLREV_B16:
6573 case AMDGPU::V_PK_LSHRREV_B16:
6574 case AMDGPU::V_PK_MAD_I16:
6575 case AMDGPU::V_PK_MAD_U16:
6576 case AMDGPU::V_PK_MAX_F16:
6577 case AMDGPU::V_PK_MAX_I16:
6578 case AMDGPU::V_PK_MAX_U16:
6579 case AMDGPU::V_PK_MIN_F16:
6580 case AMDGPU::V_PK_MIN_I16:
6581 case AMDGPU::V_PK_MIN_U16:
6582 case AMDGPU::V_PK_MOV_B32:
6583 case AMDGPU::V_PK_MUL_F16:
6584 case AMDGPU::V_PK_MUL_F32:
6585 case AMDGPU::V_PK_MUL_LO_U16:
6586 case AMDGPU::V_PK_SUB_I16:
6587 case AMDGPU::V_PK_SUB_U16:
6588 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6589 return true;
6590 default:
6591 return false;
6592 }
6593}
6594
6595 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6596 MachineInstr &MI) const {
6597 unsigned Opc = MI.getOpcode();
6598 const MCInstrDesc &InstrDesc = get(Opc);
6599
6600 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6601 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6602
6603 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6604 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6605
6606 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6607 // we need to only have one constant bus use before GFX10.
6608 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6609 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6610 RI.isSGPRReg(MRI, Src0.getReg()))
6611 legalizeOpWithMove(MI, Src0Idx);
6612
6613 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6614 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6615 // src0/src1 with V_READFIRSTLANE.
6616 if (Opc == AMDGPU::V_WRITELANE_B32) {
6617 const DebugLoc &DL = MI.getDebugLoc();
6618 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6619 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6620 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6621 .add(Src0);
6622 Src0.ChangeToRegister(Reg, false);
6623 }
6624 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6625 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6626 const DebugLoc &DL = MI.getDebugLoc();
6627 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6628 .add(Src1);
6629 Src1.ChangeToRegister(Reg, false);
6630 }
6631 return;
6632 }
6633
6634 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6635 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6636 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6637 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6638 legalizeOpWithMove(MI, Src2Idx);
6639 }
6640
6641 // VOP2 src0 instructions support all operand types, so we don't need to check
6642 // their legality. If src1 is already legal, we don't need to do anything.
6643 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6644 return;
6645
6646 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6647 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6648 // select is uniform.
6649 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6650 RI.isVGPR(MRI, Src1.getReg())) {
6651 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6652 const DebugLoc &DL = MI.getDebugLoc();
6653 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6654 .add(Src1);
6655 Src1.ChangeToRegister(Reg, false);
6656 return;
6657 }
6658
6659 // We do not use commuteInstruction here because it is too aggressive and will
6660 // commute if it is possible. We only want to commute here if it improves
6661 // legality. This can be called a fairly large number of times so don't waste
6662 // compile time pointlessly swapping and checking legality again.
6663 if (HasImplicitSGPR || !MI.isCommutable()) {
6664 legalizeOpWithMove(MI, Src1Idx);
6665 return;
6666 }
6667
6668 // If src0 can be used as src1, commuting will make the operands legal.
6669 // Otherwise we have to give up and insert a move.
6670 //
6671 // TODO: Other immediate-like operand kinds could be commuted if there was a
6672 // MachineOperand::ChangeTo* for them.
6673 if ((!Src1.isImm() && !Src1.isReg()) ||
6674 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6675 legalizeOpWithMove(MI, Src1Idx);
6676 return;
6677 }
6678
6679 int CommutedOpc = commuteOpcode(MI);
6680 if (CommutedOpc == -1) {
6681 legalizeOpWithMove(MI, Src1Idx);
6682 return;
6683 }
6684
6685 MI.setDesc(get(CommutedOpc));
6686
6687 Register Src0Reg = Src0.getReg();
6688 unsigned Src0SubReg = Src0.getSubReg();
6689 bool Src0Kill = Src0.isKill();
6690
6691 if (Src1.isImm())
6692 Src0.ChangeToImmediate(Src1.getImm());
6693 else if (Src1.isReg()) {
6694 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6695 Src0.setSubReg(Src1.getSubReg());
6696 } else
6697 llvm_unreachable("Should only have register or immediate operands");
6698
6699 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6700 Src1.setSubReg(Src0SubReg);
6701 fixImplicitOperands(MI);
6702}
6703
6704// Legalize VOP3 operands. All operand types are supported for any operand
6705// but only one literal constant and only starting from GFX10.
6706 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6707 MachineInstr &MI) const {
6708 unsigned Opc = MI.getOpcode();
6709
6710 int VOP3Idx[3] = {
6711 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6712 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6713 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6714 };
6715
6716 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6717 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6718 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6719 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6720 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6721 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6722 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6723 // src1 and src2 must be scalar
6724 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6725 const DebugLoc &DL = MI.getDebugLoc();
6726 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6727 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6728 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6729 .add(Src1);
6730 Src1.ChangeToRegister(Reg, false);
6731 }
6732 if (VOP3Idx[2] != -1) {
6733 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6734 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6735 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6736 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6737 .add(Src2);
6738 Src2.ChangeToRegister(Reg, false);
6739 }
6740 }
6741 }
6742
6743 // Find the one SGPR operand we are allowed to use.
6744 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6745 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6746 SmallDenseSet<unsigned> SGPRsUsed;
6747 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6748 if (SGPRReg) {
6749 SGPRsUsed.insert(SGPRReg);
6750 --ConstantBusLimit;
6751 }
6752
6753 for (int Idx : VOP3Idx) {
6754 if (Idx == -1)
6755 break;
6756 MachineOperand &MO = MI.getOperand(Idx);
6757
6758 if (!MO.isReg()) {
6759 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6760 continue;
6761
6762 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6763 --LiteralLimit;
6764 --ConstantBusLimit;
6765 continue;
6766 }
6767
6768 --LiteralLimit;
6769 --ConstantBusLimit;
6770 legalizeOpWithMove(MI, Idx);
6771 continue;
6772 }
6773
6774 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6775 continue; // VGPRs are legal
6776
6777 // We can use one SGPR in each VOP3 instruction prior to GFX10
6778 // and two starting from GFX10.
6779 if (SGPRsUsed.count(MO.getReg()))
6780 continue;
6781 if (ConstantBusLimit > 0) {
6782 SGPRsUsed.insert(MO.getReg());
6783 --ConstantBusLimit;
6784 continue;
6785 }
6786
6787 // If we make it this far, then the operand is not legal and we must
6788 // legalize it.
6789 legalizeOpWithMove(MI, Idx);
6790 }
6791
6792 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6793 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6794 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6795 legalizeOpWithMove(MI, VOP3Idx[2]);
6796
6797 // Fix the register class of packed FP32 instructions on gfx12+. See
6798 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6799 if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
6800 for (unsigned I = 0; I < 3; ++I) {
6801 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I))
6802 legalizeOpWithMove(MI, VOP3Idx[I]);
6803 }
6804 }
6805}
6806
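// Read a uniform VGPR value into SGPRs: wide values are read one 32-bit
// subregister at a time with V_READFIRSTLANE_B32 and reassembled with
// REG_SEQUENCE; AGPR sources are first copied to VGPRs.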
6807 Register SIInstrInfo::readlaneVGPRToSGPR(
6808 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6809 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6810 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6811 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6812 if (DstRC)
6813 SRC = RI.getCommonSubClass(SRC, DstRC);
6814
6815 Register DstReg = MRI.createVirtualRegister(SRC);
6816 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6817
6818 if (RI.hasAGPRs(VRC)) {
6819 VRC = RI.getEquivalentVGPRClass(VRC);
6820 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6821 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6822 get(TargetOpcode::COPY), NewSrcReg)
6823 .addReg(SrcReg);
6824 SrcReg = NewSrcReg;
6825 }
6826
6827 if (SubRegs == 1) {
6828 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6829 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6830 .addReg(SrcReg);
6831 return DstReg;
6832 }
6833
6834 SmallVector<Register, 8> SRegs;
6835 for (unsigned i = 0; i < SubRegs; ++i) {
6836 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6837 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6838 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6839 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6840 SRegs.push_back(SGPR);
6841 }
6842
6843 MachineInstrBuilder MIB =
6844 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6845 get(AMDGPU::REG_SEQUENCE), DstReg);
6846 for (unsigned i = 0; i < SubRegs; ++i) {
6847 MIB.addReg(SRegs[i]);
6848 MIB.addImm(RI.getSubRegFromChannel(i));
6849 }
6850 return DstReg;
6851}
6852
6854 MachineInstr &MI) const {
6855
6856 // If the pointer is stored in VGPRs, then we need to move it to
6857 // SGPRs using v_readfirstlane. This is safe because we only select
6858 // loads with uniform pointers to SMRD instructions, so we know the
6859 // pointer value is uniform.
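// Schematically (the 64-bit sbase case; register names are illustrative),
// a pointer that ended up in a VGPR pair is rewritten as:
//   %lo = V_READFIRSTLANE_B32 %vbase.sub0
//   %hi = V_READFIRSTLANE_B32 %vbase.sub1
//   %sbase:sreg_64 = REG_SEQUENCE %lo, sub0, %hi, sub1
// which is what readlaneVGPRToSGPR above produces for multi-dword registers.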
6860 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6861 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6862 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6863 SBase->setReg(SGPR);
6864 }
6865 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6866 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6867 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6868 SOff->setReg(SGPR);
6869 }
6870}
6871
6873 unsigned Opc = Inst.getOpcode();
6874 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6875 if (OldSAddrIdx < 0)
6876 return false;
6877
6878 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6879
6880 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6881 if (NewOpc < 0)
6883 if (NewOpc < 0)
6884 return false;
6885
6887 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6888 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6889 return false;
6890
6891 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6892 if (NewVAddrIdx < 0)
6893 return false;
6894
6895 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6896
6897 // Check vaddr; it must be zero or absent.
6898 MachineInstr *VAddrDef = nullptr;
6899 if (OldVAddrIdx >= 0) {
6900 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6901 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6902 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6903 !VAddrDef->getOperand(1).isImm() ||
6904 VAddrDef->getOperand(1).getImm() != 0)
6905 return false;
6906 }
6907
6908 const MCInstrDesc &NewDesc = get(NewOpc);
6909 Inst.setDesc(NewDesc);
6910
6911 // Callers expect iterator to be valid after this call, so modify the
6912 // instruction in place.
6913 if (OldVAddrIdx == NewVAddrIdx) {
6914 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6915 // Clear use list from the old vaddr holding a zero register.
6916 MRI.removeRegOperandFromUseList(&NewVAddr);
6917 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6918 Inst.removeOperand(OldSAddrIdx);
6919 // Update the use list with the pointer we have just moved from vaddr to
6920 // saddr position. Otherwise new vaddr will be missing from the use list.
6921 MRI.removeRegOperandFromUseList(&NewVAddr);
6922 MRI.addRegOperandToUseList(&NewVAddr);
6923 } else {
6924 assert(OldSAddrIdx == NewVAddrIdx);
6925
6926 if (OldVAddrIdx >= 0) {
6927 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6928 AMDGPU::OpName::vdst_in);
6929
6930 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6931 // it asserts. Untie the operands for now and retie them afterwards.
6932 if (NewVDstIn != -1) {
6933 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6934 Inst.untieRegOperand(OldVDstIn);
6935 }
6936
6937 Inst.removeOperand(OldVAddrIdx);
6938
6939 if (NewVDstIn != -1) {
6940 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6941 Inst.tieOperands(NewVDst, NewVDstIn);
6942 }
6943 }
6944 }
6945
6946 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6947 VAddrDef->eraseFromParent();
6948
6949 return true;
6950}
6951
6952// FIXME: Remove this when SelectionDAG is obsoleted.
6954 MachineInstr &MI) const {
6955 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
6956 return;
6957
6958 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6959 // thinks they are uniform, so a readfirstlane should be valid.
6960 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6961 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6962 return;
6963
6965 return;
6966
6967 const TargetRegisterClass *DeclaredRC =
6968 getRegClass(MI.getDesc(), SAddr->getOperandNo());
6969
6970 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6971 SAddr->setReg(ToSGPR);
6972}
6973
6976 const TargetRegisterClass *DstRC,
6979 const DebugLoc &DL) const {
6980 Register OpReg = Op.getReg();
6981 unsigned OpSubReg = Op.getSubReg();
6982
6983 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6984 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6985
6986 // Check if operand is already the correct register class.
6987 if (DstRC == OpRC)
6988 return;
6989
6990 Register DstReg = MRI.createVirtualRegister(DstRC);
6991 auto Copy =
6992 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6993 Op.setReg(DstReg);
6994
6995 MachineInstr *Def = MRI.getVRegDef(OpReg);
6996 if (!Def)
6997 return;
6998
6999 // Try to eliminate the copy if it is copying an immediate value.
7000 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7001 foldImmediate(*Copy, *Def, OpReg, &MRI);
7002
7003 bool ImpDef = Def->isImplicitDef();
7004 while (!ImpDef && Def && Def->isCopy()) {
7005 if (Def->getOperand(1).getReg().isPhysical())
7006 break;
7007 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7008 ImpDef = Def && Def->isImplicitDef();
7009 }
7010 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7011 !ImpDef)
7012 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7013}
7014
7015// Emit the actual waterfall loop, executing the wrapped instruction for each
7016// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7017 // iteration; in the worst case we execute 64 (once per lane, assuming wave64).
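// For a single 32-bit scalar operand the emitted loop looks roughly like
// this (wave64 assumed; the wave32 opcodes differ only in width):
//   LoopBB:
//     %cur      = V_READFIRSTLANE_B32 %vscalarop
//     %cond     = V_CMP_EQ_U32_e64 %cur, %vscalarop
//     %saveexec = S_AND_SAVEEXEC_B64 %cond
//   BodyBB:
//     ... the wrapped instruction, now reading %cur ...
//     $exec = S_XOR_B64_term $exec, %saveexec
//     SI_WATERFALL_LOOP %LoopBB
// Wider operands are read 64 bits at a time, compared with V_CMP_EQ_U64,
// and the per-piece conditions are AND'ed together.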
7018static void
7021 MachineBasicBlock &LoopBB,
7022 MachineBasicBlock &BodyBB,
7023 const DebugLoc &DL,
7024 ArrayRef<MachineOperand *> ScalarOps) {
7025 MachineFunction &MF = *LoopBB.getParent();
7026 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7027 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7029 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7030
7032 Register CondReg;
7033
7034 for (MachineOperand *ScalarOp : ScalarOps) {
7035 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7036 unsigned NumSubRegs = RegSize / 32;
7037 Register VScalarOp = ScalarOp->getReg();
7038
7039 if (NumSubRegs == 1) {
7040 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7041
7042 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7043 .addReg(VScalarOp);
7044
7045 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7046
7047 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7048 .addReg(CurReg)
7049 .addReg(VScalarOp);
7050
7051 // Combine the comparison results with AND.
7052 if (!CondReg) // First.
7053 CondReg = NewCondReg;
7054 else { // If not the first, we create an AND.
7055 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7056 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7057 .addReg(CondReg)
7058 .addReg(NewCondReg);
7059 CondReg = AndReg;
7060 }
7061
7062 // Update ScalarOp operand to use the SGPR ScalarOp.
7063 ScalarOp->setReg(CurReg);
7064 ScalarOp->setIsKill();
7065 } else {
7066 SmallVector<Register, 8> ReadlanePieces;
7067 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7068 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7069 "Unhandled register size");
7070
7071 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7072 Register CurRegLo =
7073 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7074 Register CurRegHi =
7075 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7076
7077 // Read the next variant <- also loop target.
7078 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7079 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7080
7081 // Read the next variant <- also loop target.
7082 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7083 .addReg(VScalarOp, VScalarOpUndef,
7084 TRI->getSubRegFromChannel(Idx + 1));
7085
7086 ReadlanePieces.push_back(CurRegLo);
7087 ReadlanePieces.push_back(CurRegHi);
7088
7089 // Comparison is to be done as 64-bit.
7090 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7091 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7092 .addReg(CurRegLo)
7093 .addImm(AMDGPU::sub0)
7094 .addReg(CurRegHi)
7095 .addImm(AMDGPU::sub1);
7096
7097 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7098 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7099 NewCondReg)
7100 .addReg(CurReg);
7101 if (NumSubRegs <= 2)
7102 Cmp.addReg(VScalarOp);
7103 else
7104 Cmp.addReg(VScalarOp, VScalarOpUndef,
7105 TRI->getSubRegFromChannel(Idx, 2));
7106
7107 // Combine the comparison results with AND.
7108 if (!CondReg) // First.
7109 CondReg = NewCondReg;
7110 else { // If not the first, we create an AND.
7111 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7112 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7113 .addReg(CondReg)
7114 .addReg(NewCondReg);
7115 CondReg = AndReg;
7116 }
7117 } // End for loop.
7118
7119 const auto *SScalarOpRC =
7120 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7121 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7122
7123 // Build scalar ScalarOp.
7124 auto Merge =
7125 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7126 unsigned Channel = 0;
7127 for (Register Piece : ReadlanePieces) {
7128 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7129 }
7130
7131 // Update ScalarOp operand to use the SGPR ScalarOp.
7132 ScalarOp->setReg(SScalarOp);
7133 ScalarOp->setIsKill();
7134 }
7135 }
7136
7137 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7138 MRI.setSimpleHint(SaveExec, CondReg);
7139
7140 // Update EXEC to matching lanes, saving original to SaveExec.
7141 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7142 .addReg(CondReg, RegState::Kill);
7143
7144 // The original instruction is here; we insert the terminators after it.
7145 I = BodyBB.end();
7146
7147 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7148 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7149 .addReg(LMC.ExecReg)
7150 .addReg(SaveExec);
7151
7152 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7153}
7154
7155// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7156// with SGPRs by iterating over all unique values across all lanes.
7157// Returns the loop basic block that now contains \p MI.
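// The resulting control flow is roughly:
//   MBB         - save EXEC (and SCC if it is still live), fall through
//   LoopBB      - readfirstlane + compare of the scalar operand(s)
//   BodyBB      - the original instruction(s), EXEC update, loop back to LoopBB
//   RemainderBB - restore SCC and EXEC, then the rest of the original block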
7158static MachineBasicBlock *
7162 MachineBasicBlock::iterator Begin = nullptr,
7163 MachineBasicBlock::iterator End = nullptr) {
7164 MachineBasicBlock &MBB = *MI.getParent();
7165 MachineFunction &MF = *MBB.getParent();
7166 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7167 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7169 if (!Begin.isValid())
7170 Begin = &MI;
7171 if (!End.isValid()) {
7172 End = &MI;
7173 ++End;
7174 }
7175 const DebugLoc &DL = MI.getDebugLoc();
7177 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7178
7179 // Save SCC. The waterfall loop may overwrite SCC.
7180 Register SaveSCCReg;
7181
7182 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7183 // rather than doing an unlimited scan everywhere.
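// SCC is materialized into an SGPR before the loop and rematerialized after
// it, roughly:
//   %savescc = S_CSELECT_B32 1, 0   ; 1 if SCC was set, 0 otherwise
//   ... waterfall loop ...
//   S_CMP_LG_U32 %savescc, 0        ; restores SCC in RemainderBB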
7184 bool SCCNotDead =
7185 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7186 std::numeric_limits<unsigned>::max()) !=
7188 if (SCCNotDead) {
7189 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7190 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7191 .addImm(1)
7192 .addImm(0);
7193 }
7194
7195 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7196
7197 // Save the EXEC mask
7198 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7199
7200 // Killed uses in the instruction we are waterfalling around will be
7201 // incorrect due to the added control-flow.
7202 MachineBasicBlock::iterator AfterMI = MI;
7203 ++AfterMI;
7204 for (auto I = Begin; I != AfterMI; I++) {
7205 for (auto &MO : I->all_uses())
7206 MRI.clearKillFlags(MO.getReg());
7207 }
7208
7209 // To insert the loop we need to split the block. Move everything after this
7210 // point to a new block, and insert a new empty block between the two.
7213 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7215 ++MBBI;
7216
7217 MF.insert(MBBI, LoopBB);
7218 MF.insert(MBBI, BodyBB);
7219 MF.insert(MBBI, RemainderBB);
7220
7221 LoopBB->addSuccessor(BodyBB);
7222 BodyBB->addSuccessor(LoopBB);
7223 BodyBB->addSuccessor(RemainderBB);
7224
7225 // Move the instructions from Begin to MI into BodyBB, and the remainder of
7226 // the block into RemainderBB.
7227 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7228 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7229 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7230
7231 MBB.addSuccessor(LoopBB);
7232
7233 // Update dominators. We know that MBB immediately dominates LoopBB, that
7234 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7235 // RemainderBB. RemainderBB immediately dominates all of the successors
7236 // transferred to it from MBB that MBB used to properly dominate.
7237 if (MDT) {
7238 MDT->addNewBlock(LoopBB, &MBB);
7239 MDT->addNewBlock(BodyBB, LoopBB);
7240 MDT->addNewBlock(RemainderBB, BodyBB);
7241 for (auto &Succ : RemainderBB->successors()) {
7242 if (MDT->properlyDominates(&MBB, Succ)) {
7243 MDT->changeImmediateDominator(Succ, RemainderBB);
7244 }
7245 }
7246 }
7247
7248 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
7249
7250 MachineBasicBlock::iterator First = RemainderBB->begin();
7251 // Restore SCC
7252 if (SCCNotDead) {
7253 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7254 .addReg(SaveSCCReg, RegState::Kill)
7255 .addImm(0);
7256 }
7257
7258 // Restore the EXEC mask
7259 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7260 .addReg(SaveExec);
7261 return BodyBB;
7262}
7263
7264// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
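// The replacement descriptor keeps the default data format but a null base,
// i.e. roughly { 0:i64 (sub0_sub1), RSRC_DATA_FORMAT[31:0] (sub2),
// RSRC_DATA_FORMAT[63:32] (sub3) }, while the extracted 64-bit pointer is
// returned separately so the caller can fold it into the address.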
7265static std::tuple<unsigned, unsigned>
7267 MachineBasicBlock &MBB = *MI.getParent();
7268 MachineFunction &MF = *MBB.getParent();
7270
7271 // Extract the ptr from the resource descriptor.
7272 unsigned RsrcPtr =
7273 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7274 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7275
7276 // Create an empty resource descriptor
7277 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7278 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7279 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7280 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7281 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7282
7283 // Zero64 = 0
7284 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7285 .addImm(0);
7286
7287 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7288 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7289 .addImm(Lo_32(RsrcDataFormat));
7290
7291 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7292 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7293 .addImm(Hi_32(RsrcDataFormat));
7294
7295 // NewSRsrc = {Zero64, SRsrcFormat}
7296 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7297 .addReg(Zero64)
7298 .addImm(AMDGPU::sub0_sub1)
7299 .addReg(SRsrcFormatLo)
7300 .addImm(AMDGPU::sub2)
7301 .addReg(SRsrcFormatHi)
7302 .addImm(AMDGPU::sub3);
7303
7304 return std::tuple(RsrcPtr, NewSRsrc);
7305}
7306
7309 MachineDominatorTree *MDT) const {
7310 MachineFunction &MF = *MI.getMF();
7312 MachineBasicBlock *CreatedBB = nullptr;
7313
7314 // Legalize VOP2
7315 if (isVOP2(MI) || isVOPC(MI)) {
7317 return CreatedBB;
7318 }
7319
7320 // Legalize VOP3
7321 if (isVOP3(MI)) {
7323 return CreatedBB;
7324 }
7325
7326 // Legalize SMRD
7327 if (isSMRD(MI)) {
7329 return CreatedBB;
7330 }
7331
7332 // Legalize FLAT
7333 if (isFLAT(MI)) {
7335 return CreatedBB;
7336 }
7337
7338 // Legalize PHI
7339 // The register class of the operands must be the same type as the register
7340 // class of the output.
7341 if (MI.getOpcode() == AMDGPU::PHI) {
7342 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7343 assert(!RI.isSGPRClass(VRC));
7344
7345 // Update all the operands so they have the same type.
7346 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7347 MachineOperand &Op = MI.getOperand(I);
7348 if (!Op.isReg() || !Op.getReg().isVirtual())
7349 continue;
7350
7351 // MI is a PHI instruction.
7352 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7354
7355 // Avoid creating no-op copies with the same src and dst reg class. These
7356 // confuse some of the machine passes.
7357 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7358 }
7359 }
7360
7361 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7362 // VGPR dest type and SGPR sources, insert copies so all operands are
7363 // VGPRs. This seems to help operand folding / the register coalescer.
7364 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7365 MachineBasicBlock *MBB = MI.getParent();
7366 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7367 if (RI.hasVGPRs(DstRC)) {
7368 // Update all the operands so they are VGPR register classes. These may
7369 // not be the same register class because REG_SEQUENCE supports mixing
7370 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7371 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7372 MachineOperand &Op = MI.getOperand(I);
7373 if (!Op.isReg() || !Op.getReg().isVirtual())
7374 continue;
7375
7376 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7377 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7378 if (VRC == OpRC)
7379 continue;
7380
7381 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7382 Op.setIsKill();
7383 }
7384 }
7385
7386 return CreatedBB;
7387 }
7388
7389 // Legalize INSERT_SUBREG
7390 // src0 must have the same register class as dst
7391 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7392 Register Dst = MI.getOperand(0).getReg();
7393 Register Src0 = MI.getOperand(1).getReg();
7394 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7395 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7396 if (DstRC != Src0RC) {
7397 MachineBasicBlock *MBB = MI.getParent();
7398 MachineOperand &Op = MI.getOperand(1);
7399 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7400 }
7401 return CreatedBB;
7402 }
7403
7404 // Legalize SI_INIT_M0
7405 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7406 MachineOperand &Src = MI.getOperand(0);
7407 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7408 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7409 return CreatedBB;
7410 }
7411
7412 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7413 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7414 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7415 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7416 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7417 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7418 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7419 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7420 MachineOperand &Src = MI.getOperand(1);
7421 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7422 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7423 return CreatedBB;
7424 }
7425
7426 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7427 //
7428 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7429 // scratch memory access. In both cases, the legalization never involves
7430 // conversion to the addr64 form.
7432 (isMUBUF(MI) || isMTBUF(MI)))) {
7433 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7434 ? AMDGPU::OpName::rsrc
7435 : AMDGPU::OpName::srsrc;
7436 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7437 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7438 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
7439
7440 AMDGPU::OpName SampOpName =
7441 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7442 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7443 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7444 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
7445
7446 return CreatedBB;
7447 }
7448
7449 // Legalize SI_CALL
7450 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7451 MachineOperand *Dest = &MI.getOperand(0);
7452 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7453 // We need to move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN,
7454 // plus the following copies, into the loop block; this includes copies
7455 // from and to physical registers.
7456 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
7457 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
7458
7459 // Also move the copies to physical registers into the loop block
7460 MachineBasicBlock &MBB = *MI.getParent();
7462 while (Start->getOpcode() != FrameSetupOpcode)
7463 --Start;
7465 while (End->getOpcode() != FrameDestroyOpcode)
7466 ++End;
7467 // Also include following copies of the return value
7468 ++End;
7469 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
7470 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
7471 ++End;
7472 CreatedBB =
7473 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
7474 }
7475 }
7476
7477 // Legalize s_sleep_var.
7478 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7479 const DebugLoc &DL = MI.getDebugLoc();
7480 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7481 int Src0Idx =
7482 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7483 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7484 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7485 .add(Src0);
7486 Src0.ChangeToRegister(Reg, false);
7487 return nullptr;
7488 }
7489
7490 // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
7491 // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
7492 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
7493 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
7494 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
7495 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
7496 for (MachineOperand &Src : MI.explicit_operands()) {
7497 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7498 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7499 }
7500 return CreatedBB;
7501 }
7502
7503 // Legalize MUBUF instructions.
7504 bool isSoffsetLegal = true;
7505 int SoffsetIdx =
7506 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7507 if (SoffsetIdx != -1) {
7508 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7509 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7510 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7511 isSoffsetLegal = false;
7512 }
7513 }
7514
7515 bool isRsrcLegal = true;
7516 int RsrcIdx =
7517 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7518 if (RsrcIdx != -1) {
7519 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7520 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7521 isRsrcLegal = false;
7522 }
7523
7524 // The operands are legal.
7525 if (isRsrcLegal && isSoffsetLegal)
7526 return CreatedBB;
7527
7528 if (!isRsrcLegal) {
7529 // Legalize a VGPR Rsrc
7530 //
7531 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7532 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7533 // a zero-value SRsrc.
7534 //
7535 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7536 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7537 // above.
7538 //
7539 // Otherwise we are on non-ADDR64 hardware, and/or we have
7540 // idxen/offen/bothen and we fall back to a waterfall loop.
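// For the ADDR64 path this amounts to (schematic; register names are
// illustrative):
//   {vaddr.lo, vaddr.hi} += {rsrc.sub0, rsrc.sub1}   ; V_ADD_CO / V_ADDC pair
//   srsrc = { 0, RSRC_DATA_FORMAT }                  ; zero base pointer
// so the VGPR descriptor never has to reach a scalar operand.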
7541
7542 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7543 MachineBasicBlock &MBB = *MI.getParent();
7544
7545 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7546 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7547 // This is already an ADDR64 instruction so we need to add the pointer
7548 // extracted from the resource descriptor to the current value of VAddr.
7549 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7550 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7551 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7552
7553 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7554 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7555 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7556
7557 unsigned RsrcPtr, NewSRsrc;
7558 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7559
7560 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7561 const DebugLoc &DL = MI.getDebugLoc();
7562 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7563 .addDef(CondReg0)
7564 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7565 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7566 .addImm(0);
7567
7568 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7569 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7570 .addDef(CondReg1, RegState::Dead)
7571 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7572 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7573 .addReg(CondReg0, RegState::Kill)
7574 .addImm(0);
7575
7576 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7577 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7578 .addReg(NewVAddrLo)
7579 .addImm(AMDGPU::sub0)
7580 .addReg(NewVAddrHi)
7581 .addImm(AMDGPU::sub1);
7582
7583 VAddr->setReg(NewVAddr);
7584 Rsrc->setReg(NewSRsrc);
7585 } else if (!VAddr && ST.hasAddr64()) {
7586 // This instruction is the _OFFSET variant, so we need to convert it to
7587 // ADDR64.
7588 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7589 "FIXME: Need to emit flat atomics here");
7590
7591 unsigned RsrcPtr, NewSRsrc;
7592 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7593
7594 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7595 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7596 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7597 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7598 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7599
7600 // Atomics with return have an additional tied operand and are
7601 // missing some of the special bits.
7602 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7603 MachineInstr *Addr64;
7604
7605 if (!VDataIn) {
7606 // Regular buffer load / store.
7608 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7609 .add(*VData)
7610 .addReg(NewVAddr)
7611 .addReg(NewSRsrc)
7612 .add(*SOffset)
7613 .add(*Offset);
7614
7615 if (const MachineOperand *CPol =
7616 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7617 MIB.addImm(CPol->getImm());
7618 }
7619
7620 if (const MachineOperand *TFE =
7621 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7622 MIB.addImm(TFE->getImm());
7623 }
7624
7625 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7626
7627 MIB.cloneMemRefs(MI);
7628 Addr64 = MIB;
7629 } else {
7630 // Atomics with return.
7631 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7632 .add(*VData)
7633 .add(*VDataIn)
7634 .addReg(NewVAddr)
7635 .addReg(NewSRsrc)
7636 .add(*SOffset)
7637 .add(*Offset)
7638 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7639 .cloneMemRefs(MI);
7640 }
7641
7642 MI.removeFromParent();
7643
7644 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7645 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7646 NewVAddr)
7647 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7648 .addImm(AMDGPU::sub0)
7649 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7650 .addImm(AMDGPU::sub1);
7651 } else {
7652 // Legalize a VGPR Rsrc and soffset together.
7653 if (!isSoffsetLegal) {
7654 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7655 CreatedBB =
7656 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7657 return CreatedBB;
7658 }
7659 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7660 return CreatedBB;
7661 }
7662 }
7663
7664 // Legalize a VGPR soffset.
7665 if (!isSoffsetLegal) {
7666 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7667 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7668 return CreatedBB;
7669 }
7670 return CreatedBB;
7671}
7672
7674 InstrList.insert(MI);
7675 // Add MBUF instructions to the deferred list.
7676 int RsrcIdx =
7677 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7678 if (RsrcIdx != -1) {
7679 DeferredList.insert(MI);
7680 }
7681}
7682
7684 return DeferredList.contains(MI);
7685}
7686
7687 // Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7688 // lowering (change sgpr to vgpr).
7689 // This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7690 // different sizes. We need to legalize the operand sizes during the vgpr
7691 // lowering chain. This can be removed once sgpr16 is in place.
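// For example (illustrative only): a 16-bit VGPR feeding an operand that
// expects a 32-bit register is widened by pairing it with an undefined high
// half, roughly
//   %undef = IMPLICIT_DEF
//   %wide  = REG_SEQUENCE %v16, lo16, %undef, hi16
// while a 32-bit VGPR feeding a 16-bit operand is narrowed by selecting its
// lo16 subregister.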
7693 MachineRegisterInfo &MRI) const {
7694 if (!ST.useRealTrue16Insts())
7695 return;
7696
7697 unsigned Opcode = MI.getOpcode();
7698 MachineBasicBlock *MBB = MI.getParent();
7699 // Legalize operands and check for size mismatch
7700 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7701 OpIdx >= get(Opcode).getNumOperands() ||
7702 get(Opcode).operands()[OpIdx].RegClass == -1)
7703 return;
7704
7705 MachineOperand &Op = MI.getOperand(OpIdx);
7706 if (!Op.isReg() || !Op.getReg().isVirtual())
7707 return;
7708
7709 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7710 if (!RI.isVGPRClass(CurrRC))
7711 return;
7712
7713 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7714 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7715 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7716 Op.setSubReg(AMDGPU::lo16);
7717 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7718 const DebugLoc &DL = MI.getDebugLoc();
7719 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7720 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7721 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7722 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7723 .addReg(Op.getReg())
7724 .addImm(AMDGPU::lo16)
7725 .addReg(Undef)
7726 .addImm(AMDGPU::hi16);
7727 Op.setReg(NewDstReg);
7728 }
7729}
7731 MachineRegisterInfo &MRI) const {
7732 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7734}
7735
7737 MachineDominatorTree *MDT) const {
7738
7739 while (!Worklist.empty()) {
7740 MachineInstr &Inst = *Worklist.top();
7741 Worklist.erase_top();
7742 // Skip MachineInstr in the deferred list.
7743 if (Worklist.isDeferred(&Inst))
7744 continue;
7745 moveToVALUImpl(Worklist, MDT, Inst);
7746 }
7747
7748 // The deferred list of instructions will be processed once
7749 // all the MachineInstrs in the worklist are done.
7750 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7751 moveToVALUImpl(Worklist, MDT, *Inst);
7752 assert(Worklist.empty() &&
7753 "Deferred MachineInstr are not supposed to re-populate worklist");
7754 }
7755}
7756
7759 MachineInstr &Inst) const {
7760
7762 if (!MBB)
7763 return;
7764 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7765 unsigned Opcode = Inst.getOpcode();
7766 unsigned NewOpcode = getVALUOp(Inst);
7767 const DebugLoc &DL = Inst.getDebugLoc();
7768
7769 // Handle some special cases
7770 switch (Opcode) {
7771 default:
7772 break;
7773 case AMDGPU::S_ADD_I32:
7774 case AMDGPU::S_SUB_I32: {
7775 // FIXME: The u32 versions currently selected use the carry.
7776 bool Changed;
7777 MachineBasicBlock *CreatedBBTmp = nullptr;
7778 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7779 if (Changed)
7780 return;
7781
7782 // Default handling
7783 break;
7784 }
7785
7786 case AMDGPU::S_MUL_U64:
7787 if (ST.hasVectorMulU64()) {
7788 NewOpcode = AMDGPU::V_MUL_U64_e64;
7789 break;
7790 }
7791 // Split s_mul_u64 into 32-bit vector multiplications.
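// Roughly, for a = {a_hi, a_lo} and b = {b_hi, b_lo}:
//   lo32(a * b) = lo32(a_lo * b_lo)
//   hi32(a * b) = hi32(a_lo * b_lo) + lo32(a_lo * b_hi) + lo32(a_hi * b_lo)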
7792 splitScalarSMulU64(Worklist, Inst, MDT);
7793 Inst.eraseFromParent();
7794 return;
7795
7796 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7797 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7798 // This is a special case of s_mul_u64 where all the operands are either
7799 // zero extended or sign extended.
7800 splitScalarSMulPseudo(Worklist, Inst, MDT);
7801 Inst.eraseFromParent();
7802 return;
7803
7804 case AMDGPU::S_AND_B64:
7805 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7806 Inst.eraseFromParent();
7807 return;
7808
7809 case AMDGPU::S_OR_B64:
7810 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7811 Inst.eraseFromParent();
7812 return;
7813
7814 case AMDGPU::S_XOR_B64:
7815 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7816 Inst.eraseFromParent();
7817 return;
7818
7819 case AMDGPU::S_NAND_B64:
7820 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7821 Inst.eraseFromParent();
7822 return;
7823
7824 case AMDGPU::S_NOR_B64:
7825 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7826 Inst.eraseFromParent();
7827 return;
7828
7829 case AMDGPU::S_XNOR_B64:
7830 if (ST.hasDLInsts())
7831 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7832 else
7833 splitScalar64BitXnor(Worklist, Inst, MDT);
7834 Inst.eraseFromParent();
7835 return;
7836
7837 case AMDGPU::S_ANDN2_B64:
7838 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7839 Inst.eraseFromParent();
7840 return;
7841
7842 case AMDGPU::S_ORN2_B64:
7843 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7844 Inst.eraseFromParent();
7845 return;
7846
7847 case AMDGPU::S_BREV_B64:
7848 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7849 Inst.eraseFromParent();
7850 return;
7851
7852 case AMDGPU::S_NOT_B64:
7853 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7854 Inst.eraseFromParent();
7855 return;
7856
7857 case AMDGPU::S_BCNT1_I32_B64:
7858 splitScalar64BitBCNT(Worklist, Inst);
7859 Inst.eraseFromParent();
7860 return;
7861
7862 case AMDGPU::S_BFE_I64:
7863 splitScalar64BitBFE(Worklist, Inst);
7864 Inst.eraseFromParent();
7865 return;
7866
7867 case AMDGPU::S_FLBIT_I32_B64:
7868 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7869 Inst.eraseFromParent();
7870 return;
7871 case AMDGPU::S_FF1_I32_B64:
7872 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7873 Inst.eraseFromParent();
7874 return;
7875
7876 case AMDGPU::S_LSHL_B32:
7877 if (ST.hasOnlyRevVALUShifts()) {
7878 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7879 swapOperands(Inst);
7880 }
7881 break;
7882 case AMDGPU::S_ASHR_I32:
7883 if (ST.hasOnlyRevVALUShifts()) {
7884 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7885 swapOperands(Inst);
7886 }
7887 break;
7888 case AMDGPU::S_LSHR_B32:
7889 if (ST.hasOnlyRevVALUShifts()) {
7890 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7891 swapOperands(Inst);
7892 }
7893 break;
7894 case AMDGPU::S_LSHL_B64:
7895 if (ST.hasOnlyRevVALUShifts()) {
7896 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7897 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7898 : AMDGPU::V_LSHLREV_B64_e64;
7899 swapOperands(Inst);
7900 }
7901 break;
7902 case AMDGPU::S_ASHR_I64:
7903 if (ST.hasOnlyRevVALUShifts()) {
7904 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7905 swapOperands(Inst);
7906 }
7907 break;
7908 case AMDGPU::S_LSHR_B64:
7909 if (ST.hasOnlyRevVALUShifts()) {
7910 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7911 swapOperands(Inst);
7912 }
7913 break;
7914
7915 case AMDGPU::S_ABS_I32:
7916 lowerScalarAbs(Worklist, Inst);
7917 Inst.eraseFromParent();
7918 return;
7919
7920 case AMDGPU::S_ABSDIFF_I32:
7921 lowerScalarAbsDiff(Worklist, Inst);
7922 Inst.eraseFromParent();
7923 return;
7924
7925 case AMDGPU::S_CBRANCH_SCC0:
7926 case AMDGPU::S_CBRANCH_SCC1: {
7927 // Clear unused bits of vcc
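// The condition register may hold stale bits for currently inactive lanes,
// so it is masked with EXEC before being used as VCC, roughly
// vcc = exec & cond.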
7928 Register CondReg = Inst.getOperand(1).getReg();
7929 bool IsSCC = CondReg == AMDGPU::SCC;
7931 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
7932 .addReg(LMC.ExecReg)
7933 .addReg(IsSCC ? LMC.VccReg : CondReg);
7934 Inst.removeOperand(1);
7935 } break;
7936
7937 case AMDGPU::S_BFE_U64:
7938 case AMDGPU::S_BFM_B64:
7939 llvm_unreachable("Moving this op to VALU not implemented");
7940
7941 case AMDGPU::S_PACK_LL_B32_B16:
7942 case AMDGPU::S_PACK_LH_B32_B16:
7943 case AMDGPU::S_PACK_HL_B32_B16:
7944 case AMDGPU::S_PACK_HH_B32_B16:
7945 movePackToVALU(Worklist, MRI, Inst);
7946 Inst.eraseFromParent();
7947 return;
7948
7949 case AMDGPU::S_XNOR_B32:
7950 lowerScalarXnor(Worklist, Inst);
7951 Inst.eraseFromParent();
7952 return;
7953
7954 case AMDGPU::S_NAND_B32:
7955 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7956 Inst.eraseFromParent();
7957 return;
7958
7959 case AMDGPU::S_NOR_B32:
7960 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7961 Inst.eraseFromParent();
7962 return;
7963
7964 case AMDGPU::S_ANDN2_B32:
7965 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7966 Inst.eraseFromParent();
7967 return;
7968
7969 case AMDGPU::S_ORN2_B32:
7970 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7971 Inst.eraseFromParent();
7972 return;
7973
7974 // TODO: remove as soon as everything is ready
7975 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7976 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7977 // can only be selected from the uniform SDNode.
7978 case AMDGPU::S_ADD_CO_PSEUDO:
7979 case AMDGPU::S_SUB_CO_PSEUDO: {
7980 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7981 ? AMDGPU::V_ADDC_U32_e64
7982 : AMDGPU::V_SUBB_U32_e64;
7983 const auto *CarryRC = RI.getWaveMaskRegClass();
7984
7985 Register CarryInReg = Inst.getOperand(4).getReg();
7986 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7987 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7988 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7989 .addReg(CarryInReg);
7990 }
7991
7992 Register CarryOutReg = Inst.getOperand(1).getReg();
7993
7994 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7995 MRI.getRegClass(Inst.getOperand(0).getReg())));
7996 MachineInstr *CarryOp =
7997 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7998 .addReg(CarryOutReg, RegState::Define)
7999 .add(Inst.getOperand(2))
8000 .add(Inst.getOperand(3))
8001 .addReg(CarryInReg)
8002 .addImm(0);
8003 legalizeOperands(*CarryOp);
8004 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8005 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8006 Inst.eraseFromParent();
8007 }
8008 return;
8009 case AMDGPU::S_UADDO_PSEUDO:
8010 case AMDGPU::S_USUBO_PSEUDO: {
8011 MachineOperand &Dest0 = Inst.getOperand(0);
8012 MachineOperand &Dest1 = Inst.getOperand(1);
8013 MachineOperand &Src0 = Inst.getOperand(2);
8014 MachineOperand &Src1 = Inst.getOperand(3);
8015
8016 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8017 ? AMDGPU::V_ADD_CO_U32_e64
8018 : AMDGPU::V_SUB_CO_U32_e64;
8019 const TargetRegisterClass *NewRC =
8020 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8021 Register DestReg = MRI.createVirtualRegister(NewRC);
8022 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8023 .addReg(Dest1.getReg(), RegState::Define)
8024 .add(Src0)
8025 .add(Src1)
8026 .addImm(0); // clamp bit
8027
8028 legalizeOperands(*NewInstr, MDT);
8029 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8030 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8031 Inst.eraseFromParent();
8032 }
8033 return;
8034 case AMDGPU::S_LSHL1_ADD_U32:
8035 case AMDGPU::S_LSHL2_ADD_U32:
8036 case AMDGPU::S_LSHL3_ADD_U32:
8037 case AMDGPU::S_LSHL4_ADD_U32: {
8038 MachineOperand &Dest = Inst.getOperand(0);
8039 MachineOperand &Src0 = Inst.getOperand(1);
8040 MachineOperand &Src1 = Inst.getOperand(2);
8041 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8042 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8043 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8044 : 4);
8045
8046 const TargetRegisterClass *NewRC =
8047 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8048 Register DestReg = MRI.createVirtualRegister(NewRC);
8049 MachineInstr *NewInstr =
8050 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8051 .add(Src0)
8052 .addImm(ShiftAmt)
8053 .add(Src1);
8054
8055 legalizeOperands(*NewInstr, MDT);
8056 MRI.replaceRegWith(Dest.getReg(), DestReg);
8057 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8058 Inst.eraseFromParent();
8059 }
8060 return;
8061 case AMDGPU::S_CSELECT_B32:
8062 case AMDGPU::S_CSELECT_B64:
8063 lowerSelect(Worklist, Inst, MDT);
8064 Inst.eraseFromParent();
8065 return;
8066 case AMDGPU::S_CMP_EQ_I32:
8067 case AMDGPU::S_CMP_LG_I32:
8068 case AMDGPU::S_CMP_GT_I32:
8069 case AMDGPU::S_CMP_GE_I32:
8070 case AMDGPU::S_CMP_LT_I32:
8071 case AMDGPU::S_CMP_LE_I32:
8072 case AMDGPU::S_CMP_EQ_U32:
8073 case AMDGPU::S_CMP_LG_U32:
8074 case AMDGPU::S_CMP_GT_U32:
8075 case AMDGPU::S_CMP_GE_U32:
8076 case AMDGPU::S_CMP_LT_U32:
8077 case AMDGPU::S_CMP_LE_U32:
8078 case AMDGPU::S_CMP_EQ_U64:
8079 case AMDGPU::S_CMP_LG_U64:
8080 case AMDGPU::S_CMP_LT_F32:
8081 case AMDGPU::S_CMP_EQ_F32:
8082 case AMDGPU::S_CMP_LE_F32:
8083 case AMDGPU::S_CMP_GT_F32:
8084 case AMDGPU::S_CMP_LG_F32:
8085 case AMDGPU::S_CMP_GE_F32:
8086 case AMDGPU::S_CMP_O_F32:
8087 case AMDGPU::S_CMP_U_F32:
8088 case AMDGPU::S_CMP_NGE_F32:
8089 case AMDGPU::S_CMP_NLG_F32:
8090 case AMDGPU::S_CMP_NGT_F32:
8091 case AMDGPU::S_CMP_NLE_F32:
8092 case AMDGPU::S_CMP_NEQ_F32:
8093 case AMDGPU::S_CMP_NLT_F32: {
8094 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8095 auto NewInstr =
8096 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8097 .setMIFlags(Inst.getFlags());
8098 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8099 0) {
8100 NewInstr
8101 .addImm(0) // src0_modifiers
8102 .add(Inst.getOperand(0)) // src0
8103 .addImm(0) // src1_modifiers
8104 .add(Inst.getOperand(1)) // src1
8105 .addImm(0); // clamp
8106 } else {
8107 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8108 }
8109 legalizeOperands(*NewInstr, MDT);
8110 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8111 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8112 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8113 Inst.eraseFromParent();
8114 return;
8115 }
8116 case AMDGPU::S_CMP_LT_F16:
8117 case AMDGPU::S_CMP_EQ_F16:
8118 case AMDGPU::S_CMP_LE_F16:
8119 case AMDGPU::S_CMP_GT_F16:
8120 case AMDGPU::S_CMP_LG_F16:
8121 case AMDGPU::S_CMP_GE_F16:
8122 case AMDGPU::S_CMP_O_F16:
8123 case AMDGPU::S_CMP_U_F16:
8124 case AMDGPU::S_CMP_NGE_F16:
8125 case AMDGPU::S_CMP_NLG_F16:
8126 case AMDGPU::S_CMP_NGT_F16:
8127 case AMDGPU::S_CMP_NLE_F16:
8128 case AMDGPU::S_CMP_NEQ_F16:
8129 case AMDGPU::S_CMP_NLT_F16: {
8130 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8131 auto NewInstr =
8132 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8133 .setMIFlags(Inst.getFlags());
8134 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8135 NewInstr
8136 .addImm(0) // src0_modifiers
8137 .add(Inst.getOperand(0)) // src0
8138 .addImm(0) // src1_modifiers
8139 .add(Inst.getOperand(1)) // src1
8140 .addImm(0); // clamp
8141 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8142 NewInstr.addImm(0); // op_sel0
8143 } else {
8144 NewInstr
8145 .add(Inst.getOperand(0))
8146 .add(Inst.getOperand(1));
8147 }
8148 legalizeOperandsVALUt16(*NewInstr, MRI);
8149 legalizeOperands(*NewInstr, MDT);
8150 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8151 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8152 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8153 Inst.eraseFromParent();
8154 return;
8155 }
8156 case AMDGPU::S_CVT_HI_F32_F16: {
8157 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8158 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8159 if (ST.useRealTrue16Insts()) {
8160 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8161 .add(Inst.getOperand(1));
8162 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8163 .addImm(0) // src0_modifiers
8164 .addReg(TmpReg, {}, AMDGPU::hi16)
8165 .addImm(0) // clamp
8166 .addImm(0) // omod
8167 .addImm(0); // op_sel0
8168 } else {
8169 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8170 .addImm(16)
8171 .add(Inst.getOperand(1));
8172 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8173 .addImm(0) // src0_modifiers
8174 .addReg(TmpReg)
8175 .addImm(0) // clamp
8176 .addImm(0); // omod
8177 }
8178
8179 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8180 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8181 Inst.eraseFromParent();
8182 return;
8183 }
8184 case AMDGPU::S_MINIMUM_F32:
8185 case AMDGPU::S_MAXIMUM_F32: {
8186 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8187 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8188 .addImm(0) // src0_modifiers
8189 .add(Inst.getOperand(1))
8190 .addImm(0) // src1_modifiers
8191 .add(Inst.getOperand(2))
8192 .addImm(0) // clamp
8193 .addImm(0); // omod
8194 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8195
8196 legalizeOperands(*NewInstr, MDT);
8197 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8198 Inst.eraseFromParent();
8199 return;
8200 }
8201 case AMDGPU::S_MINIMUM_F16:
8202 case AMDGPU::S_MAXIMUM_F16: {
8203 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8204 ? &AMDGPU::VGPR_16RegClass
8205 : &AMDGPU::VGPR_32RegClass);
8206 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8207 .addImm(0) // src0_modifiers
8208 .add(Inst.getOperand(1))
8209 .addImm(0) // src1_modifiers
8210 .add(Inst.getOperand(2))
8211 .addImm(0) // clamp
8212 .addImm(0) // omod
8213 .addImm(0); // opsel0
8214 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8215 legalizeOperandsVALUt16(*NewInstr, MRI);
8216 legalizeOperands(*NewInstr, MDT);
8217 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8218 Inst.eraseFromParent();
8219 return;
8220 }
8221 case AMDGPU::V_S_EXP_F16_e64:
8222 case AMDGPU::V_S_LOG_F16_e64:
8223 case AMDGPU::V_S_RCP_F16_e64:
8224 case AMDGPU::V_S_RSQ_F16_e64:
8225 case AMDGPU::V_S_SQRT_F16_e64: {
8226 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8227 ? &AMDGPU::VGPR_16RegClass
8228 : &AMDGPU::VGPR_32RegClass);
8229 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8230 .add(Inst.getOperand(1)) // src0_modifiers
8231 .add(Inst.getOperand(2))
8232 .add(Inst.getOperand(3)) // clamp
8233 .add(Inst.getOperand(4)) // omod
8234 .setMIFlags(Inst.getFlags());
8235 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8236 NewInstr.addImm(0); // opsel0
8237 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8238 legalizeOperandsVALUt16(*NewInstr, MRI);
8239 legalizeOperands(*NewInstr, MDT);
8240 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8241 Inst.eraseFromParent();
8242 return;
8243 }
8244 }
8245
8246 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8247 // We cannot move this instruction to the VALU, so we should try to
8248 // legalize its operands instead.
8249 legalizeOperands(Inst, MDT);
8250 return;
8251 }
8252 // Handle converting generic instructions like COPY-to-SGPR into
8253 // COPY-to-VGPR.
8254 if (NewOpcode == Opcode) {
8255 Register DstReg = Inst.getOperand(0).getReg();
8256 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8257
8258 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
8259 // hope for the best.
8260 if (Inst.isCopy() && DstReg.isPhysical() &&
8261 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8262 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8263 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8264 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
8265 .add(Inst.getOperand(1));
8266 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
8267 DstReg)
8268 .addReg(NewDst);
8269
8270 Inst.eraseFromParent();
8271 return;
8272 }
8273
8274 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8275 Register NewDstReg = Inst.getOperand(1).getReg();
8276 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8277 if (const TargetRegisterClass *CommonRC =
8278 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8279 // Instead of creating a copy where src and dst are the same register
8280 // class, we just replace all uses of dst with src. These kinds of
8281 // copies interfere with the heuristics MachineSink uses to decide
8282 // whether or not to split a critical edge, since the pass assumes
8283 // that copies will end up as machine instructions and not be
8284 // eliminated.
8285 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8286 MRI.replaceRegWith(DstReg, NewDstReg);
8287 MRI.clearKillFlags(NewDstReg);
8288 Inst.getOperand(0).setReg(DstReg);
8289
8290 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8291 llvm_unreachable("failed to constrain register");
8292
8293 Inst.eraseFromParent();
8294 // Legalize t16 operand since replaceReg is called after addUsersToVALU
8295 for (MachineOperand &MO :
8296 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8297 legalizeOperandsVALUt16(*MO.getParent(), MRI);
8298 }
8299
8300 return;
8301 }
8302 }
8303
8304 // If this is a v2s copy between a 16-bit and a 32-bit reg, replace the
8305 // vgpr copy with a reg_sequence/extract_subreg.
8306 // This can be removed once we have sgpr16 in place.
8307 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8308 Inst.getOperand(1).getReg().isVirtual() &&
8309 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8310 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8311 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8312 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8313 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8314 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8315 get(AMDGPU::IMPLICIT_DEF), Undef);
8316 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8317 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8318 .addReg(Inst.getOperand(1).getReg())
8319 .addImm(AMDGPU::lo16)
8320 .addReg(Undef)
8321 .addImm(AMDGPU::hi16);
8322 Inst.eraseFromParent();
8323 MRI.replaceRegWith(DstReg, NewDstReg);
8324 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8325 return;
8326 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8327 AMDGPU::lo16)) {
8328 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8329 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8330 MRI.replaceRegWith(DstReg, NewDstReg);
8331 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8332 return;
8333 }
8334 }
8335
8336 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8337 MRI.replaceRegWith(DstReg, NewDstReg);
8338 legalizeOperands(Inst, MDT);
8339 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8340 return;
8341 }
8342
8343 // Use the new VALU Opcode.
8344 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8345 .setMIFlags(Inst.getFlags());
8346 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8347 // Intersperse VOP3 modifiers among the SALU operands.
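// For instance (illustrative only, assuming a scalar float op whose VALU
// counterpart is a VOP3 instruction), S_ADD_F32 %dst, %a, %b is rebuilt
// roughly as
//   V_ADD_F32_e64 %dst, 0 /*src0_modifiers*/, %a,
//                 0 /*src1_modifiers*/, %b, 0 /*clamp*/, 0 /*omod*/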
8348 NewInstr->addOperand(Inst.getOperand(0));
8349 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8350 AMDGPU::OpName::src0_modifiers) >= 0)
8351 NewInstr.addImm(0);
8352 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8353 const MachineOperand &Src = Inst.getOperand(1);
8354 NewInstr->addOperand(Src);
8355 }
8356
8357 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8358 // We are converting these to a BFE, so we need to add the missing
8359 // operands for the size and offset.
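// The BFE form takes (src, offset, width); S_SEXT_I32_I8 / S_SEXT_I32_I16
// behave like a signed bitfield extract at offset 0 with width 8 or 16,
// hence the extra zero offset and Size operands appended below.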
8360 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8361 NewInstr.addImm(0);
8362 NewInstr.addImm(Size);
8363 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8364 // The VALU version adds the second operand to the result, so insert an
8365 // extra 0 operand.
8366 NewInstr.addImm(0);
8367 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8368 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8369 // If we need to move this to VGPRs, we need to unpack the second
8370 // operand back into the 2 separate ones for bit offset and width.
8371 assert(OffsetWidthOp.isImm() &&
8372 "Scalar BFE is only implemented for constant width and offset");
8373 uint32_t Imm = OffsetWidthOp.getImm();
8374
8375 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8376 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8377 NewInstr.addImm(Offset);
8378 NewInstr.addImm(BitWidth);
8379 } else {
8380 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8381 AMDGPU::OpName::src1_modifiers) >= 0)
8382 NewInstr.addImm(0);
8383 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8384 NewInstr->addOperand(Inst.getOperand(2));
8385 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8386 AMDGPU::OpName::src2_modifiers) >= 0)
8387 NewInstr.addImm(0);
8388 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8389 NewInstr->addOperand(Inst.getOperand(3));
8390 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8391 NewInstr.addImm(0);
8392 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8393 NewInstr.addImm(0);
8394 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8395 NewInstr.addImm(0);
8396 }
8397 } else {
8398 // Just copy the SALU operands.
8399 for (const MachineOperand &Op : Inst.explicit_operands())
8400 NewInstr->addOperand(Op);
8401 }
8402
8403 // Remove any references to SCC. Vector instructions can't read from it, and
8404 // we're just about to add the implicit use / defs of VCC, and we don't want
8405 // both.
8406 for (MachineOperand &Op : Inst.implicit_operands()) {
8407 if (Op.getReg() == AMDGPU::SCC) {
8408 // Only propagate through live-def of SCC.
8409 if (Op.isDef() && !Op.isDead())
8410 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8411 if (Op.isUse())
8412 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8413 }
8414 }
8415 Inst.eraseFromParent();
8416 Register NewDstReg;
8417 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8418 Register DstReg = NewInstr->getOperand(0).getReg();
8419 assert(DstReg.isVirtual());
8420 // Update the destination register class.
8421 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8422 assert(NewDstRC);
8423 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8424 MRI.replaceRegWith(DstReg, NewDstReg);
8425 }
8426 fixImplicitOperands(*NewInstr);
8427
8428 legalizeOperandsVALUt16(*NewInstr, MRI);
8429
8430 // Legalize the operands
8431 legalizeOperands(*NewInstr, MDT);
8432 if (NewDstReg)
8433 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8434}
8435
8436// Add/sub require special handling to deal with carry outs.
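// When the subtarget has add-no-carry instructions, S_ADD_I32 / S_SUB_I32
// can be rewritten in place as V_ADD_U32_e64 / V_SUB_U32_e64, which do not
// produce a carry out; otherwise we return false and the generic lowering
// keeps the carry-writing VALU forms (see the FIXME about the u32 versions
// in moveToVALUImpl).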
8437std::pair<bool, MachineBasicBlock *>
8438SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8439 MachineDominatorTree *MDT) const {
8440 if (ST.hasAddNoCarryInsts()) {
8441 // Assume there is no user of scc since we don't select this in that case.
8442 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8443 // is used.
8444
8445 MachineBasicBlock &MBB = *Inst.getParent();
8446 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8447
8448 Register OldDstReg = Inst.getOperand(0).getReg();
8449 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8450
8451 unsigned Opc = Inst.getOpcode();
8452 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8453
8454 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8455 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8456
8457 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8458 Inst.removeOperand(3);
8459
8460 Inst.setDesc(get(NewOpc));
8461 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8462 Inst.addImplicitDefUseOperands(*MBB.getParent());
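 // At this point the SALU add/sub has been rewritten in place; illustratively,
 //   %r = S_ADD_I32 %a, %b, implicit-def dead $scc
 // has become roughly
 //   %r = V_ADD_U32_e64 %a, %b, 0 (clamp), implicit $exec
 // with the dead SCC def dropped and the result register replaced by a VGPR
 // just below.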
8463 MRI.replaceRegWith(OldDstReg, ResultReg);
8464 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8465
8466 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8467 return std::pair(true, NewBB);
8468 }
8469
8470 return std::pair(false, nullptr);
8471}
8472
8473void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8474 MachineDominatorTree *MDT) const {
8475
8476 MachineBasicBlock &MBB = *Inst.getParent();
8477 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8478 MachineBasicBlock::iterator MII = Inst;
8479 const DebugLoc &DL = Inst.getDebugLoc();
8480
8481 MachineOperand &Dest = Inst.getOperand(0);
8482 MachineOperand &Src0 = Inst.getOperand(1);
8483 MachineOperand &Src1 = Inst.getOperand(2);
8484 MachineOperand &Cond = Inst.getOperand(3);
8485
8486 Register CondReg = Cond.getReg();
8487 bool IsSCC = (CondReg == AMDGPU::SCC);
8488
8489 // If this is a trivial select where the condition is effectively not SCC
8490 // (CondReg is a source of a copy to SCC), then the select is semantically
8491 // equivalent to copying CondReg. Hence, there is no need to create a
8492 // V_CNDMASK; we can just use CondReg and bail out.
8493 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8494 (Src1.getImm() == 0)) {
8495 MRI.replaceRegWith(Dest.getReg(), CondReg);
8496 return;
8497 }
8498
8499 Register NewCondReg = CondReg;
8500 if (IsSCC) {
8501 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8502 NewCondReg = MRI.createVirtualRegister(TC);
8503
8504 // Now look for the closest SCC def: if it is a copy, replace CondReg
8505 // with the COPY's source register.
8506 bool CopyFound = false;
8507 for (MachineInstr &CandI :
8508 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8509 Inst.getParent()->rend())) {
8510 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8511 -1) {
8512 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8513 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8514 .addReg(CandI.getOperand(1).getReg());
8515 CopyFound = true;
8516 }
8517 break;
8518 }
8519 }
8520 if (!CopyFound) {
8521 // SCC def is not a copy
8522 // Insert a trivial select instead of creating a copy, because a copy from
8523 // SCC would semantically mean just copying a single bit, but we may need
8524 // the result to be a vector condition mask that needs preserving.
8525 unsigned Opcode =
8526 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8527 auto NewSelect =
8528 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8529 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8530 }
8531 }
8532
8533 Register NewDestReg = MRI.createVirtualRegister(
8534 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8535 MachineInstr *NewInst;
8536 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8537 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8538 .addImm(0)
8539 .add(Src1) // False
8540 .addImm(0)
8541 .add(Src0) // True
8542 .addReg(NewCondReg);
8543 } else {
8544 NewInst =
8545 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8546 .add(Src1) // False
8547 .add(Src0) // True
8548 .addReg(NewCondReg);
8549 }
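 // Illustrative wave32 result: "%dst = S_CSELECT_B32 %t, %f, implicit $scc"
 // has now become "%vdst = V_CNDMASK_B32_e64 0, %f, 0, %t, %newcond", which
 // picks %t in lanes where the condition mask is set and %f elsewhere.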
8550 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8551 legalizeOperands(*NewInst, MDT);
8552 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8553}
8554
8555void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8556 MachineInstr &Inst) const {
8557 MachineBasicBlock &MBB = *Inst.getParent();
8558 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8559 MachineBasicBlock::iterator MII = Inst;
8560 const DebugLoc &DL = Inst.getDebugLoc();
8561
8562 MachineOperand &Dest = Inst.getOperand(0);
8563 MachineOperand &Src = Inst.getOperand(1);
8564 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8565 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8566
8567 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8568 : AMDGPU::V_SUB_CO_U32_e32;
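 // |x| is computed as max(x, 0 - x): the subtract below produces -x and the
 // signed max then selects the non-negative value.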
8569
8570 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8571 .addImm(0)
8572 .addReg(Src.getReg());
8573
8574 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8575 .addReg(Src.getReg())
8576 .addReg(TmpReg);
8577
8578 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8579 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8580}
8581
8582void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8583 MachineInstr &Inst) const {
8584 MachineBasicBlock &MBB = *Inst.getParent();
8585 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8586 MachineBasicBlock::iterator MII = Inst;
8587 const DebugLoc &DL = Inst.getDebugLoc();
8588
8589 MachineOperand &Dest = Inst.getOperand(0);
8590 MachineOperand &Src1 = Inst.getOperand(1);
8591 MachineOperand &Src2 = Inst.getOperand(2);
8592 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8593 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8594 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8595
8596 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8597 : AMDGPU::V_SUB_CO_U32_e32;
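 // |a - b| is computed as max(a - b, 0 - (a - b)), mirroring the plain
 // scalar-abs lowering above.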
8598
8599 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8600 .addReg(Src1.getReg())
8601 .addReg(Src2.getReg());
8602
8603 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8604
8605 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8606 .addReg(SubResultReg)
8607 .addReg(TmpReg);
8608
8609 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8610 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8611}
8612
8613void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8614 MachineInstr &Inst) const {
8615 MachineBasicBlock &MBB = *Inst.getParent();
8616 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8617 MachineBasicBlock::iterator MII = Inst;
8618 const DebugLoc &DL = Inst.getDebugLoc();
8619
8620 MachineOperand &Dest = Inst.getOperand(0);
8621 MachineOperand &Src0 = Inst.getOperand(1);
8622 MachineOperand &Src1 = Inst.getOperand(2);
8623
8624 if (ST.hasDLInsts()) {
8625 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8626 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8627 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8628
8629 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8630 .add(Src0)
8631 .add(Src1);
8632
8633 MRI.replaceRegWith(Dest.getReg(), NewDest);
8634 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8635 } else {
8636 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8637 // invert either source and then perform the XOR. If either source is a
8638 // scalar register, then we can leave the inversion on the scalar unit to
8639 // achieve a better distribution of scalar and vector instructions.
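 // Quick one-bit check of the identity: for x = 1, y = 0 we get
 // !(1 ^ 0) = 0 and (!1) ^ 0 = 0, so inverting one operand first
 // preserves the result.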
8640 bool Src0IsSGPR = Src0.isReg() &&
8641 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8642 bool Src1IsSGPR = Src1.isReg() &&
8643 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8644 MachineInstr *Xor;
8645 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8646 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8647
8648 // Build a pair of scalar instructions and add them to the work list.
8649 // The next iteration over the work list will lower these to the vector
8650 // unit as necessary.
8651 if (Src0IsSGPR) {
8652 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8653 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8654 .addReg(Temp)
8655 .add(Src1);
8656 } else if (Src1IsSGPR) {
8657 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8658 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8659 .add(Src0)
8660 .addReg(Temp);
8661 } else {
8662 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8663 .add(Src0)
8664 .add(Src1);
8665 MachineInstr *Not =
8666 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8667 Worklist.insert(Not);
8668 }
8669
8670 MRI.replaceRegWith(Dest.getReg(), NewDest);
8671
8672 Worklist.insert(Xor);
8673
8674 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8675 }
8676}
8677
8678void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8679 MachineInstr &Inst,
8680 unsigned Opcode) const {
8681 MachineBasicBlock &MBB = *Inst.getParent();
8682 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8683 MachineBasicBlock::iterator MII = Inst;
8684 const DebugLoc &DL = Inst.getDebugLoc();
8685
8686 MachineOperand &Dest = Inst.getOperand(0);
8687 MachineOperand &Src0 = Inst.getOperand(1);
8688 MachineOperand &Src1 = Inst.getOperand(2);
8689
8690 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8691 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8692
8693 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8694 .add(Src0)
8695 .add(Src1);
8696
8697 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8698 .addReg(Interm);
8699
8700 Worklist.insert(&Op);
8701 Worklist.insert(&Not);
8702
8703 MRI.replaceRegWith(Dest.getReg(), NewDest);
8704 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8705}
8706
8707void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8708 MachineInstr &Inst,
8709 unsigned Opcode) const {
8710 MachineBasicBlock &MBB = *Inst.getParent();
8711 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8712 MachineBasicBlock::iterator MII = Inst;
8713 const DebugLoc &DL = Inst.getDebugLoc();
8714
8715 MachineOperand &Dest = Inst.getOperand(0);
8716 MachineOperand &Src0 = Inst.getOperand(1);
8717 MachineOperand &Src1 = Inst.getOperand(2);
8718
8719 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8720 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8721
8722 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8723 .add(Src1);
8724
8725 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8726 .add(Src0)
8727 .addReg(Interm);
8728
8729 Worklist.insert(&Not);
8730 Worklist.insert(&Op);
8731
8732 MRI.replaceRegWith(Dest.getReg(), NewDest);
8733 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8734}
8735
8736void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8737 MachineInstr &Inst, unsigned Opcode,
8738 bool Swap) const {
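 // Illustrative example: an S_NOT_B64 is split here into a 32-bit VALU op on
 // sub0 and another on sub1 (the opcode is supplied by the caller), and the
 // halves are recombined with a REG_SEQUENCE below.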
8739 MachineBasicBlock &MBB = *Inst.getParent();
8740 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8741
8742 MachineOperand &Dest = Inst.getOperand(0);
8743 MachineOperand &Src0 = Inst.getOperand(1);
8744 const DebugLoc &DL = Inst.getDebugLoc();
8745
8746 MachineBasicBlock::iterator MII = Inst;
8747
8748 const MCInstrDesc &InstDesc = get(Opcode);
8749 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8750 MRI.getRegClass(Src0.getReg()) :
8751 &AMDGPU::SGPR_32RegClass;
8752
8753 const TargetRegisterClass *Src0SubRC =
8754 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8755
8756 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8757 AMDGPU::sub0, Src0SubRC);
8758
8759 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8760 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8761 const TargetRegisterClass *NewDestSubRC =
8762 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8763
8764 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8765 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8766
8767 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8768 AMDGPU::sub1, Src0SubRC);
8769
8770 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8771 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8772
8773 if (Swap)
8774 std::swap(DestSub0, DestSub1);
8775
8776 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8777 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8778 .addReg(DestSub0)
8779 .addImm(AMDGPU::sub0)
8780 .addReg(DestSub1)
8781 .addImm(AMDGPU::sub1);
8782
8783 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8784
8785 Worklist.insert(&LoHalf);
8786 Worklist.insert(&HiHalf);
8787
8788 // We don't need to legalizeOperands here because for a single operand, src0
8789 // will support any kind of input.
8790
8791 // Move all users of this moved value.
8792 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8793}
8794
8795// There is no vector equivalent of s_mul_u64. For this reason, we need to
8796// split the s_mul_u64 into 32-bit vector multiplications.
8797void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8798 MachineInstr &Inst,
8799 MachineDominatorTree *MDT) const {
8800 MachineBasicBlock &MBB = *Inst.getParent();
8801 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8802
8803 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8804 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8805 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8806
8807 MachineOperand &Dest = Inst.getOperand(0);
8808 MachineOperand &Src0 = Inst.getOperand(1);
8809 MachineOperand &Src1 = Inst.getOperand(2);
8810 const DebugLoc &DL = Inst.getDebugLoc();
8811 MachineBasicBlock::iterator MII = Inst;
8812
8813 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8814 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8815 const TargetRegisterClass *Src0SubRC =
8816 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8817 if (RI.isSGPRClass(Src0SubRC))
8818 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8819 const TargetRegisterClass *Src1SubRC =
8820 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8821 if (RI.isSGPRClass(Src1SubRC))
8822 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8823
8824 // First, we extract the low 32-bit and high 32-bit values from each of the
8825 // operands.
8826 MachineOperand Op0L =
8827 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8828 MachineOperand Op1L =
8829 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8830 MachineOperand Op0H =
8831 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8832 MachineOperand Op1H =
8833 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8834
8835 // The multiplication is done as follows:
8836 //
8837 // Op1H Op1L
8838 // * Op0H Op0L
8839 // --------------------
8840 // Op1H*Op0L Op1L*Op0L
8841 // + Op1H*Op0H Op1L*Op0H
8842 // -----------------------------------------
8843 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8844 //
8845 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
8846 // value and that would overflow.
8847 // The low 32-bit value is Op1L*Op0L.
8848 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8849
8850 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8851 MachineInstr *Op1L_Op0H =
8852 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8853 .add(Op1L)
8854 .add(Op0H);
8855
8856 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8857 MachineInstr *Op1H_Op0L =
8858 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8859 .add(Op1H)
8860 .add(Op0L);
8861
8862 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8863 MachineInstr *Carry =
8864 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8865 .add(Op1L)
8866 .add(Op0L);
8867
8868 MachineInstr *LoHalf =
8869 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8870 .add(Op1L)
8871 .add(Op0L);
8872
8873 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8874 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8875 .addReg(Op1L_Op0H_Reg)
8876 .addReg(Op1H_Op0L_Reg);
8877
8878 MachineInstr *HiHalf =
8879 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8880 .addReg(AddReg)
8881 .addReg(CarryReg);
8882
8883 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8884 .addReg(DestSub0)
8885 .addImm(AMDGPU::sub0)
8886 .addReg(DestSub1)
8887 .addImm(AMDGPU::sub1);
8888
8889 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8890
8891 // Try to legalize the operands in case we need to swap the order to keep it
8892 // valid.
8893 legalizeOperands(*Op1L_Op0H, MDT);
8894 legalizeOperands(*Op1H_Op0L, MDT);
8895 legalizeOperands(*Carry, MDT);
8896 legalizeOperands(*LoHalf, MDT);
8897 legalizeOperands(*Add, MDT);
8898 legalizeOperands(*HiHalf, MDT);
8899
8900 // Move all users of this moved value.
8901 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8902}
8903
8904// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8905// multiplications.
8906void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8907 MachineInstr &Inst,
8908 MachineDominatorTree *MDT) const {
8909 MachineBasicBlock &MBB = *Inst.getParent();
8910 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8911
8912 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8913 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8914 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8915
8916 MachineOperand &Dest = Inst.getOperand(0);
8917 MachineOperand &Src0 = Inst.getOperand(1);
8918 MachineOperand &Src1 = Inst.getOperand(2);
8919 const DebugLoc &DL = Inst.getDebugLoc();
8920 MachineBasicBlock::iterator MII = Inst;
8921
8922 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8923 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8924 const TargetRegisterClass *Src0SubRC =
8925 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8926 if (RI.isSGPRClass(Src0SubRC))
8927 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8928 const TargetRegisterClass *Src1SubRC =
8929 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8930 if (RI.isSGPRClass(Src1SubRC))
8931 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8932
8933 // First, we extract the low 32-bit and high 32-bit values from each of the
8934 // operands.
8935 MachineOperand Op0L =
8936 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8937 MachineOperand Op1L =
8938 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
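 // These pseudos multiply operands known to be zero- or sign-extended from
 // 32 bits, so only the low halves are needed: the 64-bit product is
 // {mul_hi(lo0, lo1), mul_lo(lo0, lo1)} with the signedness chosen below.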
8939
8940 unsigned Opc = Inst.getOpcode();
8941 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8942 ? AMDGPU::V_MUL_HI_U32_e64
8943 : AMDGPU::V_MUL_HI_I32_e64;
8944 MachineInstr *HiHalf =
8945 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8946
8947 MachineInstr *LoHalf =
8948 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8949 .add(Op1L)
8950 .add(Op0L);
8951
8952 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8953 .addReg(DestSub0)
8954 .addImm(AMDGPU::sub0)
8955 .addReg(DestSub1)
8956 .addImm(AMDGPU::sub1);
8957
8958 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8959
8960 // Try to legalize the operands in case we need to swap the order to keep it
8961 // valid.
8962 legalizeOperands(*HiHalf, MDT);
8963 legalizeOperands(*LoHalf, MDT);
8964
8965 // Move all users of this moved value.
8966 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8967}
8968
8969void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8970 MachineInstr &Inst, unsigned Opcode,
8971 MachineDominatorTree *MDT) const {
8972 MachineBasicBlock &MBB = *Inst.getParent();
8973 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8974
8975 MachineOperand &Dest = Inst.getOperand(0);
8976 MachineOperand &Src0 = Inst.getOperand(1);
8977 MachineOperand &Src1 = Inst.getOperand(2);
8978 const DebugLoc &DL = Inst.getDebugLoc();
8979
8980 MachineBasicBlock::iterator MII = Inst;
8981
8982 const MCInstrDesc &InstDesc = get(Opcode);
8983 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8984 MRI.getRegClass(Src0.getReg()) :
8985 &AMDGPU::SGPR_32RegClass;
8986
8987 const TargetRegisterClass *Src0SubRC =
8988 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8989 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8990 MRI.getRegClass(Src1.getReg()) :
8991 &AMDGPU::SGPR_32RegClass;
8992
8993 const TargetRegisterClass *Src1SubRC =
8994 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8995
8996 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8997 AMDGPU::sub0, Src0SubRC);
8998 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8999 AMDGPU::sub0, Src1SubRC);
9000 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9001 AMDGPU::sub1, Src0SubRC);
9002 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9003 AMDGPU::sub1, Src1SubRC);
9004
9005 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9006 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9007 const TargetRegisterClass *NewDestSubRC =
9008 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9009
9010 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9011 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9012 .add(SrcReg0Sub0)
9013 .add(SrcReg1Sub0);
9014
9015 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9016 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9017 .add(SrcReg0Sub1)
9018 .add(SrcReg1Sub1);
9019
9020 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9021 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9022 .addReg(DestSub0)
9023 .addImm(AMDGPU::sub0)
9024 .addReg(DestSub1)
9025 .addImm(AMDGPU::sub1);
9026
9027 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9028
9029 Worklist.insert(&LoHalf);
9030 Worklist.insert(&HiHalf);
9031
9032 // Move all users of this moved value.
9033 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9034}
9035
9036void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9037 MachineInstr &Inst,
9038 MachineDominatorTree *MDT) const {
9039 MachineBasicBlock &MBB = *Inst.getParent();
9040 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9041
9042 MachineOperand &Dest = Inst.getOperand(0);
9043 MachineOperand &Src0 = Inst.getOperand(1);
9044 MachineOperand &Src1 = Inst.getOperand(2);
9045 const DebugLoc &DL = Inst.getDebugLoc();
9046
9047 MachineBasicBlock::iterator MII = Inst;
9048
9049 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9050
9051 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9052
9053 MachineOperand* Op0;
9054 MachineOperand* Op1;
9055
9056 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9057 Op0 = &Src0;
9058 Op1 = &Src1;
9059 } else {
9060 Op0 = &Src1;
9061 Op1 = &Src0;
9062 }
9063
9064 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9065 .add(*Op0);
9066
9067 Register NewDest = MRI.createVirtualRegister(DestRC);
9068
9069 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9070 .addReg(Interm)
9071 .add(*Op1);
9072
9073 MRI.replaceRegWith(Dest.getReg(), NewDest);
9074
9075 Worklist.insert(&Xor);
9076}
9077
9078void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9079 MachineInstr &Inst) const {
9080 MachineBasicBlock &MBB = *Inst.getParent();
9081 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9082
9083 MachineBasicBlock::iterator MII = Inst;
9084 const DebugLoc &DL = Inst.getDebugLoc();
9085
9086 MachineOperand &Dest = Inst.getOperand(0);
9087 MachineOperand &Src = Inst.getOperand(1);
9088
9089 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9090 const TargetRegisterClass *SrcRC = Src.isReg() ?
9091 MRI.getRegClass(Src.getReg()) :
9092 &AMDGPU::SGPR_32RegClass;
9093
9094 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9095 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9096
9097 const TargetRegisterClass *SrcSubRC =
9098 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9099
9100 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9101 AMDGPU::sub0, SrcSubRC);
9102 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9103 AMDGPU::sub1, SrcSubRC);
9104
9105 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9106
9107 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9108
9109 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9110
9111 // We don't need to legalize operands here. src0 for either instruction can be
9112 // an SGPR, and the second input is unused or determined here.
9113 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9114}
9115
9116void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9117 MachineInstr &Inst) const {
9118 MachineBasicBlock &MBB = *Inst.getParent();
9119 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9120 MachineBasicBlock::iterator MII = Inst;
9121 const DebugLoc &DL = Inst.getDebugLoc();
9122
9123 MachineOperand &Dest = Inst.getOperand(0);
9124 uint32_t Imm = Inst.getOperand(2).getImm();
9125 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9126 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9127
9128 (void) Offset;
9129
9130 // Only sext_inreg cases handled.
9131 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9132 Offset == 0 && "Not implemented");
9133
9134 if (BitWidth < 32) {
9135 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9136 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9137 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9138
9139 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9140 .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
9141 .addImm(0)
9142 .addImm(BitWidth);
9143
9144 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9145 .addImm(31)
9146 .addReg(MidRegLo);
9147
9148 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9149 .addReg(MidRegLo)
9150 .addImm(AMDGPU::sub0)
9151 .addReg(MidRegHi)
9152 .addImm(AMDGPU::sub1);
9153
9154 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9155 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9156 return;
9157 }
9158
9159 MachineOperand &Src = Inst.getOperand(1);
9160 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9161 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9162
9163 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9164 .addImm(31)
9165 .addReg(Src.getReg(), {}, AMDGPU::sub0);
9166
9167 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9168 .addReg(Src.getReg(), {}, AMDGPU::sub0)
9169 .addImm(AMDGPU::sub0)
9170 .addReg(TmpReg)
9171 .addImm(AMDGPU::sub1);
9172
9173 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9174 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9175}
9176
9177void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9178 MachineInstr &Inst, unsigned Opcode,
9179 MachineDominatorTree *MDT) const {
9180 // (S_FLBIT_I32_B64 hi:lo) ->
9181 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9182 // (S_FF1_I32_B64 hi:lo) ->
9183 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
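 // Worked example for the ctlz form: for hi:lo = 0x00000000:0x00000001,
 // V_FFBH_U32(hi) = 0xffffffff (no bit found) and
 // uaddsat(V_FFBH_U32(lo), 32) = 31 + 32 = 63, so the umin yields 63.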
9184
9185 MachineBasicBlock &MBB = *Inst.getParent();
9186 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9187 MachineBasicBlock::iterator MII = Inst;
9188 const DebugLoc &DL = Inst.getDebugLoc();
9189
9190 MachineOperand &Dest = Inst.getOperand(0);
9191 MachineOperand &Src = Inst.getOperand(1);
9192
9193 const MCInstrDesc &InstDesc = get(Opcode);
9194
9195 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9196 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9197 : AMDGPU::V_ADD_CO_U32_e32;
9198
9199 const TargetRegisterClass *SrcRC =
9200 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9201 const TargetRegisterClass *SrcSubRC =
9202 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9203
9204 MachineOperand SrcRegSub0 =
9205 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9206 MachineOperand SrcRegSub1 =
9207 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9208
9209 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9210 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9211 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9212 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9213
9214 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9215
9216 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9217
9218 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9219 .addReg(IsCtlz ? MidReg1 : MidReg2)
9220 .addImm(32)
9221 .addImm(1); // enable clamp
9222
9223 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9224 .addReg(MidReg3)
9225 .addReg(IsCtlz ? MidReg2 : MidReg1);
9226
9227 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9228
9229 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9230}
9231
9232void SIInstrInfo::addUsersToMoveToVALUWorklist(
9233 Register DstReg, MachineRegisterInfo &MRI,
9234 SIInstrWorklist &Worklist) const {
9235 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9236 MachineInstr &UseMI = *MO.getParent();
9237
9238 unsigned OpNo = 0;
9239
9240 switch (UseMI.getOpcode()) {
9241 case AMDGPU::COPY:
9242 case AMDGPU::WQM:
9243 case AMDGPU::SOFT_WQM:
9244 case AMDGPU::STRICT_WWM:
9245 case AMDGPU::STRICT_WQM:
9246 case AMDGPU::REG_SEQUENCE:
9247 case AMDGPU::PHI:
9248 case AMDGPU::INSERT_SUBREG:
9249 break;
9250 default:
9251 OpNo = MO.getOperandNo();
9252 break;
9253 }
9254
9255 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9256 MRI.constrainRegClass(DstReg, OpRC);
9257
9258 if (!RI.hasVectorRegisters(OpRC))
9259 Worklist.insert(&UseMI);
9260 else
9261 // Legalization could change user list.
9262 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9263 }
9264}
9265
9266void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9267 MachineRegisterInfo &MRI,
9268 MachineInstr &Inst) const {
9269 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9270 MachineBasicBlock *MBB = Inst.getParent();
9271 MachineOperand &Src0 = Inst.getOperand(1);
9272 MachineOperand &Src1 = Inst.getOperand(2);
9273 const DebugLoc &DL = Inst.getDebugLoc();
9274
9275 if (ST.useRealTrue16Insts()) {
9276 Register SrcReg0, SrcReg1;
9277 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9278 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9279 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
9280 } else {
9281 SrcReg0 = Src0.getReg();
9282 }
9283
9284 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9285 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9286 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
9287 } else {
9288 SrcReg1 = Src1.getReg();
9289 }
9290
9291 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9292 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9293
9294 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9295 switch (Inst.getOpcode()) {
9296 case AMDGPU::S_PACK_LL_B32_B16:
9297 NewMI
9298 .addReg(SrcReg0, {},
9299 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9300 .addImm(AMDGPU::lo16)
9301 .addReg(SrcReg1, {},
9302 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9303 .addImm(AMDGPU::hi16);
9304 break;
9305 case AMDGPU::S_PACK_LH_B32_B16:
9306 NewMI
9307 .addReg(SrcReg0, {},
9308 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9309 .addImm(AMDGPU::lo16)
9310 .addReg(SrcReg1, {}, AMDGPU::hi16)
9311 .addImm(AMDGPU::hi16);
9312 break;
9313 case AMDGPU::S_PACK_HL_B32_B16:
9314 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9315 .addImm(AMDGPU::lo16)
9316 .addReg(SrcReg1, {},
9317 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9318 .addImm(AMDGPU::hi16);
9319 break;
9320 case AMDGPU::S_PACK_HH_B32_B16:
9321 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9322 .addImm(AMDGPU::lo16)
9323 .addReg(SrcReg1, {}, AMDGPU::hi16)
9324 .addImm(AMDGPU::hi16);
9325 break;
9326 default:
9327 llvm_unreachable("unhandled s_pack_* instruction");
9328 }
9329
9330 MachineOperand &Dest = Inst.getOperand(0);
9331 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9332 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9333 return;
9334 }
9335
9336 switch (Inst.getOpcode()) {
9337 case AMDGPU::S_PACK_LL_B32_B16: {
9338 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9339 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9340
9341 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9342 // 0.
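 // The sequence below computes (Src1 << 16) | (Src0 & 0xffff), i.e. Src0
 // provides the low 16 bits and Src1 the high 16 bits of the packed result.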
9343 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9344 .addImm(0xffff);
9345
9346 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9347 .addReg(ImmReg, RegState::Kill)
9348 .add(Src0);
9349
9350 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9351 .add(Src1)
9352 .addImm(16)
9353 .addReg(TmpReg, RegState::Kill);
9354 break;
9355 }
9356 case AMDGPU::S_PACK_LH_B32_B16: {
9357 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9358 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9359 .addImm(0xffff);
9360 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9361 .addReg(ImmReg, RegState::Kill)
9362 .add(Src0)
9363 .add(Src1);
9364 break;
9365 }
9366 case AMDGPU::S_PACK_HL_B32_B16: {
9367 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9368 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9369 .addImm(16)
9370 .add(Src0);
9371 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9372 .add(Src1)
9373 .addImm(16)
9374 .addReg(TmpReg, RegState::Kill);
9375 break;
9376 }
9377 case AMDGPU::S_PACK_HH_B32_B16: {
9378 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9379 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9380 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9381 .addImm(16)
9382 .add(Src0);
9383 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9384 .addImm(0xffff0000);
9385 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9386 .add(Src1)
9387 .addReg(ImmReg, RegState::Kill)
9388 .addReg(TmpReg, RegState::Kill);
9389 break;
9390 }
9391 default:
9392 llvm_unreachable("unhandled s_pack_* instruction");
9393 }
9394
9395 MachineOperand &Dest = Inst.getOperand(0);
9396 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9397 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9398}
9399
9400void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9401 MachineInstr &SCCDefInst,
9402 SIInstrWorklist &Worklist,
9403 Register NewCond) const {
9404
9405 // Ensure that def inst defines SCC, which is still live.
9406 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9407 !Op.isDead() && Op.getParent() == &SCCDefInst);
9408 SmallVector<MachineInstr *, 4> CopyToDelete;
9409 // This assumes that all the users of SCC are in the same block
9410 // as the SCC def.
9411 for (MachineInstr &MI : // Skip the def inst itself.
9412 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9413 SCCDefInst.getParent()->end())) {
9414 // Check if SCC is used first.
9415 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9416 if (SCCIdx != -1) {
9417 if (MI.isCopy()) {
9418 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9419 Register DestReg = MI.getOperand(0).getReg();
9420
9421 MRI.replaceRegWith(DestReg, NewCond);
9422 CopyToDelete.push_back(&MI);
9423 } else {
9424
9425 if (NewCond.isValid())
9426 MI.getOperand(SCCIdx).setReg(NewCond);
9427
9428 Worklist.insert(&MI);
9429 }
9430 }
9431 // Exit if we find another SCC def.
9432 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9433 break;
9434 }
9435 for (auto &Copy : CopyToDelete)
9436 Copy->eraseFromParent();
9437}
9438
9439// Instructions that use SCC may be converted to VALU instructions. When that
9440// happens, the SCC register is changed to VCC_LO. The instruction that defines
9441// SCC must be changed to an instruction that defines VCC. This function makes
9442// sure that the instruction that defines SCC is added to the moveToVALU
9443// worklist.
9444void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9445 SIInstrWorklist &Worklist) const {
9446 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9447 // then there is nothing to do because the defining instruction has been
9448 // converted to a VALU already. If SCC then that instruction needs to be
9449 // converted to a VALU.
9450 for (MachineInstr &MI :
9451 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9452 SCCUseInst->getParent()->rend())) {
9453 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9454 break;
9455 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9456 Worklist.insert(&MI);
9457 break;
9458 }
9459 }
9460}
9461
9462const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9463 const MachineInstr &Inst) const {
9464 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9465
9466 switch (Inst.getOpcode()) {
9467 // For target instructions, getOpRegClass just returns the virtual register
9468 // class associated with the operand, so we need to find an equivalent VGPR
9469 // register class in order to move the instruction to the VALU.
9470 case AMDGPU::COPY:
9471 case AMDGPU::PHI:
9472 case AMDGPU::REG_SEQUENCE:
9473 case AMDGPU::INSERT_SUBREG:
9474 case AMDGPU::WQM:
9475 case AMDGPU::SOFT_WQM:
9476 case AMDGPU::STRICT_WWM:
9477 case AMDGPU::STRICT_WQM: {
9478 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9479 if (RI.isAGPRClass(SrcRC)) {
9480 if (RI.isAGPRClass(NewDstRC))
9481 return nullptr;
9482
9483 switch (Inst.getOpcode()) {
9484 case AMDGPU::PHI:
9485 case AMDGPU::REG_SEQUENCE:
9486 case AMDGPU::INSERT_SUBREG:
9487 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9488 break;
9489 default:
9490 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9491 }
9492
9493 if (!NewDstRC)
9494 return nullptr;
9495 } else {
9496 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9497 return nullptr;
9498
9499 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9500 if (!NewDstRC)
9501 return nullptr;
9502 }
9503
9504 return NewDstRC;
9505 }
9506 default:
9507 return NewDstRC;
9508 }
9509}
9510
9511// Find the one SGPR operand we are allowed to use.
9512Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9513 int OpIndices[3]) const {
9514 const MCInstrDesc &Desc = MI.getDesc();
9515
9516 // Find the one SGPR operand we are allowed to use.
9517 //
9518 // First we need to consider the instruction's operand requirements before
9519 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9520 // of VCC, but we are still bound by the constant bus requirement to only use
9521 // one.
9522 //
9523 // If the operand's class is an SGPR, we can never move it.
9524
9525 Register SGPRReg = findImplicitSGPRRead(MI);
9526 if (SGPRReg)
9527 return SGPRReg;
9528
9529 Register UsedSGPRs[3] = {Register()};
9530 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9531
9532 for (unsigned i = 0; i < 3; ++i) {
9533 int Idx = OpIndices[i];
9534 if (Idx == -1)
9535 break;
9536
9537 const MachineOperand &MO = MI.getOperand(Idx);
9538 if (!MO.isReg())
9539 continue;
9540
9541 // Is this operand statically required to be an SGPR based on the operand
9542 // constraints?
9543 const TargetRegisterClass *OpRC =
9544 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9545 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9546 if (IsRequiredSGPR)
9547 return MO.getReg();
9548
9549 // If this could be a VGPR or an SGPR, check the dynamic register class.
9550 Register Reg = MO.getReg();
9551 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9552 if (RI.isSGPRClass(RegRC))
9553 UsedSGPRs[i] = Reg;
9554 }
9555
9556 // We don't have a required SGPR operand, so we have a bit more freedom in
9557 // selecting operands to move.
9558
9559 // Try to select the most used SGPR. If an SGPR is equal to one of the
9560 // others, we choose that.
9561 //
9562 // e.g.
9563 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9564 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9565
9566 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9567 // prefer those.
9568
9569 if (UsedSGPRs[0]) {
9570 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9571 SGPRReg = UsedSGPRs[0];
9572 }
9573
9574 if (!SGPRReg && UsedSGPRs[1]) {
9575 if (UsedSGPRs[1] == UsedSGPRs[2])
9576 SGPRReg = UsedSGPRs[1];
9577 }
9578
9579 return SGPRReg;
9580}
9581
9582MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9583 AMDGPU::OpName OperandName) const {
9584 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9585 return nullptr;
9586
9587 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9588 if (Idx == -1)
9589 return nullptr;
9590
9591 return &MI.getOperand(Idx);
9592}
9593
9594uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
9595 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9596 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9597 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT
9598 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT;
9599 return (Format << 44) |
9600 (1ULL << 56) | // RESOURCE_LEVEL = 1
9601 (3ULL << 60); // OOB_SELECT = 3
9602 }
9603
9604 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9605 if (ST.isAmdHsaOS()) {
9606 // Set ATC = 1. GFX9 doesn't have this bit.
9607 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9608 RsrcDataFormat |= (1ULL << 56);
9609
9610 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9611 // BTW, it disables TC L2 and therefore decreases performance.
9612 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9613 RsrcDataFormat |= (2ULL << 59);
9614 }
9615
9616 return RsrcDataFormat;
9617}
9618
9619uint64_t SIInstrInfo::getScratchRsrcWords23() const {
9620 uint64_t Rsrc23 = AMDGPU::RSRC_DATA_FORMAT |
9621 AMDGPU::RSRC_TID_ENABLE |
9622 0xffffffff; // Size;
9623
9624 // GFX9 doesn't have ELEMENT_SIZE.
9625 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9626 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9627 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9628 }
9629
9630 // IndexStride = 64 for wave64, 32 for wave32.
9631 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9632 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9633
9634 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9635 // Clear them unless we want a huge stride.
9636 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9637 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9638 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9639
9640 return Rsrc23;
9641}
9642
9643bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
9644 unsigned Opc = MI.getOpcode();
9645
9646 return isSMRD(Opc);
9647}
9648
9649bool SIInstrInfo::isHighLatencyDef(int Opc) const {
9650 return get(Opc).mayLoad() &&
9651 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9652}
9653
9654Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
9655 int &FrameIndex) const {
9656 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9657 if (!Addr || !Addr->isFI())
9658 return Register();
9659
9660 assert(!MI.memoperands_empty() &&
9661 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9662
9663 FrameIndex = Addr->getIndex();
9664 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
9665}
9666
9667Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
9668 int &FrameIndex) const {
9669 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9670 assert(Addr && Addr->isFI());
9671 FrameIndex = Addr->getIndex();
9672 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
9673}
9674
9675Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
9676 int &FrameIndex) const {
9677 if (!MI.mayLoad())
9678 return Register();
9679
9680 if (isMUBUF(MI) || isVGPRSpill(MI))
9681 return isStackAccess(MI, FrameIndex);
9682
9683 if (isSGPRSpill(MI))
9684 return isSGPRStackAccess(MI, FrameIndex);
9685
9686 return Register();
9687}
9688
9689Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
9690 int &FrameIndex) const {
9691 if (!MI.mayStore())
9692 return Register();
9693
9694 if (isMUBUF(MI) || isVGPRSpill(MI))
9695 return isStackAccess(MI, FrameIndex);
9696
9697 if (isSGPRSpill(MI))
9698 return isSGPRStackAccess(MI, FrameIndex);
9699
9700 return Register();
9701}
9702
9703unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
9704 unsigned Size = 0;
9705 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
9706 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
9707 while (++I != E && I->isInsideBundle()) {
9708 assert(!I->isBundle() && "No nested bundle!");
9709 Size += getInstSizeInBytes(*I);
9710 }
9711
9712 return Size;
9713}
9714
9715unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
9716 unsigned Opc = MI.getOpcode();
9717 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
9718 unsigned DescSize = Desc.getSize();
9719
9720 // If we have a definitive size, we can use it. Otherwise we need to inspect
9721 // the operands to know the size.
9722 if (isFixedSize(MI)) {
9723 unsigned Size = DescSize;
9724
9725 // If we hit the buggy offset, an extra nop will be inserted in MC so
9726 // estimate the worst case.
9727 if (MI.isBranch() && ST.hasOffset3fBug())
9728 Size += 4;
9729
9730 return Size;
9731 }
9732
9733 // Instructions may have a 32-bit literal encoded after them. Check
9734 // operands that could ever be literals.
9735 if (isVALU(MI) || isSALU(MI)) {
9736 if (isDPP(MI))
9737 return DescSize;
9738 bool HasLiteral = false;
9739 unsigned LiteralSize = 4;
9740 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9741 const MachineOperand &Op = MI.getOperand(I);
9742 const MCOperandInfo &OpInfo = Desc.operands()[I];
9743 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9744 HasLiteral = true;
9745 if (ST.has64BitLiterals()) {
9746 switch (OpInfo.OperandType) {
9747 default:
9748 break;
9749 case AMDGPU::OPERAND_REG_IMM_FP64:
9750 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9751 LiteralSize = 8;
9752 break;
9753 case AMDGPU::OPERAND_REG_IMM_INT64:
9754 if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false))
9755 LiteralSize = 8;
9756 break;
9757 }
9758 }
9759 break;
9760 }
9761 }
9762 return HasLiteral ? DescSize + LiteralSize : DescSize;
9763 }
9764
9765 // Check whether we have extra NSA words.
9766 if (isMIMG(MI)) {
9767 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9768 if (VAddr0Idx < 0)
9769 return 8;
9770
9771 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9772 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9773 }
9774
9775 switch (Opc) {
9776 case TargetOpcode::BUNDLE:
9777 return getInstBundleSize(MI);
9778 case TargetOpcode::INLINEASM:
9779 case TargetOpcode::INLINEASM_BR: {
9780 const MachineFunction *MF = MI.getMF();
9781 const char *AsmStr = MI.getOperand(0).getSymbolName();
9782 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
9783 }
9784 default:
9785 if (MI.isMetaInstruction())
9786 return 0;
9787
9788 // If D16 Pseudo inst, get correct MC code size
9789 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9790 if (D16Info) {
9791 // Assume the d16_lo/hi instructions are always the same size.
9792 unsigned LoInstOpcode = D16Info->LoOp;
9793 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9794 DescSize = Desc.getSize();
9795 }
9796
9797 // If FMA Pseudo inst, get correct MC code size
9798 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9799 // All potential lowerings are the same size; arbitrarily pick one.
9800 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9801 DescSize = Desc.getSize();
9802 }
9803
9804 return DescSize;
9805 }
9806}
9807
9808bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
9809 if (!isFLAT(MI))
9810 return false;
9811
9812 if (MI.memoperands_empty())
9813 return true;
9814
9815 for (const MachineMemOperand *MMO : MI.memoperands()) {
9816 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9817 return true;
9818 }
9819 return false;
9820}
9821
9822ArrayRef<std::pair<int, const char *>>
9823SIInstrInfo::getSerializableTargetIndices() const {
9824 static const std::pair<int, const char *> TargetIndices[] = {
9825 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
9826 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
9827 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
9828 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
9829 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
9830 return ArrayRef(TargetIndices);
9831}
9832
9833/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
9834/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
9835ScheduleHazardRecognizer *
9836SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9837 const ScheduleDAG *DAG) const {
9838 return new GCNHazardRecognizer(DAG->MF);
9839}
9840
9841/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
9842/// pass.
9843ScheduleHazardRecognizer *
9844SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
9845 return new GCNHazardRecognizer(MF);
9846}
9847
9848// Called during:
9849// - pre-RA scheduling and post-RA scheduling
9850ScheduleHazardRecognizer *
9851SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
9852 const ScheduleDAGMI *DAG) const {
9853 // Borrowed from Arm Target
9854 // We would like to restrict this hazard recognizer to only
9855 // post-RA scheduling; we can tell that we're post-RA because we don't
9856 // track VRegLiveness.
9857 if (!DAG->hasVRegLiveness())
9858 return new GCNHazardRecognizer(DAG->MF);
9859 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG);
9860}
9861
9862std::pair<unsigned, unsigned>
9863SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9864 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9865}
9866
9867ArrayRef<std::pair<unsigned, const char *>>
9868SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9869 static const std::pair<unsigned, const char *> TargetFlags[] = {
9870 {MO_GOTPCREL, "amdgpu-gotprel"},
9871 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
9872 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
9873 {MO_GOTPCREL64, "amdgpu-gotprel64"},
9874 {MO_REL32_LO, "amdgpu-rel32-lo"},
9875 {MO_REL32_HI, "amdgpu-rel32-hi"},
9876 {MO_REL64, "amdgpu-rel64"},
9877 {MO_ABS32_LO, "amdgpu-abs32-lo"},
9878 {MO_ABS32_HI, "amdgpu-abs32-hi"},
9879 {MO_ABS64, "amdgpu-abs64"},
9880 };
9881
9882 return ArrayRef(TargetFlags);
9883}
9884
9885ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9886SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9887 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9888 {
9889 {MONoClobber, "amdgpu-noclobber"},
9890 {MOLastUse, "amdgpu-last-use"},
9891 {MOCooperative, "amdgpu-cooperative"},
9892 };
9893
9894 return ArrayRef(TargetFlags);
9895}
9896
9897unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
9898 const MachineFunction &MF) const {
9899 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9900 assert(SrcReg.isVirtual());
9901 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9902 return AMDGPU::WWM_COPY;
9903
9904 return AMDGPU::COPY;
9905}
9906
9907bool SIInstrInfo::canAddToBBProlog(const MachineInstr &MI) const {
9908 uint16_t Opcode = MI.getOpcode();
9909 // Check if it is SGPR spill or wwm-register spill Opcode.
9910 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
9911 return true;
9912
9913 const MachineFunction *MF = MI.getMF();
9914 const MachineRegisterInfo &MRI = MF->getRegInfo();
9915 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
9916
9917 // See if this is a live-range split instruction inserted for an SGPR or
9918 // wwm-register. The implicit defs inserted for wwm-registers should also be
9919 // included, as they can appear at the beginning of a basic block.
9920 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
9921 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
9922 return false;
9923
9924 Register Reg = MI.getOperand(0).getReg();
9925 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
9926 return IsLRSplitInst;
9927
9928 return MFI->isWWMReg(Reg);
9929}
9930
9931bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
9932 Register Reg) const {
9933 // We need to handle instructions which may be inserted during register
9934 // allocation to handle the prolog. The initial prolog instruction may have
9935 // been separated from the start of the block by spills and copies inserted
9936 // needed by the prolog. However, the insertions for scalar registers can
9937 // always be placed at the BB top as they are independent of the exec mask
9938 // value.
9939 bool IsNullOrVectorRegister = true;
9940 if (Reg) {
9941 const MachineFunction *MF = MI.getMF();
9942 const MachineRegisterInfo &MRI = MF->getRegInfo();
9943 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9944 }
9945
9946 return IsNullOrVectorRegister &&
9947 (canAddToBBProlog(MI) ||
9948 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
9949 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9950}
9951
9952MachineInstrBuilder
9953SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9954 MachineBasicBlock::iterator I,
9955 const DebugLoc &DL,
9956 Register DestReg) const {
9957 if (ST.hasAddNoCarryInsts())
9958 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9959
9960 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9961 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9962 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9963
9964 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9965 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9966}
9967
9968MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
9969 MachineBasicBlock::iterator I,
9970 const DebugLoc &DL,
9971 Register DestReg,
9972 RegScavenger &RS) const {
9973 if (ST.hasAddNoCarryInsts())
9974 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9975
9976 // If available, prefer to use vcc.
9977 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9978 ? Register(RI.getVCC())
9979 : RS.scavengeRegisterBackwards(
9980 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9981 0, /* AllowSpill */ false);
9982
9983 // TODO: Users need to deal with this.
9984 if (!UnusedCarry.isValid())
9985 return MachineInstrBuilder();
9986
9987 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9988 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9989}
9990
9991bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9992 switch (Opcode) {
9993 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9994 case AMDGPU::SI_KILL_I1_TERMINATOR:
9995 return true;
9996 default:
9997 return false;
9998 }
9999}
10000
10001const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
10002 switch (Opcode) {
10003 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10004 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10005 case AMDGPU::SI_KILL_I1_PSEUDO:
10006 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10007 default:
10008 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10009 }
10010}
10011
10012bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10013 return Imm <= getMaxMUBUFImmOffset(ST);
10014}
10015
10016unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
10017 // GFX12 field is non-negative 24-bit signed byte offset.
10018 const unsigned OffsetBits =
10019 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
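 // This evaluates to 0x7fffff (8388607) on GFX12 and to 0xfff (4095) on
 // earlier targets.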
10020 return (1 << OffsetBits) - 1;
10021}
10022
10023void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
10024 if (!ST.isWave32())
10025 return;
10026
10027 if (MI.isInlineAsm())
10028 return;
10029
10030 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10031 return;
10032
10033 for (auto &Op : MI.implicit_operands()) {
10034 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10035 Op.setReg(AMDGPU::VCC_LO);
10036 }
10037}
10038
10039bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
10040 if (!isSMRD(MI))
10041 return false;
10042
10043 // Check that it is using a buffer resource.
10044 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10045 if (Idx == -1) // e.g. s_memtime
10046 return false;
10047
10048 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10049 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10050}
10051
10052// Given Imm, split it into the values to put into the SOffset and ImmOffset
10053// fields in an MUBUF instruction. Return false if it is not possible (due to a
10054// hardware bug needing a workaround).
10055//
10056// The required alignment ensures that individual address components remain
10057// aligned if they are aligned to begin with. It also ensures that additional
10058// offsets within the given alignment can be added to the resulting ImmOffset.
10059bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
10060 uint32_t &ImmOffset, Align Alignment) const {
10061 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10062 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10063 uint32_t Overflow = 0;
10064
10065 if (Imm > MaxImm) {
10066 if (Imm <= MaxImm + 64) {
10067 // Use an SOffset inline constant for 4..64
10068 Overflow = Imm - MaxImm;
10069 Imm = MaxImm;
10070 } else {
10071 // Try to keep the same value in SOffset for adjacent loads, so that
10072 // the corresponding register contents can be re-used.
10073 //
10074 // Load values with all low-bits (except for alignment bits) set into
10075 // SOffset, so that a larger range of values can be covered using
10076 // s_movk_i32.
10077 //
10078 // Atomic operations fail to work correctly when individual address
10079 // components are unaligned, even if their sum is aligned.
10080 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10081 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10082 Imm = Low;
10083 Overflow = High - Alignment.value();
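 // Worked example (illustrative, pre-GFX12, MaxOffset = 4095, 4-byte
 // alignment): Imm = 5000 gives High = 4096 and Low = 908, so the caller
 // ends up with SOffset = 4092 and ImmOffset = 908, and 4092 + 908 == 5000.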
10084 }
10085 }
10086
10087 if (Overflow > 0) {
10088 // There is a hardware bug in SI and CI which prevents address clamping in
10089 // MUBUF instructions from working correctly with SOffsets. The immediate
10090 // offset is unaffected.
10091 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10092 return false;
10093
10094 // It is not possible to set an immediate in the SOffset field on some targets.
10095 if (ST.hasRestrictedSOffset())
10096 return false;
10097 }
10098
10099 ImmOffset = Imm;
10100 SOffset = Overflow;
10101 return true;
10102}
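
// ---- Editor's note: illustrative sketch, not part of SIInstrInfo.cpp. -----
// A minimal, self-contained model of the splitMUBUFOffset() arithmetic above,
// assuming the pre-GFX12 12-bit limit (MaxOffset = 4095) and a power-of-two
// Alignment; the SI/CI and restricted-SOffset bailouts are omitted, and
// alignDown is reimplemented locally instead of using llvm::alignDown.
#include <cassert>
#include <cstdint>
#include <cstdio>

static void splitMUBUFOffsetSketch(uint32_t Imm, uint32_t Alignment,
                                   uint32_t &SOffset, uint32_t &ImmOffset) {
  const uint32_t MaxOffset = 4095;                             // 12-bit field
  const uint32_t MaxImm = (MaxOffset / Alignment) * Alignment; // alignDown
  uint32_t Overflow = 0;
  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      // Small overflow can live in SOffset as an inline constant.
      Overflow = Imm - MaxImm;
      Imm = MaxImm;
    } else {
      // Keep the low bits (minus alignment bits) in the immediate so adjacent
      // accesses can reuse the same SOffset register value.
      uint32_t High = (Imm + Alignment) & ~MaxOffset;
      uint32_t Low = (Imm + Alignment) & MaxOffset;
      Imm = Low;
      Overflow = High - Alignment;
    }
  }
  ImmOffset = Imm;
  SOffset = Overflow;
}

int main() {
  uint32_t SOff = 0, ImmOff = 0;
  splitMUBUFOffsetSketch(5000, 4, SOff, ImmOff); // -> SOffset 4092, ImmOffset 908
  assert(SOff + ImmOff == 5000 && ImmOff <= 4095);
  std::printf("SOffset=%u ImmOffset=%u\n", SOff, ImmOff);
  return 0;
}
// ---------------------------------------------------------------------------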
10103
10104// Depending on the used address space and instructions, some immediate offsets
10105// are allowed and some are not.
10106// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10107// scratch instruction offsets can also be negative. On GFX12, offsets can be
10108// negative for all variants.
10109//
10110// There are several bugs related to these offsets:
10111// On gfx10.1, flat instructions that go into the global address space cannot
10112// use an offset.
10113//
10114// For scratch instructions, the address can be either an SGPR or a VGPR.
10115// The following offsets can be used, depending on the architecture (x means
10116// cannot be used):
10117// +----------------------------+------+------+
10118// | Address-Mode | SGPR | VGPR |
10119// +----------------------------+------+------+
10120// | gfx9 | | |
10121// | negative, 4-aligned offset | x | ok |
10122// | negative, unaligned offset | x | ok |
10123// +----------------------------+------+------+
10124// | gfx10 | | |
10125// | negative, 4-aligned offset | ok | ok |
10126// | negative, unaligned offset | ok | x |
10127// +----------------------------+------+------+
10128// | gfx10.3 | | |
10129// | negative, 4-aligned offset | ok | ok |
10130// | negative, unaligned offset | ok | ok |
10131// +----------------------------+------+------+
10132//
10133// This function ignores the addressing mode, so if an offset cannot be used in
10134// one addressing mode, it is considered illegal.
10135bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10136 uint64_t FlatVariant) const {
10137 // TODO: Should 0 be special cased?
10138 if (!ST.hasFlatInstOffsets())
10139 return false;
10140
10141 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10142 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10143 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10144 return false;
10145
10146 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10147 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10148 (Offset % 4) != 0) {
10149 return false;
10150 }
10151
10152 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10153 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10154 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10155}
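
// ---- Editor's note: illustrative sketch, not part of SIInstrInfo.cpp. -----
// What the final range test in isLegalFLATOffset() above reduces to once the
// hardware-bug bailouts are out of the way: the offset must fit in N signed
// bits, and must additionally be non-negative when negative offsets are not
// allowed. N = 13 is only an example; the real width comes from
// AMDGPU::getNumFlatOffsetBits(ST).
#include <cassert>
#include <cstdint>

static bool fitsFlatOffset(int64_t Offset, unsigned N, bool AllowNegative) {
  const int64_t Lo = -(int64_t(1) << (N - 1));
  const int64_t Hi = (int64_t(1) << (N - 1)) - 1;
  const bool IsIntN = Offset >= Lo && Offset <= Hi; // llvm::isIntN(N, Offset)
  return IsIntN && (AllowNegative || Offset >= 0);
}

int main() {
  assert(fitsFlatOffset(4095, 13, false));
  assert(!fitsFlatOffset(-8, 13, false));  // negative offsets not allowed here
  assert(fitsFlatOffset(-8, 13, true));
  assert(!fitsFlatOffset(4096, 13, true)); // outside the signed 13-bit range
  return 0;
}
// ---------------------------------------------------------------------------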
10156
10157// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10158std::pair<int64_t, int64_t>
10159SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10160 uint64_t FlatVariant) const {
10161 int64_t RemainderOffset = COffsetVal;
10162 int64_t ImmField = 0;
10163
10164 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10165 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10166
10167 if (AllowNegative) {
10168 // Use signed division by a power of two to truncate towards 0.
10169 int64_t D = 1LL << NumBits;
10170 RemainderOffset = (COffsetVal / D) * D;
10171 ImmField = COffsetVal - RemainderOffset;
10172
10173 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10174 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10175 (ImmField % 4) != 0) {
10176 // Make ImmField a multiple of 4
10177 RemainderOffset += ImmField % 4;
10178 ImmField -= ImmField % 4;
10179 }
10180 } else if (COffsetVal >= 0) {
10181 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10182 RemainderOffset = COffsetVal - ImmField;
10183 }
10184
10185 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10186 assert(RemainderOffset + ImmField == COffsetVal);
10187 return {ImmField, RemainderOffset};
10188}
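
// ---- Editor's note: illustrative sketch, not part of SIInstrInfo.cpp. -----
// A self-contained model of the AllowNegative path of splitFlatOffset() above:
// signed division by 2^NumBits truncates toward zero, so ImmField keeps the
// sign of the original offset and RemainderOffset + ImmField == COffsetVal.
// NumBits = 12 is an arbitrary example width; the unaligned-scratch fix-up is
// omitted.
#include <cassert>
#include <cstdint>

static void splitSignedOffset(int64_t COffsetVal, unsigned NumBits,
                              int64_t &ImmField, int64_t &RemainderOffset) {
  const int64_t D = int64_t(1) << NumBits;
  RemainderOffset = (COffsetVal / D) * D; // truncates toward 0 for negatives
  ImmField = COffsetVal - RemainderOffset;
}

int main() {
  int64_t Imm = 0, Rem = 0;
  splitSignedOffset(-5000, 12, Imm, Rem); // D = 4096
  assert(Rem == -4096 && Imm == -904 && Rem + Imm == -5000);
  splitSignedOffset(5000, 12, Imm, Rem);
  assert(Rem == 4096 && Imm == 904 && Rem + Imm == 5000);
  return 0;
}
// ---------------------------------------------------------------------------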
10189
10190bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
10191 if (ST.hasNegativeScratchOffsetBug() &&
10192 FlatVariant == SIInstrFlags::FlatScratch)
10193 return false;
10194
10195 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10196}
10197
10198static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10199 switch (ST.getGeneration()) {
10200 default:
10201 break;
10204 return SIEncodingFamily::SI;
10207 return SIEncodingFamily::VI;
10213 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10217 }
10218 llvm_unreachable("Unknown subtarget generation!");
10219}
10220
10221bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10222 switch(MCOp) {
10223 // These opcodes use indirect register addressing, so
10224 // they need special handling by codegen (currently missing).
10225 // Therefore it is too risky to allow these opcodes
10226 // to be selected by the DPP combiner or the SDWA peephole pass.
10227 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10228 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10229 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10230 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10231 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10232 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10233 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10234 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10235 return true;
10236 default:
10237 return false;
10238 }
10239}
10240
10241#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10242 case OPCODE##_dpp: \
10243 case OPCODE##_e32: \
10244 case OPCODE##_e64: \
10245 case OPCODE##_e64_dpp: \
10246 case OPCODE##_sdwa:
10247
10248static bool isRenamedInGFX9(int Opcode) {
10249 switch (Opcode) {
10250 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10251 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10252 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10253 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10254 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10255 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10256 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10257 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10258 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10259 //
10260 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10261 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10262 case AMDGPU::V_FMA_F16_gfx9_e64:
10263 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10264 case AMDGPU::V_INTERP_P2_F16:
10265 case AMDGPU::V_MAD_F16_e64:
10266 case AMDGPU::V_MAD_U16_e64:
10267 case AMDGPU::V_MAD_I16_e64:
10268 return true;
10269 default:
10270 return false;
10271 }
10272}
10273
10274int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10275 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10276 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10277
10278 unsigned Gen = subtargetEncodingFamily(ST);
10279
10280 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10281 Gen = SIEncodingFamily::GFX9;
10282
10283 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10284 // subtarget has the UnpackedD16VMem feature.
10285 // TODO: remove this when we discard GFX80 encoding.
10286 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10287 Gen = SIEncodingFamily::GFX80;
10288
10289 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10290 switch (ST.getGeneration()) {
10291 default:
10292 Gen = SIEncodingFamily::SDWA;
10293 break;
10294 case AMDGPUSubtarget::GFX9:
10295 Gen = SIEncodingFamily::SDWA9;
10296 break;
10297 case AMDGPUSubtarget::GFX10:
10298 Gen = SIEncodingFamily::SDWA10;
10299 break;
10300 }
10301 }
10302
10303 if (isMAI(Opcode)) {
10304 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10305 if (MFMAOp != -1)
10306 Opcode = MFMAOp;
10307 }
10308
10309 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10310
10311 if (MCOp == (uint16_t)-1 && ST.hasGFX1250Insts())
10313
10314 // -1 means that Opcode is already a native instruction.
10315 if (MCOp == -1)
10316 return Opcode;
10317
10318 if (ST.hasGFX90AInsts()) {
10319 uint16_t NMCOp = (uint16_t)-1;
10320 if (ST.hasGFX940Insts())
10321 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940);
10322 if (NMCOp == (uint16_t)-1)
10323 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
10324 if (NMCOp == (uint16_t)-1)
10325 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
10326 if (NMCOp != (uint16_t)-1)
10327 MCOp = NMCOp;
10328 }
10329
10330 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
10331 // no encoding in the given subtarget generation.
10332 if (MCOp == (uint16_t)-1)
10333 return -1;
10334
10335 if (isAsmOnlyOpcode(MCOp))
10336 return -1;
10337
10338 return MCOp;
10339}
10340
10341 static
10342TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
10343 assert(RegOpnd.isReg());
10344 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10345 getRegSubRegPair(RegOpnd);
10346}
10347
10348TargetInstrInfo::RegSubRegPair
10349llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
10350 assert(MI.isRegSequence());
10351 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10352 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10353 auto &RegOp = MI.getOperand(1 + 2 * I);
10354 return getRegOrUndef(RegOp);
10355 }
10356 return TargetInstrInfo::RegSubRegPair();
10357}
10358
10359 // Try to find the definition of reg:subreg in subreg-manipulation pseudos.
10360 // Following a subreg of reg:subreg isn't supported.
10361static bool followSubRegDef(MachineInstr &MI,
10362 TargetInstrInfo::RegSubRegPair &RSR) {
10363 if (!RSR.SubReg)
10364 return false;
10365 switch (MI.getOpcode()) {
10366 default: break;
10367 case AMDGPU::REG_SEQUENCE:
10368 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10369 return true;
10370 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10371 case AMDGPU::INSERT_SUBREG:
10372 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10373 // inserted the subreg we're looking for
10374 RSR = getRegOrUndef(MI.getOperand(2));
10375 else { // the subreg in the rest of the reg
10376 auto R1 = getRegOrUndef(MI.getOperand(1));
10377 if (R1.SubReg) // subreg of subreg isn't supported
10378 return false;
10379 RSR.Reg = R1.Reg;
10380 }
10381 return true;
10382 }
10383 return false;
10384}
10385
10386MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
10387 const MachineRegisterInfo &MRI) {
10388 assert(MRI.isSSA());
10389 if (!P.Reg.isVirtual())
10390 return nullptr;
10391
10392 auto RSR = P;
10393 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10394 while (auto *MI = DefInst) {
10395 DefInst = nullptr;
10396 switch (MI->getOpcode()) {
10397 case AMDGPU::COPY:
10398 case AMDGPU::V_MOV_B32_e32: {
10399 auto &Op1 = MI->getOperand(1);
10400 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10401 if (Op1.isUndef())
10402 return nullptr;
10403 RSR = getRegSubRegPair(Op1);
10404 DefInst = MRI.getVRegDef(RSR.Reg);
10405 }
10406 break;
10407 }
10408 default:
10409 if (followSubRegDef(*MI, RSR)) {
10410 if (!RSR.Reg)
10411 return nullptr;
10412 DefInst = MRI.getVRegDef(RSR.Reg);
10413 }
10414 }
10415 if (!DefInst)
10416 return MI;
10417 }
10418 return nullptr;
10419}
10420
10421bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
10422 Register VReg,
10423 const MachineInstr &DefMI,
10424 const MachineInstr &UseMI) {
10425 assert(MRI.isSSA() && "Must be run on SSA");
10426
10427 auto *TRI = MRI.getTargetRegisterInfo();
10428 auto *DefBB = DefMI.getParent();
10429
10430 // Don't bother searching between blocks, although it is possible this block
10431 // doesn't modify exec.
10432 if (UseMI.getParent() != DefBB)
10433 return true;
10434
10435 const int MaxInstScan = 20;
10436 int NumInst = 0;
10437
10438 // Stop scan at the use.
10439 auto E = UseMI.getIterator();
10440 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10441 if (I->isDebugInstr())
10442 continue;
10443
10444 if (++NumInst > MaxInstScan)
10445 return true;
10446
10447 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10448 return true;
10449 }
10450
10451 return false;
10452}
10453
10454bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
10455 Register VReg,
10456 const MachineInstr &DefMI) {
10457 assert(MRI.isSSA() && "Must be run on SSA");
10458
10459 auto *TRI = MRI.getTargetRegisterInfo();
10460 auto *DefBB = DefMI.getParent();
10461
10462 const int MaxUseScan = 10;
10463 int NumUse = 0;
10464
10465 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10466 auto &UseInst = *Use.getParent();
10467 // Don't bother searching between blocks, although it is possible this block
10468 // doesn't modify exec.
10469 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10470 return true;
10471
10472 if (++NumUse > MaxUseScan)
10473 return true;
10474 }
10475
10476 if (NumUse == 0)
10477 return false;
10478
10479 const int MaxInstScan = 20;
10480 int NumInst = 0;
10481
10482 // Stop scan when we have seen all the uses.
10483 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10484 assert(I != DefBB->end());
10485
10486 if (I->isDebugInstr())
10487 continue;
10488
10489 if (++NumInst > MaxInstScan)
10490 return true;
10491
10492 for (const MachineOperand &Op : I->operands()) {
10493 // We don't check reg masks here as they're used only on calls:
10494 // 1. EXEC is only considered const within one BB
10495 // 2. Call should be a terminator instruction if present in a BB
10496
10497 if (!Op.isReg())
10498 continue;
10499
10500 Register Reg = Op.getReg();
10501 if (Op.isUse()) {
10502 if (Reg == VReg && --NumUse == 0)
10503 return false;
10504 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10505 return true;
10506 }
10507 }
10508}
10509
10510MachineInstr *SIInstrInfo::createPHIDestinationCopy(
10511 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
10512 const DebugLoc &DL, Register Src, Register Dst) const {
10513 auto Cur = MBB.begin();
10514 if (Cur != MBB.end())
10515 do {
10516 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10517 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10518 ++Cur;
10519 } while (Cur != MBB.end() && Cur != LastPHIIt);
10520
10521 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10522 Dst);
10523}
10524
10525MachineInstr *SIInstrInfo::createPHISourceCopy(
10526 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
10527 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10528 if (InsPt != MBB.end() &&
10529 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10530 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10531 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10532 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10533 InsPt++;
10534 return BuildMI(MBB, InsPt, DL,
10535 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10536 .addReg(Src, {}, SrcSubReg)
10537 .addReg(AMDGPU::EXEC, RegState::Implicit);
10538 }
10539 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10540 Dst);
10541}
10542
10543bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10544
10545MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
10546 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
10547 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
10548 VirtRegMap *VRM) const {
10549 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10550 //
10551 // %0:sreg_32 = COPY $m0
10552 //
10553 // We explicitly chose SReg_32 for the virtual register so such a copy might
10554 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10555 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10556 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10557 // TargetInstrInfo::foldMemoryOperand() is going to try.
10558 // A similar issue also exists with spilling and reloading $exec registers.
10559 //
10560 // To prevent that, constrain the %0 register class here.
10561 if (isFullCopyInstr(MI)) {
10562 Register DstReg = MI.getOperand(0).getReg();
10563 Register SrcReg = MI.getOperand(1).getReg();
10564 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10565 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10566 MachineRegisterInfo &MRI = MF.getRegInfo();
10567 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10568 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10569 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10570 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10571 return nullptr;
10572 }
10573 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10574 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10575 return nullptr;
10576 }
10577 }
10578 }
10579
10580 return nullptr;
10581}
10582
10583unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
10584 const MachineInstr &MI,
10585 unsigned *PredCost) const {
10586 if (MI.isBundle()) {
10587 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
10588 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10589 unsigned Lat = 0, Count = 0;
10590 for (++I; I != E && I->isBundledWithPred(); ++I) {
10591 ++Count;
10592 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10593 }
10594 return Lat + Count - 1;
10595 }
10596
10597 return SchedModel.computeInstrLatency(&MI);
10598}
10599
10600const MachineOperand &
10601SIInstrInfo::getCalleeOperand(const MachineInstr &MI) const {
10602 if (const MachineOperand *CallAddrOp =
10603 getNamedOperand(MI, AMDGPU::OpName::src0))
10604 return *CallAddrOp;
10605 return TargetInstrInfo::getCalleeOperand(MI);
10606}
10607
10608InstructionUniformity
10609SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
10610 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10611 unsigned Opcode = MI.getOpcode();
10612
10613 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10614 Register Dst = MI.getOperand(0).getReg();
10615 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10616 : MI.getOperand(1).getReg();
10617 LLT DstTy = MRI.getType(Dst);
10618 LLT SrcTy = MRI.getType(Src);
10619 unsigned DstAS = DstTy.getAddressSpace();
10620 unsigned SrcAS = SrcTy.getAddressSpace();
10621 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10622 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10623 ST.hasGloballyAddressableScratch()
10624 ? InstructionUniformity::NeverUniform
10625 : InstructionUniformity::Default;
10626 };
10627
10628 // If the target supports globally addressable scratch, the mapping from
10629 // scratch memory to the flat aperture changes, so an address space cast
10630 // is no longer uniform.
10631 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10632 return HandleAddrSpaceCast(MI);
10633
10634 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10635 auto IID = GI->getIntrinsicID();
10636 if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
10637 return InstructionUniformity::NeverUniform;
10638 if (AMDGPU::isIntrinsicAlwaysUniform(IID))
10639 return InstructionUniformity::AlwaysUniform;
10640
10641 switch (IID) {
10642 case Intrinsic::amdgcn_addrspacecast_nonnull:
10643 return HandleAddrSpaceCast(MI);
10644 case Intrinsic::amdgcn_if:
10645 case Intrinsic::amdgcn_else:
10646 // FIXME: Uniform if second result
10647 break;
10648 }
10649
10650 return InstructionUniformity::Default;
10651 }
10652
10653 // Loads from the private and flat address spaces are divergent, because
10654 // threads can execute the load instruction with the same inputs and get
10655 // different results.
10656 //
10657 // All other loads are not divergent, because if threads issue loads with the
10658 // same arguments, they will always get the same result.
10659 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10660 Opcode == AMDGPU::G_SEXTLOAD) {
10661 if (MI.memoperands_empty())
10662 return InstructionUniformity::NeverUniform; // conservative assumption
10663
10664 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10665 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10666 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10667 })) {
10668 // At least one MMO in a non-global address space.
10669 return InstructionUniformity::NeverUniform;
10670 }
10671 return InstructionUniformity::Default;
10672 }
10673
10674 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10675 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10676 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10677 AMDGPU::isGenericAtomic(Opcode)) {
10678 return InstructionUniformity::NeverUniform;
10679 }
10680 return InstructionUniformity::Default;
10681}
10682
10685
10686 if (isNeverUniform(MI))
10687 return InstructionUniformity::NeverUniform;
10688
10689 unsigned opcode = MI.getOpcode();
10690 if (opcode == AMDGPU::V_READLANE_B32 ||
10691 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10692 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10693 return InstructionUniformity::AlwaysUniform;
10694
10695 if (isCopyInstr(MI)) {
10696 const MachineOperand &srcOp = MI.getOperand(1);
10697 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10698 const TargetRegisterClass *regClass =
10699 RI.getPhysRegBaseClass(srcOp.getReg());
10700 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
10701 : InstructionUniformity::NeverUniform;
10702 }
10703 return InstructionUniformity::Default;
10704 }
10705
10706 // GMIR handling
10707 if (MI.isPreISelOpcode())
10708 return SIInstrInfo::getGenericInstructionUniformity(MI);
10709
10710 // Atomics are divergent because they are executed sequentially: when an
10711 // atomic operation refers to the same address in each thread, each
10712 // thread after the first sees the value written by the previous thread as
10713 // the original value.
10714
10715 if (isAtomic(MI))
10716 return InstructionUniformity::NeverUniform;
10717
10718 // Loads from the private and flat address spaces are divergent, because
10719 // threads can execute the load instruction with the same inputs and get
10720 // different results.
10721 if (isFLAT(MI) && MI.mayLoad()) {
10722 if (MI.memoperands_empty())
10723 return InstructionUniformity::NeverUniform; // conservative assumption
10724
10725 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10726 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10727 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10728 })) {
10729 // At least one MMO in a non-global address space.
10730 return InstructionUniformity::NeverUniform;
10731 }
10732
10733 return InstructionUniformity::Default;
10734 }
10735
10736 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10737 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10738
10739 // FIXME: It's conceptually broken to report this for an instruction, and not
10740 // a specific def operand. For inline asm in particular, there could be mixed
10741 // uniform and divergent results.
10742 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10743 const MachineOperand &SrcOp = MI.getOperand(I);
10744 if (!SrcOp.isReg())
10745 continue;
10746
10747 Register Reg = SrcOp.getReg();
10748 if (!Reg || !SrcOp.readsReg())
10749 continue;
10750
10751 // If RegBank is null, this is unassigned or an unallocatable special
10752 // register, which are all scalars.
10753 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10754 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10755 return InstructionUniformity::NeverUniform;
10756 }
10757
10758 // TODO: The uniformity check conditions above can be rearranged for more
10759 // readability.
10760
10761 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10762 // currently turned into no-op COPYs by SelectionDAG ISel and are
10763 // therefore no longer recognizable.
10764
10765 return InstructionUniformity::Default;
10766}
10767
10768unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
10769 switch (MF.getFunction().getCallingConv()) {
10770 case CallingConv::AMDGPU_PS:
10771 return 1;
10772 case CallingConv::AMDGPU_VS:
10773 return 2;
10774 case CallingConv::AMDGPU_GS:
10775 return 3;
10776 case CallingConv::AMDGPU_HS:
10777 case CallingConv::AMDGPU_LS:
10778 case CallingConv::AMDGPU_ES: {
10779 const Function &F = MF.getFunction();
10780 F.getContext().diagnose(DiagnosticInfoUnsupported(
10781 F, "ds_ordered_count unsupported for this calling conv"));
10782 [[fallthrough]];
10783 }
10784 case CallingConv::AMDGPU_CS:
10785 case CallingConv::AMDGPU_KERNEL:
10786 case CallingConv::C:
10787 case CallingConv::Fast:
10788 default:
10789 // Assume other calling conventions are various compute callable functions
10790 return 0;
10791 }
10792}
10793
10794bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
10795 Register &SrcReg2, int64_t &CmpMask,
10796 int64_t &CmpValue) const {
10797 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
10798 return false;
10799
10800 switch (MI.getOpcode()) {
10801 default:
10802 break;
10803 case AMDGPU::S_CMP_EQ_U32:
10804 case AMDGPU::S_CMP_EQ_I32:
10805 case AMDGPU::S_CMP_LG_U32:
10806 case AMDGPU::S_CMP_LG_I32:
10807 case AMDGPU::S_CMP_LT_U32:
10808 case AMDGPU::S_CMP_LT_I32:
10809 case AMDGPU::S_CMP_GT_U32:
10810 case AMDGPU::S_CMP_GT_I32:
10811 case AMDGPU::S_CMP_LE_U32:
10812 case AMDGPU::S_CMP_LE_I32:
10813 case AMDGPU::S_CMP_GE_U32:
10814 case AMDGPU::S_CMP_GE_I32:
10815 case AMDGPU::S_CMP_EQ_U64:
10816 case AMDGPU::S_CMP_LG_U64:
10817 SrcReg = MI.getOperand(0).getReg();
10818 if (MI.getOperand(1).isReg()) {
10819 if (MI.getOperand(1).getSubReg())
10820 return false;
10821 SrcReg2 = MI.getOperand(1).getReg();
10822 CmpValue = 0;
10823 } else if (MI.getOperand(1).isImm()) {
10824 SrcReg2 = Register();
10825 CmpValue = MI.getOperand(1).getImm();
10826 } else {
10827 return false;
10828 }
10829 CmpMask = ~0;
10830 return true;
10831 case AMDGPU::S_CMPK_EQ_U32:
10832 case AMDGPU::S_CMPK_EQ_I32:
10833 case AMDGPU::S_CMPK_LG_U32:
10834 case AMDGPU::S_CMPK_LG_I32:
10835 case AMDGPU::S_CMPK_LT_U32:
10836 case AMDGPU::S_CMPK_LT_I32:
10837 case AMDGPU::S_CMPK_GT_U32:
10838 case AMDGPU::S_CMPK_GT_I32:
10839 case AMDGPU::S_CMPK_LE_U32:
10840 case AMDGPU::S_CMPK_LE_I32:
10841 case AMDGPU::S_CMPK_GE_U32:
10842 case AMDGPU::S_CMPK_GE_I32:
10843 SrcReg = MI.getOperand(0).getReg();
10844 SrcReg2 = Register();
10845 CmpValue = MI.getOperand(1).getImm();
10846 CmpMask = ~0;
10847 return true;
10848 }
10849
10850 return false;
10851}
10852
10854 for (MachineBasicBlock *S : MBB->successors()) {
10855 if (S->isLiveIn(AMDGPU::SCC))
10856 return false;
10857 }
10858 return true;
10859}
10860
10861// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
10862// (incoming SCC) = !(SCC defined by SCCDef).
10863 // Return true if all uses can be rewritten, false otherwise.
10864bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
10865 MachineBasicBlock *MBB = SCCDef->getParent();
10866 SmallVector<MachineInstr *> InvertInstr;
10867 bool SCCIsDead = false;
10868
10869 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
10870 constexpr unsigned ScanLimit = 12;
10871 unsigned Count = 0;
10872 for (MachineInstr &MI :
10873 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
10874 if (++Count > ScanLimit)
10875 return false;
10876 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
10877 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
10878 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
10879 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10880 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
10881 InvertInstr.push_back(&MI);
10882 else
10883 return false;
10884 }
10885 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
10886 SCCIsDead = true;
10887 break;
10888 }
10889 }
10890 if (!SCCIsDead && isSCCDeadOnExit(MBB))
10891 SCCIsDead = true;
10892
10893 // SCC may have more uses. Can't invert all of them.
10894 if (!SCCIsDead)
10895 return false;
10896
10897 // Invert uses
10898 for (MachineInstr *MI : InvertInstr) {
10899 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10900 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
10901 swapOperands(*MI);
10902 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
10903 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
10904 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
10905 ? AMDGPU::S_CBRANCH_SCC1
10906 : AMDGPU::S_CBRANCH_SCC0));
10907 } else {
10908 llvm_unreachable("SCC used but no inversion handling");
10909 }
10910 }
10911 return true;
10912}
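
// ---- Editor's note: illustrative sketch, not part of SIInstrInfo.cpp. -----
// A scalar model of the rewrite invertSCCUse() above performs: when the
// incoming SCC is the negation of the SCC the deleted definition would have
// produced, swapping the two S_CSELECT operands (or flipping SCC0/SCC1 on a
// conditional branch) preserves the selected value.
#include <cassert>

static int cselect(bool SCC, int A, int B) { return SCC ? A : B; }

int main() {
  for (bool OldSCC : {false, true}) {
    const bool NewSCC = !OldSCC; // (incoming SCC) = !(SCC defined by SCCDef)
    for (int A : {1, 5})
      for (int B : {0, 9})
        assert(cselect(OldSCC, A, B) == cselect(NewSCC, B, A));
  }
  return 0;
}
// ---------------------------------------------------------------------------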
10913
10914// SCC is already valid after SCCValid.
10915// SCCRedefine will redefine SCC to the same value already available after
10916 // SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine and
10917// update kill/dead flags if necessary.
10918bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
10919 bool NeedInversion) const {
10920 MachineInstr *KillsSCC = nullptr;
10921 if (SCCValid->getParent() != SCCRedefine->getParent())
10922 return false;
10923 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
10924 SCCRedefine->getIterator())) {
10925 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10926 return false;
10927 if (MI.killsRegister(AMDGPU::SCC, &RI))
10928 KillsSCC = &MI;
10929 }
10930 if (NeedInversion && !invertSCCUse(SCCRedefine))
10931 return false;
10932 if (MachineOperand *SccDef =
10933 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10934 SccDef->setIsDead(false);
10935 if (KillsSCC)
10936 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10937 SCCRedefine->eraseFromParent();
10938 return true;
10939}
10940
10941static bool foldableSelect(const MachineInstr &Def) {
10942 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
10943 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
10944 return false;
10945 bool Op1IsNonZeroImm =
10946 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
10947 bool Op2IsZeroImm =
10948 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
10949 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
10950 return false;
10951 return true;
10952}
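
// ---- Editor's note: illustrative sketch, not part of SIInstrInfo.cpp. -----
// Why foldableSelect() above makes a following s_cmp redundant: if
// X = (SCC ? NonZeroImm : 0), then the SCC that "s_cmp_lg_* X, 0" would
// compute is exactly the SCC the select consumed, so the compare can go.
#include <cassert>
#include <cstdint>

static int64_t cselect(bool SCC, int64_t NonZeroImm) {
  return SCC ? NonZeroImm : 0; // S_CSELECT with operands (non-zero imm), 0
}

int main() {
  for (bool SCC : {false, true}) {
    const int64_t X = cselect(SCC, 7);
    const bool RecomputedSCC = (X != 0); // what s_cmp_lg_* X, 0 produces
    assert(RecomputedSCC == SCC);
  }
  return 0;
}
// ---------------------------------------------------------------------------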
10953
10954static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
10955 unsigned &NewDefOpc) {
10956 // S_ADD_U32 X, 1 sets SCC on carry-out, which can only happen if the result is 0.
10957 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
10958 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
10959 Def.getOpcode() != AMDGPU::S_ADD_U32)
10960 return false;
10961 const MachineOperand &AddSrc1 = Def.getOperand(1);
10962 const MachineOperand &AddSrc2 = Def.getOperand(2);
10963 int64_t addend;
10964
10965 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
10966 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
10967 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
10968 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
10969 return false;
10970
10971 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
10972 const MachineOperand *SccDef =
10973 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10974 if (!SccDef->isDead())
10975 return false;
10976 NewDefOpc = AMDGPU::S_ADD_U32;
10977 }
10978 NeedInversion = !NeedInversion;
10979 return true;
10980}
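
// ---- Editor's note: illustrative sketch, not part of SIInstrInfo.cpp. -----
// The fact setsSCCIfResultIsZero() above relies on: for "X + 1" in 32 bits,
// the unsigned carry-out (what S_ADD_U32 writes to SCC) is set exactly when
// the wrapped result is 0, i.e. when X == 0xFFFFFFFF.
#include <cassert>
#include <cstdint>

static bool carryOutOfAddOne(uint32_t X, uint32_t &Result) {
  Result = X + 1;    // wraps modulo 2^32
  return Result < X; // carry-out of the 32-bit add
}

int main() {
  uint32_t R = 0;
  assert(carryOutOfAddOne(0xFFFFFFFFu, R) && R == 0); // carry <=> result == 0
  assert(!carryOutOfAddOne(41u, R) && R == 42);
  return 0;
}
// ---------------------------------------------------------------------------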
10981
10982bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
10983 Register SrcReg2, int64_t CmpMask,
10984 int64_t CmpValue,
10985 const MachineRegisterInfo *MRI) const {
10986 if (!SrcReg || SrcReg.isPhysical())
10987 return false;
10988
10989 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
10990 return false;
10991
10992 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10993 this](bool NeedInversion) -> bool {
10994 if (CmpValue != 0)
10995 return false;
10996
10997 MachineInstr *Def = MRI->getVRegDef(SrcReg);
10998 if (!Def)
10999 return false;
11000
11001 // For S_OP that set SCC = DST!=0, do the transformation
11002 //
11003 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11004 //
11005 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11006 // do the transformation:
11007 //
11008 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11009 //
11010 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11011 // for S_CSELECT* already has the same value that will be calculated by
11012 // s_cmp_lg_*
11013 //
11014 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11015 // (non-zero imm), 0)
11016
11017 unsigned NewDefOpc = Def->getOpcode();
11018 if (!setsSCCIfResultIsNonZero(*Def) &&
11019 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11020 !foldableSelect(*Def))
11021 return false;
11022
11023 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11024 return false;
11025
11026 if (NewDefOpc != Def->getOpcode())
11027 Def->setDesc(get(NewDefOpc));
11028
11029 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11030 // s_cmp_lg of a register pair) and the inputs are the hi and lo halves of a
11031 // 64-bit foldableSelect, then delete s_or_b32 in the sequence:
11032 // sX = s_cselect_b64 (non-zero imm), 0
11033 // sLo = copy sX.sub0
11034 // sHi = copy sX.sub1
11035 // sY = s_or_b32 sLo, sHi
11036 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11037 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11038 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11039 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11040 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11041 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11042 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11043 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11044 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11045 Def2->getOperand(1).isReg() &&
11046 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11047 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11048 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11049 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11050 if (Select && foldableSelect(*Select))
11051 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11052 }
11053 }
11054 }
11055 return true;
11056 };
11057
11058 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11059 this](int64_t ExpectedValue, unsigned SrcSize,
11060 bool IsReversible, bool IsSigned) -> bool {
11061 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11062 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11063 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11064 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11065 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11066 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11067 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11068 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11069 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11070 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11071 //
11072 // Signed ge/gt are not used for the sign bit.
11073 //
11074 // If result of the AND is unused except in the compare:
11075 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11076 //
11077 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11078 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11079 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11080 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11081 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11082 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11083
11084 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11085 if (!Def)
11086 return false;
11087
11088 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11089 Def->getOpcode() != AMDGPU::S_AND_B64)
11090 return false;
11091
11092 int64_t Mask;
11093 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11094 if (MO->isImm())
11095 Mask = MO->getImm();
11096 else if (!getFoldableImm(MO, Mask))
11097 return false;
11098 Mask &= maxUIntN(SrcSize);
11099 return isPowerOf2_64(Mask);
11100 };
11101
11102 MachineOperand *SrcOp = &Def->getOperand(1);
11103 if (isMask(SrcOp))
11104 SrcOp = &Def->getOperand(2);
11105 else if (isMask(&Def->getOperand(2)))
11106 SrcOp = &Def->getOperand(1);
11107 else
11108 return false;
11109
11110 // A valid Mask is required to have a single bit set, hence a non-zero and
11111 // power-of-two value. This verifies that we will not do a 64-bit shift below.
11112 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11113 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11114 if (IsSigned && BitNo == SrcSize - 1)
11115 return false;
11116
11117 ExpectedValue <<= BitNo;
11118
11119 bool IsReversedCC = false;
11120 if (CmpValue != ExpectedValue) {
11121 if (!IsReversible)
11122 return false;
11123 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11124 if (!IsReversedCC)
11125 return false;
11126 }
11127
11128 Register DefReg = Def->getOperand(0).getReg();
11129 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11130 return false;
11131
11132 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11133 return false;
11134
11135 if (!MRI->use_nodbg_empty(DefReg)) {
11136 assert(!IsReversedCC);
11137 return true;
11138 }
11139
11140 // Replace AND with unused result with a S_BITCMP.
11141 MachineBasicBlock *MBB = Def->getParent();
11142
11143 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11144 : AMDGPU::S_BITCMP1_B32
11145 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11146 : AMDGPU::S_BITCMP1_B64;
11147
11148 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11149 .add(*SrcOp)
11150 .addImm(BitNo);
11151 Def->eraseFromParent();
11152
11153 return true;
11154 };
11155
11156 switch (CmpInstr.getOpcode()) {
11157 default:
11158 break;
11159 case AMDGPU::S_CMP_EQ_U32:
11160 case AMDGPU::S_CMP_EQ_I32:
11161 case AMDGPU::S_CMPK_EQ_U32:
11162 case AMDGPU::S_CMPK_EQ_I32:
11163 return optimizeCmpAnd(1, 32, true, false) ||
11164 optimizeCmpSelect(/*NeedInversion=*/true);
11165 case AMDGPU::S_CMP_GE_U32:
11166 case AMDGPU::S_CMPK_GE_U32:
11167 return optimizeCmpAnd(1, 32, false, false);
11168 case AMDGPU::S_CMP_GE_I32:
11169 case AMDGPU::S_CMPK_GE_I32:
11170 return optimizeCmpAnd(1, 32, false, true);
11171 case AMDGPU::S_CMP_EQ_U64:
11172 return optimizeCmpAnd(1, 64, true, false);
11173 case AMDGPU::S_CMP_LG_U32:
11174 case AMDGPU::S_CMP_LG_I32:
11175 case AMDGPU::S_CMPK_LG_U32:
11176 case AMDGPU::S_CMPK_LG_I32:
11177 return optimizeCmpAnd(0, 32, true, false) ||
11178 optimizeCmpSelect(/*NeedInversion=*/false);
11179 case AMDGPU::S_CMP_GT_U32:
11180 case AMDGPU::S_CMPK_GT_U32:
11181 return optimizeCmpAnd(0, 32, false, false);
11182 case AMDGPU::S_CMP_GT_I32:
11183 case AMDGPU::S_CMPK_GT_I32:
11184 return optimizeCmpAnd(0, 32, false, true);
11185 case AMDGPU::S_CMP_LG_U64:
11186 return optimizeCmpAnd(0, 64, true, false) ||
11187 optimizeCmpSelect(/*NeedInversion=*/false);
11188 }
11189
11190 return false;
11191}
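
// ---- Editor's note: illustrative sketch, not part of SIInstrInfo.cpp. -----
// The scalar identity behind the optimizeCmpAnd lambda above: comparing
// (src & (1 << n)) against (1 << n) or against 0 is just a test of bit n,
// which is what S_BITCMP1_B32/S_BITCMP0_B32 compute directly.
#include <cassert>
#include <cstdint>

static bool bitTest(uint32_t Src, unsigned N) { return (Src >> N) & 1u; }

int main() {
  const uint32_t Src = 0xA; // 0b1010
  const unsigned N = 3;
  const uint32_t Mask = 1u << N;
  // s_cmp_eq_u32 (s_and_b32 Src, Mask), Mask  <=>  bit N of Src is 1.
  assert(((Src & Mask) == Mask) == bitTest(Src, N));
  // s_cmp_eq_u32 (s_and_b32 Src, Mask), 0     <=>  bit N of Src is 0.
  assert(((Src & Mask) == 0) == !bitTest(Src, N));
  return 0;
}
// ---------------------------------------------------------------------------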
11192
11193void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI,
11194 AMDGPU::OpName OpName) const {
11195 if (!ST.needsAlignedVGPRs())
11196 return;
11197
11198 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11199 if (OpNo < 0)
11200 return;
11201 MachineOperand &Op = MI.getOperand(OpNo);
11202 if (getOpSize(MI, OpNo) > 4)
11203 return;
11204
11205 // Add implicit aligned super-reg to force alignment on the data operand.
11206 const DebugLoc &DL = MI.getDebugLoc();
11207 MachineBasicBlock *BB = MI.getParent();
11208 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11209 Register DataReg = Op.getReg();
11210 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11211 Register Undef = MRI.createVirtualRegister(
11212 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11213 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11214 Register NewVR =
11215 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11216 : &AMDGPU::VReg_64_Align2RegClass);
11217 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11218 .addReg(DataReg, {}, Op.getSubReg())
11219 .addImm(AMDGPU::sub0)
11220 .addReg(Undef)
11221 .addImm(AMDGPU::sub1);
11222 Op.setReg(NewVR);
11223 Op.setSubReg(AMDGPU::sub0);
11224 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11225}
11226
11227bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
11228 if (isIGLP(*MI))
11229 return false;
11230
11231 return TargetInstrInfo::isGlobalMemoryObject(MI);
11232}
11233
11234bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
11235 if (!isWMMA(MI) && !isSWMMAC(MI))
11236 return false;
11237
11238 if (ST.hasGFX1250Insts())
11239 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11240
11241 return true;
11242}
11243
11244bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
11245 unsigned Opcode = MI.getOpcode();
11246
11247 if (AMDGPU::isGFX12Plus(ST))
11248 return isDOT(MI) || isXDLWMMA(MI);
11249
11250 if (!isMAI(MI) || isDGEMM(Opcode) ||
11251 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11252 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11253 return false;
11254
11255 if (!ST.hasGFX940Insts())
11256 return true;
11257
11258 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11259}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:144
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1571
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx; it returns a pointer to the MachineOperand rather than an inde...
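A hedged sketch of how these MachineInstr accessors are typically combined; MI, TRI and SomeReg are assumed to be in scope:
  // Walk the explicit operands and query the memory side of MI.
  for (MachineOperand &MO : MI.explicit_operands())
    (void)MO; // e.g. check MO.isReg(), MO.isImm(), ...

  if (MI.mayLoadOrStore() && MI.hasOneMemOperand()) {
    const MachineMemOperand *MMO = *MI.memoperands_begin();
    (void)MMO; // e.g. MMO->isLoad(), MMO->getAlign(), ...
  }

  // Find (and update) the operand that defines SomeReg, if any.
  int DefIdx = MI.findRegisterDefOperandIdx(SomeReg, TRI);
  if (DefIdx != -1)
    MI.getOperand(DefIdx).setIsDead(false);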
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
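A hedged sketch of the usual dispatch over a MachineOperand; MO is assumed to be in scope:
  if (MO.isReg()) {
    Register R = MO.getReg();
    if (R.isVirtual() && !MO.isDef())
      MO.setIsKill(false);             // drop the kill flag on a virtual-reg use
  } else if (MO.isImm()) {
    MO.setImm(MO.getImm() & 0xffff);   // e.g. mask an immediate in place
  } else if (MO.isFI()) {
    // frame-index operands: see ChangeToFrameIndex()/ChangeToImmediate() above
  }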
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const final
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
bool isAlwaysGDS(uint16_t Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
static bool usesLGKM_CNT(const MachineInstr &MI)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst for 16-bit SALU to VALU lowering.
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT-encoded instruction with the giv...
static bool isWWMRegSpillOpcode(uint16_t Opcode)
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
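A hedged sketch of how a pass typically consults the SIInstrInfo helpers above; TII and MI are assumed to be in scope, and AMDGPU::OpName::offset is only an example operand name:
  if (SIInstrInfo::isFLAT(MI)) {
    // Named-operand lookup returns nullptr when the operand is absent.
    if (const MachineOperand *Off =
            TII->getNamedOperand(MI, AMDGPU::OpName::offset)) {
      int64_t Imm = Off->getImm();
      if (!TII->isLegalFLATOffset(Imm, AMDGPUAS::FLAT_ADDRESS,
                                  SIInstrFlags::FLAT)) {
        // Out-of-range offsets are usually re-split, e.g. via splitFlatOffset().
      }
    }
  }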
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
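These address-space enumerators are what the memory-classification helpers compare against. A hedged sketch, assuming MI is in scope and carries a single memory operand:
  if (MI.hasOneMemOperand()) {
    unsigned AS = (*MI.memoperands_begin())->getAddrSpace();
    bool IsFlat    = AS == AMDGPUAS::FLAT_ADDRESS;
    bool IsLDS     = AS == AMDGPUAS::LOCAL_ADDRESS ||
                     AS == AMDGPUAS::REGION_ADDRESS;
    bool IsScratch = AS == AMDGPUAS::PRIVATE_ADDRESS;
    (void)IsFlat; (void)IsLDS; (void)IsScratch; // classification only
  }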
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:233
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:203
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:210
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:226
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:207
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:202
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:209
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:218
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:206
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:227
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:238
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:239
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:214
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:250
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:205
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:223
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:219
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:225
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:244
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:215
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:240
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:222
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:204
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:230
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:587
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:589
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:586
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:588
@ TI_CONSTDATA_START
Definition AMDGPU.h:585
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
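A hedged illustration of the inline-constant queries above; ST is assumed to be a GCNSubtarget reference and Imm a candidate literal:
  // Inline constants never occupy the constant bus or a literal slot.
  int32_t Imm = 64;
  bool Inline32 = AMDGPU::isInlinableLiteral32(Imm, ST.hasInv2PiInlineImm());
  bool Inline64 =
      AMDGPU::isInlinableLiteral64(int64_t(Imm), ST.hasInv2PiInlineImm());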
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
constexpr RegState getKillRegState(bool B)
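The RegState flags above feed addReg/addDef; a hedged sketch, assuming MBB, I, DL, TII, DstReg, SrcReg and IsKill are in scope and the opcode is illustrative:
  // Copy SrcReg into DstReg, marking the source killed when IsKill is true.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addReg(SrcReg, getKillRegState(IsKill));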
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew mod Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:202
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
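A hedged sketch of the bit-manipulation helpers above, using plain integer values:
  uint64_t Off = 0x12345;
  bool FitsS13 = isInt<13>(static_cast<int64_t>(Off)); // signed 13-bit check
  bool FitsU12 = isUInt<12>(Off);                      // unsigned 12-bit check
  uint64_t Aligned = alignDown(Off, 4);                // round down to 4 bytes
  uint64_t Low12 = Off & maskTrailingOnes<uint64_t>(12);
  int64_t Sext = SignExtend64<21>(Off);                // treat bit 20 as the sign bit
  uint32_t Hi = Hi_32(Off), Lo = Lo_32(Off);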
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr RegState getUndefRegState(bool B)
GenericCycleInfo< MachineSSAContext > MachineCycleInfo
MachineCycleInfo::CycleT MachineCycle
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:56
MachineInstr * top() const
Definition SIInstrInfo.h:61
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:80
void insert(MachineInstr *MI)
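SIInstrWorklist is the driver for moveToVALU; a hedged sketch, assuming TII, MDT and Inst are in scope:
  // Queue a scalar instruction whose result became divergent and let
  // moveToVALU rewrite it (and its users) with vector ALU opcodes.
  SIInstrWorklist Worklist;
  Worklist.insert(&Inst);
  TII->moveToVALU(Worklist, MDT);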
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.