SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm {
44namespace AMDGPU {
45#define GET_D16ImageDimIntrinsics_IMPL
46#define GET_ImageDimIntrinsicTable_IMPL
47#define GET_RsrcIntrinsics_IMPL
48#include "AMDGPUGenSearchableTables.inc"
49}
50}
51
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
56static cl::opt<unsigned>
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
60static cl::opt<bool> Fix16BitCopies(
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
64 cl::ReallyHidden);
65
66SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
67 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally a VALU use of exec would block rematerialization, but it is
132 // OK in this case to have an implicit exec read, as all VALU instructions do.
133 // We really want all of the generic logic for this except for this check.
134
135 // Another potential implicit use is the mode register. The core logic of
136 // the RA will not attempt rematerialization if the mode is set anywhere
137 // in the function; otherwise it is safe, since the mode is not changed.
138
139 // There is a difference from the generic method, which does not allow
140 // rematerialization if there are virtual register uses. We allow this,
141 // therefore this method includes SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152static bool resultDependsOnExec(const MachineInstr &MI) {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
157 Register DstReg = MI.getOperand(0).getReg();
158 if (!DstReg.isVirtual())
159 return true;
160 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
161 switch (Use.getOpcode()) {
162 case AMDGPU::S_AND_SAVEEXEC_B32:
163 case AMDGPU::S_AND_SAVEEXEC_B64:
164 break;
165 case AMDGPU::S_AND_B32:
166 case AMDGPU::S_AND_B64:
167 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
168 return true;
169 break;
170 default:
171 return true;
172 }
173 }
174 return false;
175 }
176
177 switch (MI.getOpcode()) {
178 default:
179 break;
180 case AMDGPU::V_READFIRSTLANE_B32:
181 return true;
182 }
183
184 return false;
185}
186
187bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
188 // Any implicit use of exec by VALU is not a real register read.
189 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
190 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
191}
192
193bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
194 MachineBasicBlock *SuccToSinkTo,
195 MachineCycleInfo *CI) const {
196 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
197 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
198 return true;
199
200 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
201 // Check if sinking of MI would create temporal divergent use.
202 for (auto Op : MI.uses()) {
203 if (Op.isReg() && Op.getReg().isVirtual() &&
204 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
205 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
206
207 // SgprDef defined inside cycle
208 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
209 if (FromCycle == nullptr)
210 continue;
211
212 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
213 // Check if there is a FromCycle that contains SgprDef's basic block but
214 // does not contain SuccToSinkTo and also has divergent exit condition.
215 while (FromCycle && !FromCycle->contains(ToCycle)) {
216 // After structurize-cfg, there should be exactly one cycle exit.
217 SmallVector<MachineBasicBlock *, 1> ExitBlocks;
218 FromCycle->getExitBlocks(ExitBlocks);
219 assert(ExitBlocks.size() == 1);
220 assert(ExitBlocks[0]->getSinglePredecessor());
221
222 // FromCycle has divergent exit condition.
223 if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
224 return false;
225 }
226
227 FromCycle = FromCycle->getParentCycle();
228 }
229 }
230 }
231
232 return true;
233}
234
235bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
236 int64_t &Offset0,
237 int64_t &Offset1) const {
238 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
239 return false;
240
241 unsigned Opc0 = Load0->getMachineOpcode();
242 unsigned Opc1 = Load1->getMachineOpcode();
243
244 // Make sure both are actually loads.
245 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
246 return false;
247
248 // A mayLoad instruction without a def is not a load. Likely a prefetch.
249 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
250 return false;
251
252 if (isDS(Opc0) && isDS(Opc1)) {
253
254 // FIXME: Handle this case:
255 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
256 return false;
257
258 // Check base reg.
259 if (Load0->getOperand(0) != Load1->getOperand(0))
260 return false;
261
262 // Skip read2 / write2 variants for simplicity.
263 // TODO: We should report true if the used offsets are adjacent (excluding
264 // st64 versions).
265 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
266 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
267 if (Offset0Idx == -1 || Offset1Idx == -1)
268 return false;
269
270 // XXX - be careful of dataless loads
271 // getNamedOperandIdx returns the index for MachineInstrs. Since they
272 // include the output in the operand list, but SDNodes don't, we need to
273 // subtract the index by one.
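 // Illustrative note (not in the original source): for DS_READ_B32 the
 // MachineInstr operands are (vdst, addr, offset, gds), so an offset index of
 // 2 becomes 1 on the MachineSDNode, which carries no vdst operand.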
274 Offset0Idx -= get(Opc0).NumDefs;
275 Offset1Idx -= get(Opc1).NumDefs;
276 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
277 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
278 return true;
279 }
280
281 if (isSMRD(Opc0) && isSMRD(Opc1)) {
282 // Skip time and cache invalidation instructions.
283 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
284 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
285 return false;
286
287 unsigned NumOps = getNumOperandsNoGlue(Load0);
288 if (NumOps != getNumOperandsNoGlue(Load1))
289 return false;
290
291 // Check base reg.
292 if (Load0->getOperand(0) != Load1->getOperand(0))
293 return false;
294
295 // Match register offsets, if both register and immediate offsets present.
296 assert(NumOps == 4 || NumOps == 5);
297 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
298 return false;
299
300 const ConstantSDNode *Load0Offset =
301 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
302 const ConstantSDNode *Load1Offset =
303 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
304
305 if (!Load0Offset || !Load1Offset)
306 return false;
307
308 Offset0 = Load0Offset->getZExtValue();
309 Offset1 = Load1Offset->getZExtValue();
310 return true;
311 }
312
313 // MUBUF and MTBUF can access the same addresses.
314 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
315
316 // MUBUF and MTBUF have vaddr at different indices.
317 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
318 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
319 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
320 return false;
321
322 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
323 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
324
325 if (OffIdx0 == -1 || OffIdx1 == -1)
326 return false;
327
328 // getNamedOperandIdx returns the index for MachineInstrs. Since they
329 // include the output in the operand list, but SDNodes don't, we need to
330 // subtract the index by one.
331 OffIdx0 -= get(Opc0).NumDefs;
332 OffIdx1 -= get(Opc1).NumDefs;
333
334 SDValue Off0 = Load0->getOperand(OffIdx0);
335 SDValue Off1 = Load1->getOperand(OffIdx1);
336
337 // The offset might be a FrameIndexSDNode.
338 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
339 return false;
340
341 Offset0 = Off0->getAsZExtVal();
342 Offset1 = Off1->getAsZExtVal();
343 return true;
344 }
345
346 return false;
347}
348
349static bool isStride64(unsigned Opc) {
350 switch (Opc) {
351 case AMDGPU::DS_READ2ST64_B32:
352 case AMDGPU::DS_READ2ST64_B64:
353 case AMDGPU::DS_WRITE2ST64_B32:
354 case AMDGPU::DS_WRITE2ST64_B64:
355 return true;
356 default:
357 return false;
358 }
359}
360
361bool SIInstrInfo::getMemOperandsWithOffsetWidth(
362 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
363 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
364 const TargetRegisterInfo *TRI) const {
365 if (!LdSt.mayLoadOrStore())
366 return false;
367
368 unsigned Opc = LdSt.getOpcode();
369 OffsetIsScalable = false;
370 const MachineOperand *BaseOp, *OffsetOp;
371 int DataOpIdx;
372
373 if (isDS(LdSt)) {
374 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
375 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
376 if (OffsetOp) {
377 // Normal, single offset LDS instruction.
378 if (!BaseOp) {
379 // DS_CONSUME/DS_APPEND use M0 for the base address.
380 // TODO: find the implicit use operand for M0 and use that as BaseOp?
381 return false;
382 }
383 BaseOps.push_back(BaseOp);
384 Offset = OffsetOp->getImm();
385 // Get appropriate operand, and compute width accordingly.
386 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
387 if (DataOpIdx == -1)
388 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
389 Width = getOpSize(LdSt, DataOpIdx);
390 } else {
391 // The 2 offset instructions use offset0 and offset1 instead. We can treat
392 // these as a load with a single offset if the 2 offsets are consecutive.
393 // We will use this for some partially aligned loads.
394 const MachineOperand *Offset0Op =
395 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
396 const MachineOperand *Offset1Op =
397 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
398
399 unsigned Offset0 = Offset0Op->getImm() & 0xff;
400 unsigned Offset1 = Offset1Op->getImm() & 0xff;
401 if (Offset0 + 1 != Offset1)
402 return false;
403
404 // Each of these offsets is in element sized units, so we need to convert
405 // to bytes of the individual reads.
406
407 unsigned EltSize;
408 if (LdSt.mayLoad())
409 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
410 else {
411 assert(LdSt.mayStore());
412 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
413 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
414 }
415
416 if (isStride64(Opc))
417 EltSize *= 64;
418
419 BaseOps.push_back(BaseOp);
420 Offset = EltSize * Offset0;
421 // Get appropriate operand(s), and compute width accordingly.
422 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
423 if (DataOpIdx == -1) {
424 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
425 Width = getOpSize(LdSt, DataOpIdx);
426 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
427 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
428 } else {
429 Width = getOpSize(LdSt, DataOpIdx);
430 }
431 }
432 return true;
433 }
434
435 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
436 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
437 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
438 return false;
439 BaseOps.push_back(RSrc);
440 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
441 if (BaseOp && !BaseOp->isFI())
442 BaseOps.push_back(BaseOp);
443 const MachineOperand *OffsetImm =
444 getNamedOperand(LdSt, AMDGPU::OpName::offset);
445 Offset = OffsetImm->getImm();
446 const MachineOperand *SOffset =
447 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
448 if (SOffset) {
449 if (SOffset->isReg())
450 BaseOps.push_back(SOffset);
451 else
452 Offset += SOffset->getImm();
453 }
454 // Get appropriate operand, and compute width accordingly.
455 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
456 if (DataOpIdx == -1)
457 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
458 if (DataOpIdx == -1) // LDS DMA
459 return false;
460 Width = getOpSize(LdSt, DataOpIdx);
461 return true;
462 }
463
464 if (isMIMG(LdSt)) {
465 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
466 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
467 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
468 if (VAddr0Idx >= 0) {
469 // GFX10 possible NSA encoding.
470 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
471 BaseOps.push_back(&LdSt.getOperand(I));
472 } else {
473 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
474 }
475 Offset = 0;
476 // Get appropriate operand, and compute width accordingly.
477 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
478 Width = getOpSize(LdSt, DataOpIdx);
479 return true;
480 }
481
482 if (isSMRD(LdSt)) {
483 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
484 if (!BaseOp) // e.g. S_MEMTIME
485 return false;
486 BaseOps.push_back(BaseOp);
487 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
488 Offset = OffsetOp ? OffsetOp->getImm() : 0;
489 // Get appropriate operand, and compute width accordingly.
490 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
491 if (DataOpIdx == -1)
492 return false;
493 Width = getOpSize(LdSt, DataOpIdx);
494 return true;
495 }
496
497 if (isFLAT(LdSt)) {
498 // Instructions have either vaddr or saddr or both or none.
499 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
500 if (BaseOp)
501 BaseOps.push_back(BaseOp);
502 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
503 if (BaseOp)
504 BaseOps.push_back(BaseOp);
505 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
506 // Get appropriate operand, and compute width accordingly.
507 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
508 if (DataOpIdx == -1)
509 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
510 if (DataOpIdx == -1) // LDS DMA
511 return false;
512 Width = getOpSize(LdSt, DataOpIdx);
513 return true;
514 }
515
516 return false;
517}
518
519static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
520 ArrayRef<const MachineOperand *> BaseOps1,
521 const MachineInstr &MI2,
522 ArrayRef<const MachineOperand *> BaseOps2) {
523 // Only examine the first "base" operand of each instruction, on the
524 // assumption that it represents the real base address of the memory access.
525 // Other operands are typically offsets or indices from this base address.
526 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
527 return true;
528
529 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
530 return false;
531
532 auto MO1 = *MI1.memoperands_begin();
533 auto MO2 = *MI2.memoperands_begin();
534 if (MO1->getAddrSpace() != MO2->getAddrSpace())
535 return false;
536
537 auto Base1 = MO1->getValue();
538 auto Base2 = MO2->getValue();
539 if (!Base1 || !Base2)
540 return false;
541 Base1 = getUnderlyingObject(Base1);
542 Base2 = getUnderlyingObject(Base2);
543
544 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
545 return false;
546
547 return Base1 == Base2;
548}
549
550bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
551 int64_t Offset1, bool OffsetIsScalable1,
552 ArrayRef<const MachineOperand *> BaseOps2,
553 int64_t Offset2, bool OffsetIsScalable2,
554 unsigned ClusterSize,
555 unsigned NumBytes) const {
556 // If the mem ops (to be clustered) do not have the same base ptr, then they
557 // should not be clustered
558 if (!BaseOps1.empty() && !BaseOps2.empty()) {
559 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
560 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
561 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
562 return false;
563 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
564 // If only one base op is empty, they do not have the same base ptr
565 return false;
566 }
567
568 // In order to avoid register pressure, on average, the number of DWORDs
569 // loaded together by all clustered mem ops should not exceed 8. This is an
570 // empirical value based on certain observations and performance related
571 // experiments.
572 // The good thing about this heuristic is that it avoids clustering too many
573 // sub-word loads and also avoids clustering wide loads. Below is a
574 // brief summary of how the heuristic behaves for various `LoadSize`.
575 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
576 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
577 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
578 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
579 // (5) LoadSize >= 17: do not cluster
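 // Worked example of the check below (illustrative, not from the original
 // source): clustering two 12-byte loads gives LoadSize = 24 / 2 = 12 and
 // NumDWORDs = ((12 + 3) / 4) * 2 = 6 <= 8, so they may be clustered; adding a
 // third gives NumDWORDs = 3 * 3 = 9 > 8, which stops clustering (case 3).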
580 const unsigned LoadSize = NumBytes / ClusterSize;
581 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
582 return NumDWORDs <= 8;
583}
584
585// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
586// the first 16 loads will be interleaved with the stores, and the next 16 will
587 // be clustered as expected. It should really split into two batches of 16 stores.
588//
589// Loads are clustered until this returns false, rather than trying to schedule
590// groups of stores. This also means we have to deal with saying different
591// address space loads should be clustered, and ones which might cause bank
592// conflicts.
593//
594// This might be deprecated so it might not be worth that much effort to fix.
595bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
596 int64_t Offset0, int64_t Offset1,
597 unsigned NumLoads) const {
598 assert(Offset1 > Offset0 &&
599 "Second offset should be larger than first offset!");
600 // If we have less than 16 loads in a row, and the offsets are within 64
601 // bytes, then schedule together.
602
603 // A cacheline is 64 bytes (for global memory).
604 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
605}
606
607static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
608 MachineBasicBlock::iterator MI,
609 const DebugLoc &DL, MCRegister DestReg,
610 MCRegister SrcReg, bool KillSrc,
611 const char *Msg = "illegal VGPR to SGPR copy") {
612 MachineFunction *MF = MBB.getParent();
613 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
614 LLVMContext &C = MF->getFunction().getContext();
615 C.diagnose(IllegalCopy);
616
617 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
618 .addReg(SrcReg, getKillRegState(KillSrc));
619}
620
621/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
622/// possible to have a direct copy in these cases on GFX908, so an intermediate
623/// VGPR copy is required.
624static void indirectCopyToAGPR(const SIInstrInfo &TII,
625 MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 RegScavenger &RS, bool RegsOverlap,
630 Register ImpDefSuperReg = Register(),
631 Register ImpUseSuperReg = Register()) {
632 assert((TII.getSubtarget().hasMAIInsts() &&
633 !TII.getSubtarget().hasGFX90AInsts()) &&
634 "Expected GFX908 subtarget.");
635
636 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
637 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
638 "Source register of the copy should be either an SGPR or an AGPR.");
639
640 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
641 "Destination register of the copy should be an AGPR.");
642
643 const SIRegisterInfo &RI = TII.getRegisterInfo();
644
645 // First try to find defining accvgpr_write to avoid temporary registers.
646 // In the case of copies of overlapping AGPRs, we conservatively do not
647 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
648 // an accvgpr_write used for this same copy due to implicit-defs
649 if (!RegsOverlap) {
650 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
651 --Def;
652
653 if (!Def->modifiesRegister(SrcReg, &RI))
654 continue;
655
656 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
657 Def->getOperand(0).getReg() != SrcReg)
658 break;
659
660 MachineOperand &DefOp = Def->getOperand(1);
661 assert(DefOp.isReg() || DefOp.isImm());
662
663 if (DefOp.isReg()) {
664 bool SafeToPropagate = true;
665 // Check that register source operand is not clobbered before MI.
666 // Immediate operands are always safe to propagate.
667 for (auto I = Def; I != MI && SafeToPropagate; ++I)
668 if (I->modifiesRegister(DefOp.getReg(), &RI))
669 SafeToPropagate = false;
670
671 if (!SafeToPropagate)
672 break;
673
674 DefOp.setIsKill(false);
675 }
676
677 MachineInstrBuilder Builder =
678 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
679 .add(DefOp);
680 if (ImpDefSuperReg)
681 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
682
683 if (ImpUseSuperReg) {
684 Builder.addReg(ImpUseSuperReg,
685 getKillRegState(KillSrc) | RegState::Implicit);
686 }
687
688 return;
689 }
690 }
691
692 RS.enterBasicBlockEnd(MBB);
693 RS.backward(std::next(MI));
694
695 // Ideally we want to have three registers for a long reg_sequence copy
696 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
697 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
698 *MBB.getParent());
699
700 // Registers in the sequence are allocated contiguously so we can just
701 // use register number to pick one of three round-robin temps.
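 // Illustrative note (not in the original source): for a copy whose
 // destination is AGPR5, RegNo = 5 % 3 = 2, so the loop below tries up to two
 // scavenged VGPRs and keeps the reserved AGPR-copy VGPR if none are free.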
702 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
703 Register Tmp =
704 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
706 "VGPR used for an intermediate copy should have been reserved.");
707
708 // Only loop through if there are any free registers left. We don't want to
709 // spill.
710 while (RegNo--) {
711 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
712 /* RestoreAfter */ false, 0,
713 /* AllowSpill */ false);
714 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
715 break;
716 Tmp = Tmp2;
717 RS.setRegUsed(Tmp);
718 }
719
720 // Insert copy to temporary VGPR.
721 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
722 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
723 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
724 } else {
725 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
726 }
727
728 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
729 .addReg(SrcReg, getKillRegState(KillSrc));
730 if (ImpUseSuperReg) {
731 UseBuilder.addReg(ImpUseSuperReg,
732 getKillRegState(KillSrc) | RegState::Implicit);
733 }
734
735 MachineInstrBuilder DefBuilder
736 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
737 .addReg(Tmp, RegState::Kill);
738
739 if (ImpDefSuperReg)
740 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
741}
742
743static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
744 MachineBasicBlock::iterator MI, const DebugLoc &DL,
745 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
746 const TargetRegisterClass *RC, bool Forward) {
747 const SIRegisterInfo &RI = TII.getRegisterInfo();
748 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
749 MachineBasicBlock::iterator I = MI;
750 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
751
752 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
753 int16_t SubIdx = BaseIndices[Idx];
754 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
755 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
756 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
757 unsigned Opcode = AMDGPU::S_MOV_B32;
758
759 // Is SGPR aligned? If so try to combine with next.
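 // Illustrative example (not in the original source): copying s[4:7] is
 // emitted as two S_MOV_B64 copies, s[4:5] and s[6:7]; when either side is not
 // 64-bit aligned, the loop falls back to individual S_MOV_B32 copies.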
760 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
761 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
762 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
763 // Can use SGPR64 copy
764 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
765 SubIdx = RI.getSubRegFromChannel(Channel, 2);
766 DestSubReg = RI.getSubReg(DestReg, SubIdx);
767 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
768 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
769 Opcode = AMDGPU::S_MOV_B64;
770 Idx++;
771 }
772
773 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
774 .addReg(SrcSubReg)
775 .addReg(SrcReg, RegState::Implicit);
776
777 if (!FirstMI)
778 FirstMI = LastMI;
779
780 if (!Forward)
781 I--;
782 }
783
784 assert(FirstMI && LastMI);
785 if (!Forward)
786 std::swap(FirstMI, LastMI);
787
788 FirstMI->addOperand(
789 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
790
791 if (KillSrc)
792 LastMI->addRegisterKilled(SrcReg, &RI);
793}
794
795void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
796 MachineBasicBlock::iterator MI,
797 const DebugLoc &DL, MCRegister DestReg,
798 MCRegister SrcReg, bool KillSrc) const {
799 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
800 unsigned Size = RI.getRegSizeInBits(*RC);
801 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
802 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
803
804 // The rest of copyPhysReg assumes Src and Dst size are the same size.
805 // TODO-GFX11_16BIT: if all true 16-bit instruction patterns are completed,
806 // can we remove Fix16BitCopies and this code block?
807 if (Fix16BitCopies) {
808 if (((Size == 16) != (SrcSize == 16))) {
809 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
810 assert(ST.hasTrue16BitInsts());
811 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
812 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
813 RegToFix = SubReg;
814
815 if (DestReg == SrcReg) {
816 // Identity copy. Insert empty bundle since ExpandPostRA expects an
817 // instruction here.
818 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
819 return;
820 }
821 RC = RI.getPhysRegBaseClass(DestReg);
822 Size = RI.getRegSizeInBits(*RC);
823 SrcRC = RI.getPhysRegBaseClass(SrcReg);
824 SrcSize = RI.getRegSizeInBits(*SrcRC);
825 }
826 }
827
828 if (RC == &AMDGPU::VGPR_32RegClass) {
829 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
830 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
831 AMDGPU::AGPR_32RegClass.contains(SrcReg));
832 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
833 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
834 BuildMI(MBB, MI, DL, get(Opc), DestReg)
835 .addReg(SrcReg, getKillRegState(KillSrc));
836 return;
837 }
838
839 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
840 RC == &AMDGPU::SReg_32RegClass) {
841 if (SrcReg == AMDGPU::SCC) {
842 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
843 .addImm(1)
844 .addImm(0);
845 return;
846 }
847
848 if (DestReg == AMDGPU::VCC_LO) {
849 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
850 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
851 .addReg(SrcReg, getKillRegState(KillSrc));
852 } else {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
858 }
859
860 return;
861 }
862
863 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
864 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
865 return;
866 }
867
868 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
869 .addReg(SrcReg, getKillRegState(KillSrc));
870 return;
871 }
872
873 if (RC == &AMDGPU::SReg_64RegClass) {
874 if (SrcReg == AMDGPU::SCC) {
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
876 .addImm(1)
877 .addImm(0);
878 return;
879 }
880
881 if (DestReg == AMDGPU::VCC) {
882 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 } else {
886 // FIXME: Hack until VReg_1 removed.
887 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
888 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
889 .addImm(0)
890 .addReg(SrcReg, getKillRegState(KillSrc));
891 }
892
893 return;
894 }
895
896 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
897 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
898 return;
899 }
900
901 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 if (DestReg == AMDGPU::SCC) {
907 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
908 // but SelectionDAG emits such copies for i1 sources.
909 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
910 // This copy can only be produced by patterns
911 // with explicit SCC, which are known to be enabled
912 // only for subtargets with S_CMP_LG_U64 present.
913 assert(ST.hasScalarCompareEq64());
914 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
915 .addReg(SrcReg, getKillRegState(KillSrc))
916 .addImm(0);
917 } else {
918 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
919 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
920 .addReg(SrcReg, getKillRegState(KillSrc))
921 .addImm(0);
922 }
923
924 return;
925 }
926
927 if (RC == &AMDGPU::AGPR_32RegClass) {
928 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
929 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
930 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
931 .addReg(SrcReg, getKillRegState(KillSrc));
932 return;
933 }
934
935 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
936 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
937 .addReg(SrcReg, getKillRegState(KillSrc));
938 return;
939 }
940
941 // FIXME: Pass should maintain scavenger to avoid scan through the block on
942 // every AGPR spill.
943 RegScavenger RS;
944 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
945 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
946 return;
947 }
948
949 if (Size == 16) {
950 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
951 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
952 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
953
954 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
955 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
956 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
957 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
958 bool DstLow = !AMDGPU::isHi(DestReg, RI);
959 bool SrcLow = !AMDGPU::isHi(SrcReg, RI);
960 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
961 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
962
963 if (IsSGPRDst) {
964 if (!IsSGPRSrc) {
965 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
966 return;
967 }
968
969 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
970 .addReg(NewSrcReg, getKillRegState(KillSrc));
971 return;
972 }
973
974 if (IsAGPRDst || IsAGPRSrc) {
975 if (!DstLow || !SrcLow) {
976 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
977 "Cannot use hi16 subreg with an AGPR!");
978 }
979
980 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
981 return;
982 }
983
984 if (ST.hasTrue16BitInsts()) {
985 if (IsSGPRSrc) {
986 assert(SrcLow);
987 SrcReg = NewSrcReg;
988 }
989 // Use the smaller instruction encoding if possible.
990 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
991 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
992 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
993 .addReg(SrcReg);
994 } else {
995 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
996 .addImm(0) // src0_modifiers
997 .addReg(SrcReg)
998 .addImm(0); // op_sel
999 }
1000 return;
1001 }
1002
1003 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1004 if (!DstLow || !SrcLow) {
1005 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1006 "Cannot use hi16 subreg on VI!");
1007 }
1008
1009 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1010 .addReg(NewSrcReg, getKillRegState(KillSrc));
1011 return;
1012 }
1013
1014 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1015 .addImm(0) // src0_modifiers
1016 .addReg(NewSrcReg)
1017 .addImm(0) // clamp
1018 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1019 : AMDGPU::SDWA::SdwaSel::WORD_1)
1020 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1021 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1022 : AMDGPU::SDWA::SdwaSel::WORD_1)
1023 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1024 // First implicit operand is $exec.
1025 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1026 return;
1027 }
1028
1029 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1030 if (ST.hasMovB64()) {
1031 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1032 .addReg(SrcReg, getKillRegState(KillSrc));
1033 return;
1034 }
1035 if (ST.hasPkMovB32()) {
1036 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1037 .addImm(SISrcMods::OP_SEL_1)
1038 .addReg(SrcReg)
1039 .addImm(SISrcMods::OP_SEL_1)
1040 .addReg(SrcReg)
1041 .addImm(0) // op_sel_lo
1042 .addImm(0) // op_sel_hi
1043 .addImm(0) // neg_lo
1044 .addImm(0) // neg_hi
1045 .addImm(0) // clamp
1046 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1047 return;
1048 }
1049 }
1050
1051 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1052 if (RI.isSGPRClass(RC)) {
1053 if (!RI.isSGPRClass(SrcRC)) {
1054 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1055 return;
1056 }
1057 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1058 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1059 Forward);
1060 return;
1061 }
1062
1063 unsigned EltSize = 4;
1064 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1065 if (RI.isAGPRClass(RC)) {
1066 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1067 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1068 else if (RI.hasVGPRs(SrcRC) ||
1069 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1070 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1071 else
1072 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1073 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1074 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1075 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1076 (RI.isProperlyAlignedRC(*RC) &&
1077 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1078 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1079 if (ST.hasMovB64()) {
1080 Opcode = AMDGPU::V_MOV_B64_e32;
1081 EltSize = 8;
1082 } else if (ST.hasPkMovB32()) {
1083 Opcode = AMDGPU::V_PK_MOV_B32;
1084 EltSize = 8;
1085 }
1086 }
1087
1088 // For the cases where we need an intermediate instruction/temporary register
1089 // (destination is an AGPR), we need a scavenger.
1090 //
1091 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1092 // whole block for every handled copy.
1093 std::unique_ptr<RegScavenger> RS;
1094 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1095 RS.reset(new RegScavenger());
1096
1097 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1098
1099 // If there is an overlap, we can't kill the super-register on the last
1100 // instruction, since it will also kill the components made live by this def.
1101 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1102 const bool CanKillSuperReg = KillSrc && !Overlap;
1103
1104 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1105 unsigned SubIdx;
1106 if (Forward)
1107 SubIdx = SubIndices[Idx];
1108 else
1109 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1110 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1111 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1112 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1113
1114 bool IsFirstSubreg = Idx == 0;
1115 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1116
1117 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1118 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1119 Register ImpUseSuper = SrcReg;
1120 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1121 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1122 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1123 MachineInstrBuilder MIB =
1124 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1125 .addImm(SISrcMods::OP_SEL_1)
1126 .addReg(SrcSubReg)
1127 .addImm(SISrcMods::OP_SEL_1)
1128 .addReg(SrcSubReg)
1129 .addImm(0) // op_sel_lo
1130 .addImm(0) // op_sel_hi
1131 .addImm(0) // neg_lo
1132 .addImm(0) // neg_hi
1133 .addImm(0) // clamp
1134 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 if (IsFirstSubreg)
1136 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1137 } else {
1138 MachineInstrBuilder Builder =
1139 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1140 if (IsFirstSubreg)
1141 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1142
1143 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 }
1145 }
1146}
1147
1148int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1149 int NewOpc;
1150
1151 // Try to map original to commuted opcode
1152 NewOpc = AMDGPU::getCommuteRev(Opcode);
1153 if (NewOpc != -1)
1154 // Check if the commuted (REV) opcode exists on the target.
1155 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1156
1157 // Try to map commuted to original opcode
1158 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1159 if (NewOpc != -1)
1160 // Check if the original (non-REV) opcode exists on the target.
1161 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1162
1163 return Opcode;
1164}
1165
1166void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1167 MachineBasicBlock::iterator MI,
1168 const DebugLoc &DL, Register DestReg,
1169 int64_t Value) const {
1170 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1171 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1172 if (RegClass == &AMDGPU::SReg_32RegClass ||
1173 RegClass == &AMDGPU::SGPR_32RegClass ||
1174 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1175 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1176 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1177 .addImm(Value);
1178 return;
1179 }
1180
1181 if (RegClass == &AMDGPU::SReg_64RegClass ||
1182 RegClass == &AMDGPU::SGPR_64RegClass ||
1183 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1184 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1185 .addImm(Value);
1186 return;
1187 }
1188
1189 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1190 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1191 .addImm(Value);
1192 return;
1193 }
1194 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1195 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1196 .addImm(Value);
1197 return;
1198 }
1199
1200 unsigned EltSize = 4;
1201 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1202 if (RI.isSGPRClass(RegClass)) {
1203 if (RI.getRegSizeInBits(*RegClass) > 32) {
1204 Opcode = AMDGPU::S_MOV_B64;
1205 EltSize = 8;
1206 } else {
1207 Opcode = AMDGPU::S_MOV_B32;
1208 EltSize = 4;
1209 }
1210 }
1211
1212 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1213 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
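 // Illustrative note (not in the original source): only the first
 // subregister receives Value and the rest are zeroed, e.g. a VReg_96
 // destination becomes sub0 = Value, sub1 = 0, sub2 = 0.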
1214 int64_t IdxValue = Idx == 0 ? Value : 0;
1215
1216 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1217 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1218 Builder.addImm(IdxValue);
1219 }
1220}
1221
1222const TargetRegisterClass *
1223SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1224 return &AMDGPU::VGPR_32RegClass;
1225}
1226
1227void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1228 MachineBasicBlock::iterator I,
1229 const DebugLoc &DL, Register DstReg,
1230 ArrayRef<MachineOperand> Cond,
1231 Register TrueReg,
1232 Register FalseReg) const {
1233 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1234 const TargetRegisterClass *BoolXExecRC =
1235 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1236 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1237 "Not a VGPR32 reg");
1238
1239 if (Cond.size() == 1) {
1240 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1241 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1242 .add(Cond[0]);
1243 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1244 .addImm(0)
1245 .addReg(FalseReg)
1246 .addImm(0)
1247 .addReg(TrueReg)
1248 .addReg(SReg);
1249 } else if (Cond.size() == 2) {
1250 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1251 switch (Cond[0].getImm()) {
1252 case SIInstrInfo::SCC_TRUE: {
1253 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1254 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1255 : AMDGPU::S_CSELECT_B64), SReg)
1256 .addImm(1)
1257 .addImm(0);
1258 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1259 .addImm(0)
1260 .addReg(FalseReg)
1261 .addImm(0)
1262 .addReg(TrueReg)
1263 .addReg(SReg);
1264 break;
1265 }
1266 case SIInstrInfo::SCC_FALSE: {
1267 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1268 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1269 : AMDGPU::S_CSELECT_B64), SReg)
1270 .addImm(0)
1271 .addImm(1);
1272 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1273 .addImm(0)
1274 .addReg(FalseReg)
1275 .addImm(0)
1276 .addReg(TrueReg)
1277 .addReg(SReg);
1278 break;
1279 }
1280 case SIInstrInfo::VCCNZ: {
1281 MachineOperand RegOp = Cond[1];
1282 RegOp.setImplicit(false);
1283 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1284 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1285 .add(RegOp);
1286 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1287 .addImm(0)
1288 .addReg(FalseReg)
1289 .addImm(0)
1290 .addReg(TrueReg)
1291 .addReg(SReg);
1292 break;
1293 }
1294 case SIInstrInfo::VCCZ: {
1295 MachineOperand RegOp = Cond[1];
1296 RegOp.setImplicit(false);
1297 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1298 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1299 .add(RegOp);
1300 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1301 .addImm(0)
1302 .addReg(TrueReg)
1303 .addImm(0)
1304 .addReg(FalseReg)
1305 .addReg(SReg);
1306 break;
1307 }
1308 case SIInstrInfo::EXECNZ: {
1309 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1310 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1311 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1312 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1313 .addImm(0);
1314 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1315 : AMDGPU::S_CSELECT_B64), SReg)
1316 .addImm(1)
1317 .addImm(0);
1318 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1319 .addImm(0)
1320 .addReg(FalseReg)
1321 .addImm(0)
1322 .addReg(TrueReg)
1323 .addReg(SReg);
1324 break;
1325 }
1326 case SIInstrInfo::EXECZ: {
1327 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1328 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1329 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1330 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1331 .addImm(0);
1332 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1333 : AMDGPU::S_CSELECT_B64), SReg)
1334 .addImm(0)
1335 .addImm(1);
1336 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1337 .addImm(0)
1338 .addReg(FalseReg)
1339 .addImm(0)
1340 .addReg(TrueReg)
1341 .addReg(SReg);
1342 llvm_unreachable("Unhandled branch predicate EXECZ");
1343 break;
1344 }
1345 default:
1346 llvm_unreachable("invalid branch predicate");
1347 }
1348 } else {
1349 llvm_unreachable("Can only handle Cond size 1 or 2");
1350 }
1351}
1352
1353Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1354 MachineBasicBlock::iterator I,
1355 const DebugLoc &DL,
1356 Register SrcReg, int Value) const {
1357 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1358 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1359 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1360 .addImm(Value)
1361 .addReg(SrcReg);
1362
1363 return Reg;
1364}
1365
1366Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1367 MachineBasicBlock::iterator I,
1368 const DebugLoc &DL,
1369 Register SrcReg, int Value) const {
1370 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1371 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1372 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1373 .addImm(Value)
1374 .addReg(SrcReg);
1375
1376 return Reg;
1377}
1378
1379unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1380
1381 if (RI.isAGPRClass(DstRC))
1382 return AMDGPU::COPY;
1383 if (RI.getRegSizeInBits(*DstRC) == 16) {
1384 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1385 // before RA.
1386 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1387 } else if (RI.getRegSizeInBits(*DstRC) == 32) {
1388 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1389 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1390 return AMDGPU::S_MOV_B64;
1391 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1392 return AMDGPU::V_MOV_B64_PSEUDO;
1393 }
1394 return AMDGPU::COPY;
1395}
1396
1397const MCInstrDesc &
1398SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1399 bool IsIndirectSrc) const {
1400 if (IsIndirectSrc) {
1401 if (VecSize <= 32) // 4 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1403 if (VecSize <= 64) // 8 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1405 if (VecSize <= 96) // 12 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1407 if (VecSize <= 128) // 16 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1409 if (VecSize <= 160) // 20 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 256) // 32 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1441 if (VecSize <= 288) // 36 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1443 if (VecSize <= 320) // 40 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1445 if (VecSize <= 352) // 44 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1447 if (VecSize <= 384) // 48 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1449 if (VecSize <= 512) // 64 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1451 if (VecSize <= 1024) // 128 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1453
1454 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1455}
1456
1457static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1458 if (VecSize <= 32) // 4 bytes
1459 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1460 if (VecSize <= 64) // 8 bytes
1461 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1462 if (VecSize <= 96) // 12 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1464 if (VecSize <= 128) // 16 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1466 if (VecSize <= 160) // 20 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1468 if (VecSize <= 256) // 32 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1470 if (VecSize <= 288) // 36 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1472 if (VecSize <= 320) // 40 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1474 if (VecSize <= 352) // 44 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1476 if (VecSize <= 384) // 48 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1478 if (VecSize <= 512) // 64 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1480 if (VecSize <= 1024) // 128 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1482
1483 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1484}
1485
1486static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1487 if (VecSize <= 32) // 4 bytes
1488 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1489 if (VecSize <= 64) // 8 bytes
1490 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1491 if (VecSize <= 96) // 12 bytes
1492 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1493 if (VecSize <= 128) // 16 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1495 if (VecSize <= 160) // 20 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1497 if (VecSize <= 256) // 32 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1499 if (VecSize <= 288) // 36 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1501 if (VecSize <= 320) // 40 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1503 if (VecSize <= 352) // 44 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1505 if (VecSize <= 384) // 48 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1507 if (VecSize <= 512) // 64 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1509 if (VecSize <= 1024) // 128 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1511
1512 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1513}
1514
1515static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1516 if (VecSize <= 64) // 8 bytes
1517 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1518 if (VecSize <= 128) // 16 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1520 if (VecSize <= 256) // 32 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1522 if (VecSize <= 512) // 64 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1524 if (VecSize <= 1024) // 128 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1526
1527 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1528}
1529
1530const MCInstrDesc &
1531SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1532 bool IsSGPR) const {
1533 if (IsSGPR) {
1534 switch (EltSize) {
1535 case 32:
1536 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1537 case 64:
1538 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1539 default:
1540 llvm_unreachable("invalid reg indexing elt size");
1541 }
1542 }
1543
1544 assert(EltSize == 32 && "invalid reg indexing elt size");
1545 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1546}
1547
1548static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1549 switch (Size) {
1550 case 4:
1551 return AMDGPU::SI_SPILL_S32_SAVE;
1552 case 8:
1553 return AMDGPU::SI_SPILL_S64_SAVE;
1554 case 12:
1555 return AMDGPU::SI_SPILL_S96_SAVE;
1556 case 16:
1557 return AMDGPU::SI_SPILL_S128_SAVE;
1558 case 20:
1559 return AMDGPU::SI_SPILL_S160_SAVE;
1560 case 24:
1561 return AMDGPU::SI_SPILL_S192_SAVE;
1562 case 28:
1563 return AMDGPU::SI_SPILL_S224_SAVE;
1564 case 32:
1565 return AMDGPU::SI_SPILL_S256_SAVE;
1566 case 36:
1567 return AMDGPU::SI_SPILL_S288_SAVE;
1568 case 40:
1569 return AMDGPU::SI_SPILL_S320_SAVE;
1570 case 44:
1571 return AMDGPU::SI_SPILL_S352_SAVE;
1572 case 48:
1573 return AMDGPU::SI_SPILL_S384_SAVE;
1574 case 64:
1575 return AMDGPU::SI_SPILL_S512_SAVE;
1576 case 128:
1577 return AMDGPU::SI_SPILL_S1024_SAVE;
1578 default:
1579 llvm_unreachable("unknown register size");
1580 }
1581}
1582
1583static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1584 switch (Size) {
1585 case 4:
1586 return AMDGPU::SI_SPILL_V32_SAVE;
1587 case 8:
1588 return AMDGPU::SI_SPILL_V64_SAVE;
1589 case 12:
1590 return AMDGPU::SI_SPILL_V96_SAVE;
1591 case 16:
1592 return AMDGPU::SI_SPILL_V128_SAVE;
1593 case 20:
1594 return AMDGPU::SI_SPILL_V160_SAVE;
1595 case 24:
1596 return AMDGPU::SI_SPILL_V192_SAVE;
1597 case 28:
1598 return AMDGPU::SI_SPILL_V224_SAVE;
1599 case 32:
1600 return AMDGPU::SI_SPILL_V256_SAVE;
1601 case 36:
1602 return AMDGPU::SI_SPILL_V288_SAVE;
1603 case 40:
1604 return AMDGPU::SI_SPILL_V320_SAVE;
1605 case 44:
1606 return AMDGPU::SI_SPILL_V352_SAVE;
1607 case 48:
1608 return AMDGPU::SI_SPILL_V384_SAVE;
1609 case 64:
1610 return AMDGPU::SI_SPILL_V512_SAVE;
1611 case 128:
1612 return AMDGPU::SI_SPILL_V1024_SAVE;
1613 default:
1614 llvm_unreachable("unknown register size");
1615 }
1616}
1617
1618static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1619 switch (Size) {
1620 case 4:
1621 return AMDGPU::SI_SPILL_A32_SAVE;
1622 case 8:
1623 return AMDGPU::SI_SPILL_A64_SAVE;
1624 case 12:
1625 return AMDGPU::SI_SPILL_A96_SAVE;
1626 case 16:
1627 return AMDGPU::SI_SPILL_A128_SAVE;
1628 case 20:
1629 return AMDGPU::SI_SPILL_A160_SAVE;
1630 case 24:
1631 return AMDGPU::SI_SPILL_A192_SAVE;
1632 case 28:
1633 return AMDGPU::SI_SPILL_A224_SAVE;
1634 case 32:
1635 return AMDGPU::SI_SPILL_A256_SAVE;
1636 case 36:
1637 return AMDGPU::SI_SPILL_A288_SAVE;
1638 case 40:
1639 return AMDGPU::SI_SPILL_A320_SAVE;
1640 case 44:
1641 return AMDGPU::SI_SPILL_A352_SAVE;
1642 case 48:
1643 return AMDGPU::SI_SPILL_A384_SAVE;
1644 case 64:
1645 return AMDGPU::SI_SPILL_A512_SAVE;
1646 case 128:
1647 return AMDGPU::SI_SPILL_A1024_SAVE;
1648 default:
1649 llvm_unreachable("unknown register size");
1650 }
1651}
1652
1653static unsigned getAVSpillSaveOpcode(unsigned Size) {
1654 switch (Size) {
1655 case 4:
1656 return AMDGPU::SI_SPILL_AV32_SAVE;
1657 case 8:
1658 return AMDGPU::SI_SPILL_AV64_SAVE;
1659 case 12:
1660 return AMDGPU::SI_SPILL_AV96_SAVE;
1661 case 16:
1662 return AMDGPU::SI_SPILL_AV128_SAVE;
1663 case 20:
1664 return AMDGPU::SI_SPILL_AV160_SAVE;
1665 case 24:
1666 return AMDGPU::SI_SPILL_AV192_SAVE;
1667 case 28:
1668 return AMDGPU::SI_SPILL_AV224_SAVE;
1669 case 32:
1670 return AMDGPU::SI_SPILL_AV256_SAVE;
1671 case 36:
1672 return AMDGPU::SI_SPILL_AV288_SAVE;
1673 case 40:
1674 return AMDGPU::SI_SPILL_AV320_SAVE;
1675 case 44:
1676 return AMDGPU::SI_SPILL_AV352_SAVE;
1677 case 48:
1678 return AMDGPU::SI_SPILL_AV384_SAVE;
1679 case 64:
1680 return AMDGPU::SI_SPILL_AV512_SAVE;
1681 case 128:
1682 return AMDGPU::SI_SPILL_AV1024_SAVE;
1683 default:
1684 llvm_unreachable("unknown register size");
1685 }
1686}
1687
1688static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1689 bool IsVectorSuperClass) {
1690 // Currently, only 32-bit WWM register spills are needed.
1691 if (Size != 4)
1692 llvm_unreachable("unknown wwm register spill size");
1693
1694 if (IsVectorSuperClass)
1695 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1696
1697 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1698}
1699
1700static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1701 const TargetRegisterClass *RC,
1702 unsigned Size,
1703 const SIRegisterInfo &TRI,
1704 const SIMachineFunctionInfo &MFI) {
1705 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1706
1707 // Choose the right opcode if spilling a WWM register.
1708 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1709 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1710
1711 if (IsVectorSuperClass)
1712 return getAVSpillSaveOpcode(Size);
1713
1714 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1715 : getVGPRSpillSaveOpcode(Size);
1716}
1717
1718void SIInstrInfo::storeRegToStackSlot(
1719 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1720 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1721 const TargetRegisterInfo *TRI, Register VReg) const {
1722 MachineFunction *MF = MBB.getParent();
1723 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1724 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1725 const DebugLoc &DL = MBB.findDebugLoc(MI);
1726
1727 MachinePointerInfo PtrInfo
1728 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1729 MachineMemOperand *MMO = MF->getMachineMemOperand(
1730 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1731 FrameInfo.getObjectAlign(FrameIndex));
1732 unsigned SpillSize = TRI->getSpillSize(*RC);
1733
1735 if (RI.isSGPRClass(RC)) {
1736 MFI->setHasSpilledSGPRs();
1737 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1738 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1739 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1740
1741 // We are only allowed to create one new instruction when spilling
1742 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1743 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1744
1745 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1746 // need to make sure we are using the correct register class.
1747 if (SrcReg.isVirtual() && SpillSize == 4) {
1748 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1749 }
1750
1751 BuildMI(MBB, MI, DL, OpDesc)
1752 .addReg(SrcReg, getKillRegState(isKill)) // data
1753 .addFrameIndex(FrameIndex) // addr
1754 .addMemOperand(MMO)
1755 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1756
1757 if (RI.spillSGPRToVGPR())
1758 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1759 return;
1760 }
1761
1762 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1763 SpillSize, RI, *MFI);
1764 MFI->setHasSpilledVGPRs();
1765
1766 BuildMI(MBB, MI, DL, get(Opcode))
1767 .addReg(SrcReg, getKillRegState(isKill)) // data
1768 .addFrameIndex(FrameIndex) // addr
1769 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1770 .addImm(0) // offset
1771 .addMemOperand(MMO);
1772}
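// Rough shape of the result for a 4-byte VGPR spill (operand order follows
// the BuildMI above; the exact MIR depends on the frame index and subtarget):
//   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0
//       :: (store (s32) into %stack.0, addrspace 5)
// SGPR spills instead use the SI_SPILL_S*_SAVE pseudos and tag the frame
// index with TargetStackID::SGPRSpill so they can later be lowered to VGPR
// lanes instead of scratch memory.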
1773
1774static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1775 switch (Size) {
1776 case 4:
1777 return AMDGPU::SI_SPILL_S32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_S64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_S96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_S128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_S160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_S192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_S224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_S256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_S288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_S320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_S352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_S384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_S512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_S1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 4:
1812 return AMDGPU::SI_SPILL_V32_RESTORE;
1813 case 8:
1814 return AMDGPU::SI_SPILL_V64_RESTORE;
1815 case 12:
1816 return AMDGPU::SI_SPILL_V96_RESTORE;
1817 case 16:
1818 return AMDGPU::SI_SPILL_V128_RESTORE;
1819 case 20:
1820 return AMDGPU::SI_SPILL_V160_RESTORE;
1821 case 24:
1822 return AMDGPU::SI_SPILL_V192_RESTORE;
1823 case 28:
1824 return AMDGPU::SI_SPILL_V224_RESTORE;
1825 case 32:
1826 return AMDGPU::SI_SPILL_V256_RESTORE;
1827 case 36:
1828 return AMDGPU::SI_SPILL_V288_RESTORE;
1829 case 40:
1830 return AMDGPU::SI_SPILL_V320_RESTORE;
1831 case 44:
1832 return AMDGPU::SI_SPILL_V352_RESTORE;
1833 case 48:
1834 return AMDGPU::SI_SPILL_V384_RESTORE;
1835 case 64:
1836 return AMDGPU::SI_SPILL_V512_RESTORE;
1837 case 128:
1838 return AMDGPU::SI_SPILL_V1024_RESTORE;
1839 default:
1840 llvm_unreachable("unknown register size");
1841 }
1842}
1843
1844static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1845 switch (Size) {
1846 case 4:
1847 return AMDGPU::SI_SPILL_A32_RESTORE;
1848 case 8:
1849 return AMDGPU::SI_SPILL_A64_RESTORE;
1850 case 12:
1851 return AMDGPU::SI_SPILL_A96_RESTORE;
1852 case 16:
1853 return AMDGPU::SI_SPILL_A128_RESTORE;
1854 case 20:
1855 return AMDGPU::SI_SPILL_A160_RESTORE;
1856 case 24:
1857 return AMDGPU::SI_SPILL_A192_RESTORE;
1858 case 28:
1859 return AMDGPU::SI_SPILL_A224_RESTORE;
1860 case 32:
1861 return AMDGPU::SI_SPILL_A256_RESTORE;
1862 case 36:
1863 return AMDGPU::SI_SPILL_A288_RESTORE;
1864 case 40:
1865 return AMDGPU::SI_SPILL_A320_RESTORE;
1866 case 44:
1867 return AMDGPU::SI_SPILL_A352_RESTORE;
1868 case 48:
1869 return AMDGPU::SI_SPILL_A384_RESTORE;
1870 case 64:
1871 return AMDGPU::SI_SPILL_A512_RESTORE;
1872 case 128:
1873 return AMDGPU::SI_SPILL_A1024_RESTORE;
1874 default:
1875 llvm_unreachable("unknown register size");
1876 }
1877}
1878
1879static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1880 switch (Size) {
1881 case 4:
1882 return AMDGPU::SI_SPILL_AV32_RESTORE;
1883 case 8:
1884 return AMDGPU::SI_SPILL_AV64_RESTORE;
1885 case 12:
1886 return AMDGPU::SI_SPILL_AV96_RESTORE;
1887 case 16:
1888 return AMDGPU::SI_SPILL_AV128_RESTORE;
1889 case 20:
1890 return AMDGPU::SI_SPILL_AV160_RESTORE;
1891 case 24:
1892 return AMDGPU::SI_SPILL_AV192_RESTORE;
1893 case 28:
1894 return AMDGPU::SI_SPILL_AV224_RESTORE;
1895 case 32:
1896 return AMDGPU::SI_SPILL_AV256_RESTORE;
1897 case 36:
1898 return AMDGPU::SI_SPILL_AV288_RESTORE;
1899 case 40:
1900 return AMDGPU::SI_SPILL_AV320_RESTORE;
1901 case 44:
1902 return AMDGPU::SI_SPILL_AV352_RESTORE;
1903 case 48:
1904 return AMDGPU::SI_SPILL_AV384_RESTORE;
1905 case 64:
1906 return AMDGPU::SI_SPILL_AV512_RESTORE;
1907 case 128:
1908 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1909 default:
1910 llvm_unreachable("unknown register size");
1911 }
1912}
1913
1914static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1915 bool IsVectorSuperClass) {
1916 // Currently, only 32-bit WWM register spills are needed.
1917 if (Size != 4)
1918 llvm_unreachable("unknown wwm register spill size");
1919
1920 if (IsVectorSuperClass)
1921 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1922
1923 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1924}
1925
1926static unsigned
1927getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1928 unsigned Size, const SIRegisterInfo &TRI,
1929 const SIMachineFunctionInfo &MFI) {
1930 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1931
1932 // Choose the right opcode if restoring a WWM register.
1933 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1934 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1935
1936 if (IsVectorSuperClass)
1937 return getAVSpillRestoreOpcode(Size);
1938
1939 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1940 : getVGPRSpillRestoreOpcode(Size);
1941}
1942
1943void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1944 MachineBasicBlock::iterator MI,
1945 Register DestReg, int FrameIndex,
1946 const TargetRegisterClass *RC,
1947 const TargetRegisterInfo *TRI,
1948 Register VReg) const {
1949 MachineFunction *MF = MBB.getParent();
1950 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1951 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1952 const DebugLoc &DL = MBB.findDebugLoc(MI);
1953 unsigned SpillSize = TRI->getSpillSize(*RC);
1954
1955 MachinePointerInfo PtrInfo
1956 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1957
1958 MachineMemOperand *MMO = MF->getMachineMemOperand(
1959 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1960 FrameInfo.getObjectAlign(FrameIndex));
1961
1962 if (RI.isSGPRClass(RC)) {
1963 MFI->setHasSpilledSGPRs();
1964 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1965 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1966 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1967
1968 // FIXME: Maybe this should not include a memoperand because it will be
1969 // lowered to non-memory instructions.
1970 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1971 if (DestReg.isVirtual() && SpillSize == 4) {
1972 MachineRegisterInfo &MRI = MF->getRegInfo();
1973 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1974 }
1975
1976 if (RI.spillSGPRToVGPR())
1977 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1978 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1979 .addFrameIndex(FrameIndex) // addr
1980 .addMemOperand(MMO)
1981 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1982
1983 return;
1984 }
1985
1986 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1987 SpillSize, RI, *MFI);
1988 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1989 .addFrameIndex(FrameIndex) // vaddr
1990 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1991 .addImm(0) // offset
1992 .addMemOperand(MMO);
1993}
1994
1995void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1996 MachineBasicBlock::iterator MI) const {
1997 insertNoops(MBB, MI, 1);
1998}
1999
2000void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2001 MachineBasicBlock::iterator MI,
2002 unsigned Quantity) const {
2003 DebugLoc DL;
2004 while (Quantity > 0) {
2005 unsigned Arg = std::min(Quantity, 8u);
2006 Quantity -= Arg;
2007 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2008 }
2009}
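// Each S_NOP immediate N encodes N + 1 wait states (N is capped at 7 above),
// so the request is split greedily. For example, insertNoops(MBB, MI, 20)
// emits:
//   s_nop 7    ; 8 wait states
//   s_nop 7    ; 8 wait states
//   s_nop 3    ; 4 wait states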
2010
2011void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2012 auto MF = MBB.getParent();
2013 auto Info = MF->getInfo<SIMachineFunctionInfo>();
2014
2015 assert(Info->isEntryFunction());
2016
2017 if (MBB.succ_empty()) {
2018 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2019 if (HasNoTerminator) {
2020 if (Info->returnsVoid()) {
2021 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2022 } else {
2023 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2024 }
2025 }
2026 }
2027}
2028
2029MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2030 MachineBasicBlock &MBB,
2031 MachineInstr &MI,
2032 const DebugLoc &DL) const {
2033 MachineFunction *MF = MBB.getParent();
2034 MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2035 MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock();
2036 MF->push_back(HaltLoop);
2037
2038 constexpr unsigned DoorbellIDMask = 0x3ff;
2039 constexpr unsigned ECQueueWaveAbort = 0x400;
2040
2041 // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround,
2042 // this will be a no-op.
2043 BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP))
2044 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2045 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2046 BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg)
2048 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2049 .addUse(AMDGPU::M0);
2050 Register DoorbellRegMasked =
2051 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2052 BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2053 .addUse(DoorbellReg)
2054 .addImm(DoorbellIDMask);
2055 Register SetWaveAbortBit =
2056 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2057 BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2058 .addUse(DoorbellRegMasked)
2059 .addImm(ECQueueWaveAbort);
2060 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2061 .addUse(SetWaveAbortBit);
2062 BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG))
2064 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2065 .addUse(AMDGPU::TTMP2);
2066 BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop);
2067
2068 BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2069 BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH))
2070 .addMBB(HaltLoop);
2071
2072 if (SplitBB != &MBB)
2073 MBB.removeSuccessor(SplitBB);
2074 MBB.addSuccessor(HaltLoop);
2075 HaltLoop->addSuccessor(HaltLoop);
2076
2077 return SplitBB;
2078}
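// Sketch of the code built above (register names are illustrative only):
//   s_trap 2                         ; no-op when the workaround is active
//   s_sendmsg_rtn_b32 s0, <doorbell> ; query the queue doorbell ID
//   s_mov_b32 ttmp2, m0              ; preserve m0
//   s_and_b32 s1, s0, 0x3ff          ; DoorbellIDMask
//   s_or_b32  s1, s1, 0x400          ; ECQueueWaveAbort
//   s_mov_b32 m0, s1
//   s_sendmsg <msg>                  ; report the abort
//   s_mov_b32 m0, ttmp2              ; restore m0
//   s_branch halt_loop
// halt_loop:
//   s_sethalt 5
//   s_branch halt_loop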
2079
2080unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2081 switch (MI.getOpcode()) {
2082 default:
2083 if (MI.isMetaInstruction())
2084 return 0;
2085 return 1; // FIXME: Do wait states equal cycles?
2086
2087 case AMDGPU::S_NOP:
2088 return MI.getOperand(0).getImm() + 1;
2089 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2090 // hazard, even if one exists, won't really be visible. Should we handle it?
2091 }
2092}
2093
2094bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2095 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2096 MachineBasicBlock &MBB = *MI.getParent();
2097 DebugLoc DL = MBB.findDebugLoc(MI);
2098 switch (MI.getOpcode()) {
2099 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2100 case AMDGPU::S_MOV_B64_term:
2101 // This is only a terminator to get the correct spill code placement during
2102 // register allocation.
2103 MI.setDesc(get(AMDGPU::S_MOV_B64));
2104 break;
2105
2106 case AMDGPU::S_MOV_B32_term:
2107 // This is only a terminator to get the correct spill code placement during
2108 // register allocation.
2109 MI.setDesc(get(AMDGPU::S_MOV_B32));
2110 break;
2111
2112 case AMDGPU::S_XOR_B64_term:
2113 // This is only a terminator to get the correct spill code placement during
2114 // register allocation.
2115 MI.setDesc(get(AMDGPU::S_XOR_B64));
2116 break;
2117
2118 case AMDGPU::S_XOR_B32_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(AMDGPU::S_XOR_B32));
2122 break;
2123 case AMDGPU::S_OR_B64_term:
2124 // This is only a terminator to get the correct spill code placement during
2125 // register allocation.
2126 MI.setDesc(get(AMDGPU::S_OR_B64));
2127 break;
2128 case AMDGPU::S_OR_B32_term:
2129 // This is only a terminator to get the correct spill code placement during
2130 // register allocation.
2131 MI.setDesc(get(AMDGPU::S_OR_B32));
2132 break;
2133
2134 case AMDGPU::S_ANDN2_B64_term:
2135 // This is only a terminator to get the correct spill code placement during
2136 // register allocation.
2137 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2138 break;
2139
2140 case AMDGPU::S_ANDN2_B32_term:
2141 // This is only a terminator to get the correct spill code placement during
2142 // register allocation.
2143 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2144 break;
2145
2146 case AMDGPU::S_AND_B64_term:
2147 // This is only a terminator to get the correct spill code placement during
2148 // register allocation.
2149 MI.setDesc(get(AMDGPU::S_AND_B64));
2150 break;
2151
2152 case AMDGPU::S_AND_B32_term:
2153 // This is only a terminator to get the correct spill code placement during
2154 // register allocation.
2155 MI.setDesc(get(AMDGPU::S_AND_B32));
2156 break;
2157
2158 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2159 // This is only a terminator to get the correct spill code placement during
2160 // register allocation.
2161 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2162 break;
2163
2164 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2165 // This is only a terminator to get the correct spill code placement during
2166 // register allocation.
2167 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2168 break;
2169
2170 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2171 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2172 break;
2173
2174 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2175 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2176 break;
2177
2178 case AMDGPU::V_MOV_B64_PSEUDO: {
2179 Register Dst = MI.getOperand(0).getReg();
2180 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2181 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2182
2183 const MachineOperand &SrcOp = MI.getOperand(1);
2184 // FIXME: Will this work for 64-bit floating point immediates?
2185 assert(!SrcOp.isFPImm());
2186 if (ST.hasMovB64()) {
2187 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2188 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2189 isUInt<32>(SrcOp.getImm()))
2190 break;
2191 }
2192 if (SrcOp.isImm()) {
2193 APInt Imm(64, SrcOp.getImm());
2194 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2195 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2196 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2198 .addImm(SISrcMods::OP_SEL_1)
2199 .addImm(Lo.getSExtValue())
2200 .addImm(SISrcMods::OP_SEL_1)
2201 .addImm(Lo.getSExtValue())
2202 .addImm(0) // op_sel_lo
2203 .addImm(0) // op_sel_hi
2204 .addImm(0) // neg_lo
2205 .addImm(0) // neg_hi
2206 .addImm(0); // clamp
2207 } else {
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2209 .addImm(Lo.getSExtValue())
2210 .addReg(Dst, RegState::Implicit | RegState::Define);
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2212 .addImm(Hi.getSExtValue())
2213 .addReg(Dst, RegState::Implicit | RegState::Define);
2214 }
2215 } else {
2216 assert(SrcOp.isReg());
2217 if (ST.hasPkMovB32() &&
2218 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2219 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2220 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2221 .addReg(SrcOp.getReg())
2222 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2223 .addReg(SrcOp.getReg())
2224 .addImm(0) // op_sel_lo
2225 .addImm(0) // op_sel_hi
2226 .addImm(0) // neg_lo
2227 .addImm(0) // neg_hi
2228 .addImm(0); // clamp
2229 } else {
2230 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2231 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2232 .addReg(Dst, RegState::Implicit | RegState::Define);
2233 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2234 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2235 .addReg(Dst, RegState::Implicit | RegState::Define);
2236 }
2237 }
2238 MI.eraseFromParent();
2239 break;
2240 }
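// As an example, expanding V_MOV_B64_PSEUDO of 0x3ff0000000000000 on a
// subtarget without a usable 64-bit mov or packed mov splits into:
//   v_mov_b32_e32 v0, 0            ; low half
//   v_mov_b32_e32 v1, 0x3ff00000   ; high half
// while an immediate whose halves are equal and inlinable may instead become
// a single v_pk_mov_b32 when the subtarget supports it.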
2241 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2242 expandMovDPP64(MI);
2243 break;
2244 }
2245 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2246 const MachineOperand &SrcOp = MI.getOperand(1);
2247 assert(!SrcOp.isFPImm());
2248 APInt Imm(64, SrcOp.getImm());
2249 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2250 MI.setDesc(get(AMDGPU::S_MOV_B64));
2251 break;
2252 }
2253
2254 Register Dst = MI.getOperand(0).getReg();
2255 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2256 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2257
2258 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2259 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2260 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2261 .addImm(Lo.getSExtValue())
2262 .addReg(Dst, RegState::Implicit | RegState::Define);
2263 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2264 .addImm(Hi.getSExtValue())
2265 .addReg(Dst, RegState::Implicit | RegState::Define);
2266 MI.eraseFromParent();
2267 break;
2268 }
2269 case AMDGPU::V_SET_INACTIVE_B32: {
2270 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2271 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2272 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
2273 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
2274 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2275 .add(MI.getOperand(1));
2276 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2277 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2278 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2279 .add(MI.getOperand(2));
2280 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2281 .addReg(Exec);
2282 MI.eraseFromParent();
2283 break;
2284 }
2285 case AMDGPU::V_SET_INACTIVE_B64: {
2286 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2287 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2288 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2289 MI.getOperand(0).getReg())
2290 .add(MI.getOperand(1));
2291 expandPostRAPseudo(*Copy);
2292 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2293 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2294 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2295 MI.getOperand(0).getReg())
2296 .add(MI.getOperand(2));
2297 expandPostRAPseudo(*Copy);
2298 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2299 .addReg(Exec);
2300 MI.eraseFromParent();
2301 break;
2302 }
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2309 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2332 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2333
2334 unsigned Opc;
2335 if (RI.hasVGPRs(EltRC)) {
2336 Opc = AMDGPU::V_MOVRELD_B32_e32;
2337 } else {
2338 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2339 : AMDGPU::S_MOVRELD_B32;
2340 }
2341
2342 const MCInstrDesc &OpDesc = get(Opc);
2343 Register VecReg = MI.getOperand(0).getReg();
2344 bool IsUndef = MI.getOperand(1).isUndef();
2345 unsigned SubReg = MI.getOperand(3).getImm();
2346 assert(VecReg == MI.getOperand(1).getReg());
2347
2348 MachineInstrBuilder MIB =
2349 BuildMI(MBB, MI, DL, OpDesc)
2350 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2351 .add(MI.getOperand(2))
2352 .addReg(VecReg, RegState::ImplicitDefine)
2353 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2354
2355 const int ImpDefIdx =
2356 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2357 const int ImpUseIdx = ImpDefIdx + 1;
2358 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2359 MI.eraseFromParent();
2360 break;
2361 }
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2364 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2365 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2366 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2367 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2375 Register VecReg = MI.getOperand(0).getReg();
2376 bool IsUndef = MI.getOperand(1).isUndef();
2377 Register Idx = MI.getOperand(3).getReg();
2378 Register SubReg = MI.getOperand(4).getImm();
2379
2380 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2381 .addReg(Idx)
2382 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2383 SetOn->getOperand(3).setIsUndef();
2384
2385 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2386 MachineInstrBuilder MIB =
2387 BuildMI(MBB, MI, DL, OpDesc)
2388 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2389 .add(MI.getOperand(2))
2390 .addReg(VecReg, RegState::ImplicitDefine)
2391 .addReg(VecReg,
2392 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2393
2394 const int ImpDefIdx =
2395 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2396 const int ImpUseIdx = ImpDefIdx + 1;
2397 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2398
2399 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2400
2401 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2402
2403 MI.eraseFromParent();
2404 break;
2405 }
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2409 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2410 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2411 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2419 Register Dst = MI.getOperand(0).getReg();
2420 Register VecReg = MI.getOperand(1).getReg();
2421 bool IsUndef = MI.getOperand(1).isUndef();
2422 Register Idx = MI.getOperand(2).getReg();
2423 Register SubReg = MI.getOperand(3).getImm();
2424
2425 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2426 .addReg(Idx)
2427 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2428 SetOn->getOperand(3).setIsUndef();
2429
2430 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2431 .addDef(Dst)
2432 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2433 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2434
2435 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2436
2437 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2438
2439 MI.eraseFromParent();
2440 break;
2441 }
2442 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2443 MachineFunction &MF = *MBB.getParent();
2444 Register Reg = MI.getOperand(0).getReg();
2445 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2446 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2447 MachineOperand OpLo = MI.getOperand(1);
2448 MachineOperand OpHi = MI.getOperand(2);
2449
2450 // Create a bundle so these instructions won't be re-ordered by the
2451 // post-RA scheduler.
2452 MIBundleBuilder Bundler(MBB, MI);
2453 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2454
2455 // What we want here is an offset from the value returned by s_getpc (which
2456 // is the address of the s_add_u32 instruction) to the global variable, but
2457 // since the encoding of $symbol starts 4 bytes after the start of the
2458 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2459 // small. This requires us to add 4 to the global variable offset in order
2460 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2461 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2462 // instruction.
2463
2464 int64_t Adjust = 0;
2465 if (ST.hasGetPCZeroExtension()) {
2466 // Fix up hardware that does not sign-extend the 48-bit PC value by
2467 // inserting: s_sext_i32_i16 reghi, reghi
2468 Bundler.append(
2469 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2470 Adjust += 4;
2471 }
2472
2473 if (OpLo.isGlobal())
2474 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2475 Bundler.append(
2476 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2477
2478 if (OpHi.isGlobal())
2479 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2480 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2481 .addReg(RegHi)
2482 .add(OpHi));
2483
2484 finalizeBundle(MBB, Bundler.begin());
2485
2486 MI.eraseFromParent();
2487 break;
2488 }
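// With the adjustments above the bundle typically assembles to something
// like (the +4 / +12 terms account for where $symbol is encoded relative to
// s_add_u32, as described in the comment):
//   s_getpc_b64 s[0:1]
//   s_add_u32   s0, s0, sym@rel32@lo+4
//   s_addc_u32  s1, s1, sym@rel32@hi+12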
2489 case AMDGPU::ENTER_STRICT_WWM: {
2490 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2491 // Whole Wave Mode is entered.
2492 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2493 : AMDGPU::S_OR_SAVEEXEC_B64));
2494 break;
2495 }
2496 case AMDGPU::ENTER_STRICT_WQM: {
2497 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2498 // STRICT_WQM is entered.
2499 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2500 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2501 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2502 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2503 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2504
2505 MI.eraseFromParent();
2506 break;
2507 }
2508 case AMDGPU::EXIT_STRICT_WWM:
2509 case AMDGPU::EXIT_STRICT_WQM: {
2510 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2511 // WWM/STRICT_WQM is exited.
2512 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2513 break;
2514 }
2515 case AMDGPU::ENTER_PSEUDO_WM:
2516 case AMDGPU::EXIT_PSEUDO_WM: {
2517 // These do nothing.
2518 MI.eraseFromParent();
2519 break;
2520 }
2521 case AMDGPU::SI_RETURN: {
2522 const MachineFunction *MF = MBB.getParent();
2523 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2524 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2525 // Hiding the return address use with SI_RETURN may lead to extra kills in
2526 // the function and missing live-ins. We are fine in practice because callee
2527 // saved register handling ensures the register value is restored before
2528 // RET, but we need the undef flag here to appease the MachineVerifier
2529 // liveness checks.
2530 MachineInstrBuilder MIB =
2531 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2532 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2533
2534 MIB.copyImplicitOps(MI);
2535 MI.eraseFromParent();
2536 break;
2537 }
2538
2539 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2540 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2541 MI.setDesc(get(AMDGPU::S_MUL_U64));
2542 break;
2543
2544 case AMDGPU::S_GETPC_B64_pseudo:
2545 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2546 if (ST.hasGetPCZeroExtension()) {
2547 Register Dst = MI.getOperand(0).getReg();
2548 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2549 // Fix up hardware that does not sign-extend the 48-bit PC value by
2550 // inserting: s_sext_i32_i16 dsthi, dsthi
2551 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2552 DstHi)
2553 .addReg(DstHi);
2554 }
2555 break;
2556 }
2557 return true;
2558}
2559
2560void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2561 MachineBasicBlock::iterator I, Register DestReg,
2562 unsigned SubIdx, const MachineInstr &Orig,
2563 const TargetRegisterInfo &RI) const {
2564
2565 // Try shrinking the instruction to remat only the part needed for current
2566 // context.
2567 // TODO: Handle more cases.
2568 unsigned Opcode = Orig.getOpcode();
2569 switch (Opcode) {
2570 case AMDGPU::S_LOAD_DWORDX16_IMM:
2571 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2572 if (SubIdx != 0)
2573 break;
2574
2575 if (I == MBB.end())
2576 break;
2577
2578 if (I->isBundled())
2579 break;
2580
2581 // Look for a single use of the register that is also a subreg.
2582 Register RegToFind = Orig.getOperand(0).getReg();
2583 MachineOperand *UseMO = nullptr;
2584 for (auto &CandMO : I->operands()) {
2585 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2586 continue;
2587 if (UseMO) {
2588 UseMO = nullptr;
2589 break;
2590 }
2591 UseMO = &CandMO;
2592 }
2593 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2594 break;
2595
2596 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2597 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2598
2599 MachineFunction *MF = MBB.getParent();
2600 MachineRegisterInfo &MRI = MF->getRegInfo();
2601 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2602
2603 unsigned NewOpcode = -1;
2604 if (SubregSize == 256)
2605 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2606 else if (SubregSize == 128)
2607 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2608 else
2609 break;
2610
2611 const MCInstrDesc &TID = get(NewOpcode);
2612 const TargetRegisterClass *NewRC =
2613 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2614 MRI.setRegClass(DestReg, NewRC);
2615
2616 UseMO->setReg(DestReg);
2617 UseMO->setSubReg(AMDGPU::NoSubRegister);
2618
2619 // Use a smaller load with the desired size, possibly with updated offset.
2620 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2621 MI->setDesc(TID);
2622 MI->getOperand(0).setReg(DestReg);
2623 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2624 if (Offset) {
2625 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2626 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2627 OffsetMO->setImm(FinalOffset);
2628 }
2629 SmallVector<MachineMemOperand *> NewMMOs;
2630 for (const MachineMemOperand *MemOp : Orig.memoperands())
2631 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2632 SubregSize / 8));
2633 MI->setMemRefs(*MF, NewMMOs);
2634
2635 MBB.insert(I, MI);
2636 return;
2637 }
2638
2639 default:
2640 break;
2641 }
2642
2643 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2644}
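// Sketch of the shrinking rematerialization above: if the only user of a
// rematerialized S_LOAD_DWORDX8_IMM reads just a 128-bit sub-register, the
// clone is turned into an S_LOAD_DWORDX4_IMM of a narrower register class,
// its immediate offset is advanced by the sub-register's bit offset divided
// by 8, the memory operand is shrunk to 16 bytes, and the user is rewritten
// to read the full narrower register.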
2645
2646std::pair<MachineInstr*, MachineInstr*>
2647SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2648 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2649
2650 if (ST.hasMovB64() &&
2652 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2653 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2654 return std::pair(&MI, nullptr);
2655 }
2656
2657 MachineBasicBlock &MBB = *MI.getParent();
2658 DebugLoc DL = MBB.findDebugLoc(MI);
2659 MachineFunction *MF = MBB.getParent();
2660 MachineRegisterInfo &MRI = MF->getRegInfo();
2661 Register Dst = MI.getOperand(0).getReg();
2662 unsigned Part = 0;
2663 MachineInstr *Split[2];
2664
2665 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2666 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2667 if (Dst.isPhysical()) {
2668 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2669 } else {
2670 assert(MRI.isSSA());
2671 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2672 MovDPP.addDef(Tmp);
2673 }
2674
2675 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2676 const MachineOperand &SrcOp = MI.getOperand(I);
2677 assert(!SrcOp.isFPImm());
2678 if (SrcOp.isImm()) {
2679 APInt Imm(64, SrcOp.getImm());
2680 Imm.ashrInPlace(Part * 32);
2681 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2682 } else {
2683 assert(SrcOp.isReg());
2684 Register Src = SrcOp.getReg();
2685 if (Src.isPhysical())
2686 MovDPP.addReg(RI.getSubReg(Src, Sub));
2687 else
2688 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2689 }
2690 }
2691
2692 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2693 MovDPP.addImm(MO.getImm());
2694
2695 Split[Part] = MovDPP;
2696 ++Part;
2697 }
2698
2699 if (Dst.isVirtual())
2700 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2701 .addReg(Split[0]->getOperand(0).getReg())
2702 .addImm(AMDGPU::sub0)
2703 .addReg(Split[1]->getOperand(0).getReg())
2704 .addImm(AMDGPU::sub1);
2705
2706 MI.eraseFromParent();
2707 return std::pair(Split[0], Split[1]);
2708}
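// For instance, on a subtarget without a native 64-bit DPP mov the pseudo is
// split roughly as (virtual-register case):
//   %lo:vgpr_32 = V_MOV_B32_dpp %old.sub0, %src.sub0, dpp_ctrl, ...
//   %hi:vgpr_32 = V_MOV_B32_dpp %old.sub1, %src.sub1, dpp_ctrl, ...
//   %dst:vreg_64 = REG_SEQUENCE %lo, sub0, %hi, sub1
// Physical destinations write the sub-registers directly and skip the
// REG_SEQUENCE.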
2709
2710std::optional<DestSourcePair>
2712 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2713 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2714
2715 return std::nullopt;
2716}
2717
2718bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2719 MachineOperand &Src0,
2720 unsigned Src0OpName,
2721 MachineOperand &Src1,
2722 unsigned Src1OpName) const {
2723 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2724 if (!Src0Mods)
2725 return false;
2726
2727 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2728 assert(Src1Mods &&
2729 "All commutable instructions have both src0 and src1 modifiers");
2730
2731 int Src0ModsVal = Src0Mods->getImm();
2732 int Src1ModsVal = Src1Mods->getImm();
2733
2734 Src1Mods->setImm(Src0ModsVal);
2735 Src0Mods->setImm(Src1ModsVal);
2736 return true;
2737}
2738
2739static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2740 MachineOperand &RegOp,
2741 MachineOperand &NonRegOp) {
2742 Register Reg = RegOp.getReg();
2743 unsigned SubReg = RegOp.getSubReg();
2744 bool IsKill = RegOp.isKill();
2745 bool IsDead = RegOp.isDead();
2746 bool IsUndef = RegOp.isUndef();
2747 bool IsDebug = RegOp.isDebug();
2748
2749 if (NonRegOp.isImm())
2750 RegOp.ChangeToImmediate(NonRegOp.getImm());
2751 else if (NonRegOp.isFI())
2752 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2753 else if (NonRegOp.isGlobal()) {
2754 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2755 NonRegOp.getTargetFlags());
2756 } else
2757 return nullptr;
2758
2759 // Make sure we don't reinterpret a subreg index in the target flags.
2760 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2761
2762 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2763 NonRegOp.setSubReg(SubReg);
2764
2765 return &MI;
2766}
2767
2768MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2769 unsigned Src0Idx,
2770 unsigned Src1Idx) const {
2771 assert(!NewMI && "this should never be used");
2772
2773 unsigned Opc = MI.getOpcode();
2774 int CommutedOpcode = commuteOpcode(Opc);
2775 if (CommutedOpcode == -1)
2776 return nullptr;
2777
2778 if (Src0Idx > Src1Idx)
2779 std::swap(Src0Idx, Src1Idx);
2780
2781 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2782 static_cast<int>(Src0Idx) &&
2783 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2784 static_cast<int>(Src1Idx) &&
2785 "inconsistency with findCommutedOpIndices");
2786
2787 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2788 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2789
2790 MachineInstr *CommutedMI = nullptr;
2791 if (Src0.isReg() && Src1.isReg()) {
2792 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2793 // Be sure to copy the source modifiers to the right place.
2794 CommutedMI
2795 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2796 }
2797
2798 } else if (Src0.isReg() && !Src1.isReg()) {
2799 // src0 should always be able to support any operand type, so no need to
2800 // check operand legality.
2801 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2802 } else if (!Src0.isReg() && Src1.isReg()) {
2803 if (isOperandLegal(MI, Src1Idx, &Src0))
2804 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2805 } else {
2806 // FIXME: Found two non-register operands to commute. This does happen.
2807 return nullptr;
2808 }
2809
2810 if (CommutedMI) {
2811 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2812 Src1, AMDGPU::OpName::src1_modifiers);
2813
2814 CommutedMI->setDesc(get(CommutedOpcode));
2815 }
2816
2817 return CommutedMI;
2818}
2819
2820// This needs to be implemented because the source modifiers may be inserted
2821// between the true commutable operands, and the base
2822// TargetInstrInfo::commuteInstruction uses it.
2823bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2824 unsigned &SrcOpIdx0,
2825 unsigned &SrcOpIdx1) const {
2826 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2827}
2828
2829bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2830 unsigned &SrcOpIdx0,
2831 unsigned &SrcOpIdx1) const {
2832 if (!Desc.isCommutable())
2833 return false;
2834
2835 unsigned Opc = Desc.getOpcode();
2836 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2837 if (Src0Idx == -1)
2838 return false;
2839
2840 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2841 if (Src1Idx == -1)
2842 return false;
2843
2844 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2845}
2846
2847bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2848 int64_t BrOffset) const {
2849 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2850 // block is unanalyzable.
2851 assert(BranchOp != AMDGPU::S_SETPC_B64);
2852
2853 // Convert to dwords.
2854 BrOffset /= 4;
2855
2856 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2857 // from the next instruction.
2858 BrOffset -= 1;
2859
2860 return isIntN(BranchOffsetBits, BrOffset);
2861}
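// Worked example with the default 16-bit branch immediate: a branch whose
// target lies 0x20000 bytes ahead becomes 0x20000 / 4 - 1 = 0x7fff dwords,
// which still fits in a signed 16-bit field, while 0x20004 bytes would not.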
2862
2863MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
2864 const MachineInstr &MI) const {
2865 return MI.getOperand(0).getMBB();
2866}
2867
2869 for (const MachineInstr &MI : MBB->terminators()) {
2870 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2871 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2872 MI.getOpcode() == AMDGPU::SI_LOOP)
2873 return true;
2874 }
2875 return false;
2876}
2877
2878void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2879 MachineBasicBlock &DestBB,
2880 MachineBasicBlock &RestoreBB,
2881 const DebugLoc &DL, int64_t BrOffset,
2882 RegScavenger *RS) const {
2883 assert(RS && "RegScavenger required for long branching");
2884 assert(MBB.empty() &&
2885 "new block should be inserted for expanding unconditional branch");
2886 assert(MBB.pred_size() == 1);
2887 assert(RestoreBB.empty() &&
2888 "restore block should be inserted for restoring clobbered registers");
2889
2890 MachineFunction *MF = MBB.getParent();
2891 MachineRegisterInfo &MRI = MF->getRegInfo();
2892 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2893
2894 // FIXME: Virtual register workaround for RegScavenger not working with empty
2895 // blocks.
2896 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2897
2898 auto I = MBB.end();
2899
2900 // We need to compute the offset relative to the instruction immediately after
2901 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2902 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2903
2904 auto &MCCtx = MF->getContext();
2905 MCSymbol *PostGetPCLabel =
2906 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2907 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2908
2909 MCSymbol *OffsetLo =
2910 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2911 MCSymbol *OffsetHi =
2912 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2913 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2914 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2915 .addReg(PCReg, 0, AMDGPU::sub0)
2916 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2917 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2918 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2919 .addReg(PCReg, 0, AMDGPU::sub1)
2920 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2921
2922 // Insert the indirect branch after the other terminator.
2923 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2924 .addReg(PCReg);
2925
2926 // If a spill is needed for the pc register pair, we need to insert a spill
2927 // restore block right before the destination block, and insert a short branch
2928 // into the old destination block's fallthrough predecessor.
2929 // e.g.:
2930 //
2931 // s_cbranch_scc0 skip_long_branch:
2932 //
2933 // long_branch_bb:
2934 // spill s[8:9]
2935 // s_getpc_b64 s[8:9]
2936 // s_add_u32 s8, s8, restore_bb
2937 // s_addc_u32 s9, s9, 0
2938 // s_setpc_b64 s[8:9]
2939 //
2940 // skip_long_branch:
2941 // foo;
2942 //
2943 // .....
2944 //
2945 // dest_bb_fallthrough_predecessor:
2946 // bar;
2947 // s_branch dest_bb
2948 //
2949 // restore_bb:
2950 // restore s[8:9]
2951 // fallthrough dest_bb
2952 //
2953 // dest_bb:
2954 // buzz;
2955
2956 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2957 Register Scav;
2958
2959 // If we've previously reserved a register for long branches
2960 // avoid running the scavenger and just use those registers
2961 if (LongBranchReservedReg) {
2962 RS->enterBasicBlock(MBB);
2963 Scav = LongBranchReservedReg;
2964 } else {
2965 RS->enterBasicBlockEnd(MBB);
2966 Scav = RS->scavengeRegisterBackwards(
2967 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2968 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2969 }
2970 if (Scav) {
2971 RS->setRegUsed(Scav);
2972 MRI.replaceRegWith(PCReg, Scav);
2973 MRI.clearVirtRegs();
2974 } else {
2975 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
2976 // SGPR spill.
2977 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2978 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2979 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2980 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2981 MRI.clearVirtRegs();
2982 }
2983
2984 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2985 // Now the distance can be defined.
2986 auto *Offset = MCBinaryExpr::createSub(
2987 MCSymbolRefExpr::create(DestLabel, MCCtx),
2988 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2989 // Add offset assignments.
2990 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2991 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2992 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2993 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2994}
2995
2996unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2997 switch (Cond) {
2998 case SIInstrInfo::SCC_TRUE:
2999 return AMDGPU::S_CBRANCH_SCC1;
3000 case SIInstrInfo::SCC_FALSE:
3001 return AMDGPU::S_CBRANCH_SCC0;
3002 case SIInstrInfo::VCCNZ:
3003 return AMDGPU::S_CBRANCH_VCCNZ;
3004 case SIInstrInfo::VCCZ:
3005 return AMDGPU::S_CBRANCH_VCCZ;
3006 case SIInstrInfo::EXECNZ:
3007 return AMDGPU::S_CBRANCH_EXECNZ;
3008 case SIInstrInfo::EXECZ:
3009 return AMDGPU::S_CBRANCH_EXECZ;
3010 default:
3011 llvm_unreachable("invalid branch predicate");
3012 }
3013}
3014
3015SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3016 switch (Opcode) {
3017 case AMDGPU::S_CBRANCH_SCC0:
3018 return SCC_FALSE;
3019 case AMDGPU::S_CBRANCH_SCC1:
3020 return SCC_TRUE;
3021 case AMDGPU::S_CBRANCH_VCCNZ:
3022 return VCCNZ;
3023 case AMDGPU::S_CBRANCH_VCCZ:
3024 return VCCZ;
3025 case AMDGPU::S_CBRANCH_EXECNZ:
3026 return EXECNZ;
3027 case AMDGPU::S_CBRANCH_EXECZ:
3028 return EXECZ;
3029 default:
3030 return INVALID_BR;
3031 }
3032}
3033
3034bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3035 MachineBasicBlock::iterator I,
3036 MachineBasicBlock *&TBB,
3037 MachineBasicBlock *&FBB,
3038 SmallVectorImpl<MachineOperand> &Cond,
3039 bool AllowModify) const {
3040 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3041 // Unconditional Branch
3042 TBB = I->getOperand(0).getMBB();
3043 return false;
3044 }
3045
3046 MachineBasicBlock *CondBB = nullptr;
3047
3048 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
3049 CondBB = I->getOperand(1).getMBB();
3050 Cond.push_back(I->getOperand(0));
3051 } else {
3052 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3053 if (Pred == INVALID_BR)
3054 return true;
3055
3056 CondBB = I->getOperand(0).getMBB();
3057 Cond.push_back(MachineOperand::CreateImm(Pred));
3058 Cond.push_back(I->getOperand(1)); // Save the branch register.
3059 }
3060 ++I;
3061
3062 if (I == MBB.end()) {
3063 // Conditional branch followed by fall-through.
3064 TBB = CondBB;
3065 return false;
3066 }
3067
3068 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3069 TBB = CondBB;
3070 FBB = I->getOperand(0).getMBB();
3071 return false;
3072 }
3073
3074 return true;
3075}
3076
3077bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3078 MachineBasicBlock *&FBB,
3079 SmallVectorImpl<MachineOperand> &Cond,
3080 bool AllowModify) const {
3081 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3082 auto E = MBB.end();
3083 if (I == E)
3084 return false;
3085
3086 // Skip over the instructions that are artificially terminators for special
3087 // exec management.
3088 while (I != E && !I->isBranch() && !I->isReturn()) {
3089 switch (I->getOpcode()) {
3090 case AMDGPU::S_MOV_B64_term:
3091 case AMDGPU::S_XOR_B64_term:
3092 case AMDGPU::S_OR_B64_term:
3093 case AMDGPU::S_ANDN2_B64_term:
3094 case AMDGPU::S_AND_B64_term:
3095 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3096 case AMDGPU::S_MOV_B32_term:
3097 case AMDGPU::S_XOR_B32_term:
3098 case AMDGPU::S_OR_B32_term:
3099 case AMDGPU::S_ANDN2_B32_term:
3100 case AMDGPU::S_AND_B32_term:
3101 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3102 break;
3103 case AMDGPU::SI_IF:
3104 case AMDGPU::SI_ELSE:
3105 case AMDGPU::SI_KILL_I1_TERMINATOR:
3106 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3107 // FIXME: It's messy that these need to be considered here at all.
3108 return true;
3109 default:
3110 llvm_unreachable("unexpected non-branch terminator inst");
3111 }
3112
3113 ++I;
3114 }
3115
3116 if (I == E)
3117 return false;
3118
3119 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3120}
3121
3122unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3123 int *BytesRemoved) const {
3124 unsigned Count = 0;
3125 unsigned RemovedSize = 0;
3126 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3127 // Skip over artificial terminators when removing instructions.
3128 if (MI.isBranch() || MI.isReturn()) {
3129 RemovedSize += getInstSizeInBytes(MI);
3130 MI.eraseFromParent();
3131 ++Count;
3132 }
3133 }
3134
3135 if (BytesRemoved)
3136 *BytesRemoved = RemovedSize;
3137
3138 return Count;
3139}
3140
3141// Copy the flags onto the implicit condition register operand.
3142static void preserveCondRegFlags(MachineOperand &CondReg,
3143 const MachineOperand &OrigCond) {
3144 CondReg.setIsUndef(OrigCond.isUndef());
3145 CondReg.setIsKill(OrigCond.isKill());
3146}
3147
3148unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3149 MachineBasicBlock *TBB,
3150 MachineBasicBlock *FBB,
3151 ArrayRef<MachineOperand> Cond,
3152 const DebugLoc &DL,
3153 int *BytesAdded) const {
3154 if (!FBB && Cond.empty()) {
3155 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3156 .addMBB(TBB);
3157 if (BytesAdded)
3158 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3159 return 1;
3160 }
3161
3162 if(Cond.size() == 1 && Cond[0].isReg()) {
3163 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
3164 .add(Cond[0])
3165 .addMBB(TBB);
3166 return 1;
3167 }
3168
3169 assert(TBB && Cond[0].isImm());
3170
3171 unsigned Opcode
3172 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3173
3174 if (!FBB) {
3175 MachineInstr *CondBr =
3176 BuildMI(&MBB, DL, get(Opcode))
3177 .addMBB(TBB);
3178
3179 // Copy the flags onto the implicit condition register operand.
3180 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3181 fixImplicitOperands(*CondBr);
3182
3183 if (BytesAdded)
3184 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3185 return 1;
3186 }
3187
3188 assert(TBB && FBB);
3189
3190 MachineInstr *CondBr =
3191 BuildMI(&MBB, DL, get(Opcode))
3192 .addMBB(TBB);
3193 fixImplicitOperands(*CondBr);
3194 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3195 .addMBB(FBB);
3196
3197 MachineOperand &CondReg = CondBr->getOperand(1);
3198 CondReg.setIsUndef(Cond[1].isUndef());
3199 CondReg.setIsKill(Cond[1].isKill());
3200
3201 if (BytesAdded)
3202 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3203
3204 return 2;
3205}
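// The byte counts reported above assume 4 bytes per s_branch / s_cbranch_*
// and are doubled on subtargets with the offset-0x3f hardware bug, where an
// extra instruction may be needed to keep branches off the problematic
// offset.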
3206
3207bool SIInstrInfo::reverseBranchCondition(
3208 SmallVectorImpl<MachineOperand> &Cond) const {
3209 if (Cond.size() != 2) {
3210 return true;
3211 }
3212
3213 if (Cond[0].isImm()) {
3214 Cond[0].setImm(-Cond[0].getImm());
3215 return false;
3216 }
3217
3218 return true;
3219}
3220
3221bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3222 ArrayRef<MachineOperand> Cond,
3223 Register DstReg, Register TrueReg,
3224 Register FalseReg, int &CondCycles,
3225 int &TrueCycles, int &FalseCycles) const {
3226 switch (Cond[0].getImm()) {
3227 case VCCNZ:
3228 case VCCZ: {
3229 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3230 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3231 if (MRI.getRegClass(FalseReg) != RC)
3232 return false;
3233
3234 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3235 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3236
3237 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3238 return RI.hasVGPRs(RC) && NumInsts <= 6;
3239 }
3240 case SCC_TRUE:
3241 case SCC_FALSE: {
3242 // FIXME: We could insert for VGPRs if we could replace the original compare
3243 // with a vector one.
3244 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3245 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3246 if (MRI.getRegClass(FalseReg) != RC)
3247 return false;
3248
3249 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3250
3251 // Multiples of 8 can do s_cselect_b64
3252 if (NumInsts % 2 == 0)
3253 NumInsts /= 2;
3254
3255 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3256 return RI.isSGPRClass(RC);
3257 }
3258 default:
3259 return false;
3260 }
3261}
3262
3263void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3264 MachineBasicBlock::iterator I, const DebugLoc &DL,
3265 Register DstReg, ArrayRef<MachineOperand> Cond,
3266 Register TrueReg, Register FalseReg) const {
3267 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3268 if (Pred == VCCZ || Pred == SCC_FALSE) {
3269 Pred = static_cast<BranchPredicate>(-Pred);
3270 std::swap(TrueReg, FalseReg);
3271 }
3272
3273 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3274 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3275 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3276
3277 if (DstSize == 32) {
3278 MachineInstr *Select;
3279 if (Pred == SCC_TRUE) {
3280 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3281 .addReg(TrueReg)
3282 .addReg(FalseReg);
3283 } else {
3284 // Instruction's operands are backwards from what is expected.
3285 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3286 .addReg(FalseReg)
3287 .addReg(TrueReg);
3288 }
3289
3290 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3291 return;
3292 }
3293
3294 if (DstSize == 64 && Pred == SCC_TRUE) {
3295 MachineInstr *Select =
3296 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3297 .addReg(TrueReg)
3298 .addReg(FalseReg);
3299
3300 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3301 return;
3302 }
3303
3304 static const int16_t Sub0_15[] = {
3305 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3306 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3307 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3308 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3309 };
3310
3311 static const int16_t Sub0_15_64[] = {
3312 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3313 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3314 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3315 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3316 };
3317
3318 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3319 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3320 const int16_t *SubIndices = Sub0_15;
3321 int NElts = DstSize / 32;
3322
3323 // 64-bit select is only available for SALU.
3324 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3325 if (Pred == SCC_TRUE) {
3326 if (NElts % 2) {
3327 SelOp = AMDGPU::S_CSELECT_B32;
3328 EltRC = &AMDGPU::SGPR_32RegClass;
3329 } else {
3330 SelOp = AMDGPU::S_CSELECT_B64;
3331 EltRC = &AMDGPU::SGPR_64RegClass;
3332 SubIndices = Sub0_15_64;
3333 NElts /= 2;
3334 }
3335 }
3336
3337 MachineInstrBuilder MIB = BuildMI(
3338 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3339
3340 I = MIB->getIterator();
3341
3342 SmallVector<Register, 8> Regs;
3343 for (int Idx = 0; Idx != NElts; ++Idx) {
3344 Register DstElt = MRI.createVirtualRegister(EltRC);
3345 Regs.push_back(DstElt);
3346
3347 unsigned SubIdx = SubIndices[Idx];
3348
3349 MachineInstr *Select;
3350 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3351 Select =
3352 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3353 .addReg(FalseReg, 0, SubIdx)
3354 .addReg(TrueReg, 0, SubIdx);
3355 } else {
3356 Select =
3357 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3358 .addReg(TrueReg, 0, SubIdx)
3359 .addReg(FalseReg, 0, SubIdx);
3360 }
3361
3362 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3364
3365 MIB.addReg(DstElt)
3366 .addImm(SubIdx);
3367 }
3368}
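// As a sketch, selecting between two 128-bit VGPR tuples on a divergent VCC
// condition expands into four per-lane selects feeding a REG_SEQUENCE:
//   %e0 = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   ...
//   %e3 = V_CNDMASK_B32_e32 %false.sub3, %true.sub3, implicit $vcc
//   %dst = REG_SEQUENCE %e0, sub0, %e1, sub1, %e2, sub2, %e3, sub3
// A uniform SCC condition on SGPRs instead uses s_cselect_b32 / s_cselect_b64
// pieces, pairing sub-registers where the element count is even.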
3369
3371 switch (MI.getOpcode()) {
3372 case AMDGPU::V_MOV_B32_e32:
3373 case AMDGPU::V_MOV_B32_e64:
3374 case AMDGPU::V_MOV_B64_PSEUDO:
3375 case AMDGPU::V_MOV_B64_e32:
3376 case AMDGPU::V_MOV_B64_e64:
3377 case AMDGPU::S_MOV_B32:
3378 case AMDGPU::S_MOV_B64:
3379 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3380 case AMDGPU::COPY:
3381 case AMDGPU::WWM_COPY:
3382 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3383 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3384 case AMDGPU::V_ACCVGPR_MOV_B32:
3385 return true;
3386 default:
3387 return false;
3388 }
3389}
3390
3391static constexpr unsigned ModifierOpNames[] = {
3392 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3393 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3394 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3395
3397 unsigned Opc = MI.getOpcode();
3398 for (unsigned Name : reverse(ModifierOpNames)) {
3400 if (Idx >= 0)
3401 MI.removeOperand(Idx);
3402 }
3403}
3404
3406 Register Reg, MachineRegisterInfo *MRI) const {
3407 if (!MRI->hasOneNonDBGUse(Reg))
3408 return false;
3409
3410 switch (DefMI.getOpcode()) {
3411 default:
3412 return false;
3413 case AMDGPU::V_MOV_B64_e32:
3414 case AMDGPU::S_MOV_B64:
3415 case AMDGPU::V_MOV_B64_PSEUDO:
3416 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3417 case AMDGPU::V_MOV_B32_e32:
3418 case AMDGPU::S_MOV_B32:
3419 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3420 break;
3421 }
3422
3423 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3424 assert(ImmOp);
3425 // FIXME: We could handle FrameIndex values here.
3426 if (!ImmOp->isImm())
3427 return false;
3428
3429 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3430 int64_t Imm = ImmOp->getImm();
3431 switch (UseOp.getSubReg()) {
3432 default:
3433 return Imm;
3434 case AMDGPU::sub0:
3435 return Lo_32(Imm);
3436 case AMDGPU::sub1:
3437 return Hi_32(Imm);
3438 case AMDGPU::lo16:
3439 return APInt(16, Imm).getSExtValue();
3440 case AMDGPU::hi16:
3441 return APInt(32, Imm).ashr(16).getSExtValue();
3442 case AMDGPU::sub1_lo16:
3443 return APInt(16, Hi_32(Imm)).getSExtValue();
3444 case AMDGPU::sub1_hi16:
3445 return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
3446 }
3447 };
3448
3449 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3450
3451 unsigned Opc = UseMI.getOpcode();
3452 if (Opc == AMDGPU::COPY) {
3453 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3454
3455 Register DstReg = UseMI.getOperand(0).getReg();
3456 unsigned OpSize = getOpSize(UseMI, 0);
3457 bool Is16Bit = OpSize == 2;
3458 bool Is64Bit = OpSize == 8;
3459 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3460 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3461 : AMDGPU::V_MOV_B32_e32
3462 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3463 : AMDGPU::S_MOV_B32;
3464 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
3465
3466 if (RI.isAGPR(*MRI, DstReg)) {
3467 if (Is64Bit || !isInlineConstant(Imm))
3468 return false;
3469 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3470 }
3471
3472 if (Is16Bit) {
3473 if (isVGPRCopy)
3474 return false; // Do not clobber vgpr_hi16
3475
3476 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3477 return false;
3478
3479 UseMI.getOperand(0).setSubReg(0);
3480 if (DstReg.isPhysical()) {
3481 DstReg = RI.get32BitRegister(DstReg);
3482 UseMI.getOperand(0).setReg(DstReg);
3483 }
3484 assert(UseMI.getOperand(1).getReg().isVirtual());
3485 }
3486
3487 const MCInstrDesc &NewMCID = get(NewOpc);
3488 if (DstReg.isPhysical() &&
3489 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3490 return false;
3491
3492 UseMI.setDesc(NewMCID);
3493 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3494 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3495 return true;
3496 }
3497
3498 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3499 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3500 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3501 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3502 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3503 // Don't fold if we are using source or output modifiers. The new VOP2
3504 // instructions don't have them.
3506 return false;
3507
3508 // If this is a free constant, there's no reason to do this.
3509 // TODO: We could fold this here instead of letting SIFoldOperands do it
3510 // later.
3511 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3512
3513 // Any src operand can be used for the legality check.
3514 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3515 return false;
3516
3517 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3518 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3519 bool IsFMA =
3520 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3521 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3522 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3523 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3524 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3525
3526 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3527 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3528 (Src1->isReg() && Src1->getReg() == Reg)) {
3529 MachineOperand *RegSrc =
3530 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3531 if (!RegSrc->isReg())
3532 return false;
3533 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3534 ST.getConstantBusLimit(Opc) < 2)
3535 return false;
3536
3537 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3538 return false;
3539
3540 // If src2 is also a literal constant then we have to choose which one to
3541 // fold. In general it is better to choose madak so that the other literal
3542 // can be materialized in an sgpr instead of a vgpr:
3543 // s_mov_b32 s0, literal
3544 // v_madak_f32 v0, s0, v0, literal
3545 // Instead of:
3546 // v_mov_b32 v1, literal
3547 // v_madmk_f32 v0, v0, literal, v1
3548 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3549 if (Def && Def->isMoveImmediate() &&
3550 !isInlineConstant(Def->getOperand(1)))
3551 return false;
3552
3553 unsigned NewOpc =
3554 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3555 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3556 : AMDGPU::V_FMAMK_F16)
3557 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3558 if (pseudoToMCOpcode(NewOpc) == -1)
3559 return false;
3560
3561 // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3562 // would also require restricting their register classes. For now
3563 // just bail out.
3564 if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3565 return false;
3566
3567 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3568
3569 // FIXME: This would be a lot easier if we could return a new instruction
3570 // instead of having to modify in place.
3571
3572 Register SrcReg = RegSrc->getReg();
3573 unsigned SrcSubReg = RegSrc->getSubReg();
3574 Src0->setReg(SrcReg);
3575 Src0->setSubReg(SrcSubReg);
3576 Src0->setIsKill(RegSrc->isKill());
3577
3578 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3579 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3580 Opc == AMDGPU::V_FMAC_F16_e64)
3581 UseMI.untieRegOperand(
3582 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3583
3584 Src1->ChangeToImmediate(Imm);
3585
3586 removeModOperands(UseMI);
3587 UseMI.setDesc(get(NewOpc));
3588
3589 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3590 if (DeleteDef)
3591 DefMI.eraseFromParent();
3592
3593 return true;
3594 }
3595
3596 // Added part is the constant: Use v_madak_{f16, f32}.
3597 if (Src2->isReg() && Src2->getReg() == Reg) {
3598 if (ST.getConstantBusLimit(Opc) < 2) {
3599 // Not allowed to use constant bus for another operand.
3600 // We can however allow an inline immediate as src0.
3601 bool Src0Inlined = false;
3602 if (Src0->isReg()) {
3603 // Try to inline constant if possible.
3604 // If the def is a move of an immediate and this is its only use,
3605 // we are saving a VGPR here.
3606 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3607 if (Def && Def->isMoveImmediate() &&
3608 isInlineConstant(Def->getOperand(1)) &&
3609 MRI->hasOneUse(Src0->getReg())) {
3610 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3611 Src0Inlined = true;
3612 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3613 RI.isSGPRReg(*MRI, Src0->getReg())) {
3614 return false;
3615 }
3616 // VGPR is okay as Src0 - fallthrough
3617 }
3618
3619 if (Src1->isReg() && !Src0Inlined) {
3620 // We have one slot for inlinable constant so far - try to fill it
3621 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3622 if (Def && Def->isMoveImmediate() &&
3623 isInlineConstant(Def->getOperand(1)) &&
3624 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3625 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3626 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3627 return false;
3628 // VGPR is okay as Src1 - fallthrough
3629 }
3630 }
3631
3632 unsigned NewOpc =
3633 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3634 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3635 : AMDGPU::V_FMAAK_F16)
3636 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3637 if (pseudoToMCOpcode(NewOpc) == -1)
3638 return false;
3639
3640 // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3641 // would also require restricting their register classes. For now
3642 // just bail out.
3643 if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3644 return false;
3645
3646 // FIXME: This would be a lot easier if we could return a new instruction
3647 // instead of having to modify in place.
3648
3649 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3650 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3651 Opc == AMDGPU::V_FMAC_F16_e64)
3652 UseMI.untieRegOperand(
3653 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3654
3655 // ChangingToImmediate adds Src2 back to the instruction.
3656 Src2->ChangeToImmediate(getImmFor(*Src2));
3657
3658 // These come before src2.
3659 removeModOperands(UseMI);
3660 UseMI.setDesc(get(NewOpc));
3661 // It might happen that UseMI was commuted and we now have an SGPR as
3662 // src1. If so, two inline constants plus an SGPR would be illegal, so
3663 // re-legalize the operands.
3664 legalizeOperandsVOP2(*MRI, UseMI);
3665
3666 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3667 if (DeleteDef)
3668 DefMI.eraseFromParent();
3669
3670 return true;
3671 }
3672 }
3673
3674 return false;
3675}
3676
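/// Return true if the two base-operand lists have the same length and each
/// pair of corresponding operands is identical.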
3677static bool
3678memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3679 ArrayRef<const MachineOperand *> BaseOps2) {
3680 if (BaseOps1.size() != BaseOps2.size())
3681 return false;
3682 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3683 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3684 return false;
3685 }
3686 return true;
3687}
3688
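/// Return true if two accesses with the given widths and offsets from a common
/// base are known not to overlap, i.e. the lower access has a known width that
/// ends at or before the start of the higher one.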
3689static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3690 LocationSize WidthB, int OffsetB) {
3691 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3692 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3693 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3694 return LowWidth.hasValue() &&
3695 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3696}
3697
3698bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3699 const MachineInstr &MIb) const {
3700 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3701 int64_t Offset0, Offset1;
3702 LocationSize Dummy0 = 0, Dummy1 = 0;
3703 bool Offset0IsScalable, Offset1IsScalable;
3704 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3705 Dummy0, &RI) ||
3706 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3707 Dummy1, &RI))
3708 return false;
3709
3710 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3711 return false;
3712
3713 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3714 // FIXME: Handle ds_read2 / ds_write2.
3715 return false;
3716 }
3717 LocationSize Width0 = MIa.memoperands().front()->getSize();
3718 LocationSize Width1 = MIb.memoperands().front()->getSize();
3719 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3720}
3721
3722bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3723 const MachineInstr &MIb) const {
3724 assert(MIa.mayLoadOrStore() &&
3725 "MIa must load from or modify a memory location");
3726 assert(MIb.mayLoadOrStore() &&
3727 "MIb must load from or modify a memory location");
3728
3729 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3730 return false;
3731
3732 // XXX - Can we relax this between address spaces?
3733 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3734 return false;
3735
3736 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3737 return false;
3738
3739 // TODO: Should we check the address space from the MachineMemOperand? That
3740 // would allow us to distinguish objects we know don't alias based on the
3741 // underlying address space, even if it was lowered to a different one,
3742 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3743 // buffer.
3744 if (isDS(MIa)) {
3745 if (isDS(MIb))
3746 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3747
3748 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3749 }
3750
3751 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3752 if (isMUBUF(MIb) || isMTBUF(MIb))
3753 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3754
3755 if (isFLAT(MIb))
3756 return isFLATScratch(MIb);
3757
3758 return !isSMRD(MIb);
3759 }
3760
3761 if (isSMRD(MIa)) {
3762 if (isSMRD(MIb))
3763 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3764
3765 if (isFLAT(MIb))
3766 return isFLATScratch(MIb);
3767
3768 return !isMUBUF(MIb) && !isMTBUF(MIb);
3769 }
3770
3771 if (isFLAT(MIa)) {
3772 if (isFLAT(MIb)) {
3773 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3774 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3775 return true;
3776
3777 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3778 }
3779
3780 return false;
3781 }
3782
3783 return false;
3784}
3785
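/// If \p Reg is a virtual register whose unique definition is a foldable copy
/// of an immediate, return true and report the immediate in \p Imm (and the
/// defining instruction in \p DefMI if requested).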
3786static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3787 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3788 if (Reg.isPhysical())
3789 return false;
3790 auto *Def = MRI.getUniqueVRegDef(Reg);
3791 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3792 Imm = Def->getOperand(1).getImm();
3793 if (DefMI)
3794 *DefMI = Def;
3795 return true;
3796 }
3797 return false;
3798}
3799
3800static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3801 MachineInstr **DefMI = nullptr) {
3802 if (!MO->isReg())
3803 return false;
3804 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3805 const MachineRegisterInfo &MRI = MF->getRegInfo();
3806 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3807}
3808
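/// Transfer LiveVariables kill information from \p MI to \p NewMI when \p MI
/// is being replaced.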
3809static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3810 MachineInstr &NewMI) {
3811 if (LV) {
3812 unsigned NumOps = MI.getNumOperands();
3813 for (unsigned I = 1; I < NumOps; ++I) {
3814 MachineOperand &Op = MI.getOperand(I);
3815 if (Op.isReg() && Op.isKill())
3816 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3817 }
3818 }
3819}
3820
3821MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3822 LiveVariables *LV,
3823 LiveIntervals *LIS) const {
3824 MachineBasicBlock &MBB = *MI.getParent();
3825 unsigned Opc = MI.getOpcode();
3826
3827 // Handle MFMA.
3828 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3829 if (NewMFMAOpc != -1) {
3830 MachineInstrBuilder MIB =
3831 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3832 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3833 MIB.add(MI.getOperand(I));
3834 updateLiveVariables(LV, MI, *MIB);
3835 if (LIS) {
3836 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3837 // SlotIndex of defs needs to be updated when converting to early-clobber
3838 MachineOperand &Def = MIB->getOperand(0);
3839 if (Def.isEarlyClobber() && Def.isReg() &&
3840 LIS->hasInterval(Def.getReg())) {
3841 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3842 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3843 auto &LI = LIS->getInterval(Def.getReg());
3844 auto UpdateDefIndex = [&](LiveRange &LR) {
3845 auto S = LR.find(OldIndex);
3846 if (S != LR.end() && S->start == OldIndex) {
3847 assert(S->valno && S->valno->def == OldIndex);
3848 S->start = NewIndex;
3849 S->valno->def = NewIndex;
3850 }
3851 };
3852 UpdateDefIndex(LI);
3853 for (auto &SR : LI.subranges())
3854 UpdateDefIndex(SR);
3855 }
3856 }
3857 return MIB;
3858 }
3859
3860 if (SIInstrInfo::isWMMA(MI)) {
3861 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3862 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3863 .setMIFlags(MI.getFlags());
3864 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3865 MIB->addOperand(MI.getOperand(I));
3866
3867 updateLiveVariables(LV, MI, *MIB);
3868 if (LIS)
3869 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3870
3871 return MIB;
3872 }
3873
3874 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3875 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3876 "pre-RA");
3877
3878 // Handle MAC/FMAC.
3879 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3880 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3881 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3882 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3883 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3884 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3885 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3886 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3887 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3888 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3889 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3890 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3891 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3892 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3893 bool Src0Literal = false;
3894
3895 switch (Opc) {
3896 default:
3897 return nullptr;
3898 case AMDGPU::V_MAC_F16_e64:
3899 case AMDGPU::V_FMAC_F16_e64:
3900 case AMDGPU::V_FMAC_F16_t16_e64:
3901 case AMDGPU::V_MAC_F32_e64:
3902 case AMDGPU::V_MAC_LEGACY_F32_e64:
3903 case AMDGPU::V_FMAC_F32_e64:
3904 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3905 case AMDGPU::V_FMAC_F64_e64:
3906 break;
3907 case AMDGPU::V_MAC_F16_e32:
3908 case AMDGPU::V_FMAC_F16_e32:
3909 case AMDGPU::V_MAC_F32_e32:
3910 case AMDGPU::V_MAC_LEGACY_F32_e32:
3911 case AMDGPU::V_FMAC_F32_e32:
3912 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3913 case AMDGPU::V_FMAC_F64_e32: {
3914 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3915 AMDGPU::OpName::src0);
3916 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3917 if (!Src0->isReg() && !Src0->isImm())
3918 return nullptr;
3919
3920 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3921 Src0Literal = true;
3922
3923 break;
3924 }
3925 }
3926
3927 MachineInstrBuilder MIB;
3928 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3929 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3930 const MachineOperand *Src0Mods =
3931 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3932 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3933 const MachineOperand *Src1Mods =
3934 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3935 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3936 const MachineOperand *Src2Mods =
3937 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3938 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3939 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3940 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3941
3942 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3943 !IsLegacy &&
3944 // If we have an SGPR input, we will violate the constant bus restriction.
3945 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3946 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3947 MachineInstr *DefMI;
3948 const auto killDef = [&]() -> void {
3949 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3950 // The only user is the instruction which will be killed.
3951 Register DefReg = DefMI->getOperand(0).getReg();
3952 if (!MRI.hasOneNonDBGUse(DefReg))
3953 return;
3954 // We cannot just remove the DefMI here, calling pass will crash.
3955 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3956 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3957 DefMI->removeOperand(I);
3958 if (LV)
3959 LV->getVarInfo(DefReg).AliveBlocks.clear();
3960 };
3961
3962 int64_t Imm;
3963 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3964 unsigned NewOpc =
3965 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3966 : AMDGPU::V_FMAAK_F16)
3967 : AMDGPU::V_FMAAK_F32)
3968 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3969 if (pseudoToMCOpcode(NewOpc) != -1) {
3970 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3971 .add(*Dst)
3972 .add(*Src0)
3973 .add(*Src1)
3974 .addImm(Imm);
3975 updateLiveVariables(LV, MI, *MIB);
3976 if (LIS)
3977 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3978 killDef();
3979 return MIB;
3980 }
3981 }
3982 unsigned NewOpc =
3983 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3984 : AMDGPU::V_FMAMK_F16)
3985 : AMDGPU::V_FMAMK_F32)
3986 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3987 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3988 if (pseudoToMCOpcode(NewOpc) != -1) {
3989 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3990 .add(*Dst)
3991 .add(*Src0)
3992 .addImm(Imm)
3993 .add(*Src2);
3994 updateLiveVariables(LV, MI, *MIB);
3995 if (LIS)
3996 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3997 killDef();
3998 return MIB;
3999 }
4000 }
4001 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4002 if (Src0Literal) {
4003 Imm = Src0->getImm();
4004 DefMI = nullptr;
4005 }
4006 if (pseudoToMCOpcode(NewOpc) != -1 &&
4007 isOperandLegal(
4008 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4009 Src1)) {
4010 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4011 .add(*Dst)
4012 .add(*Src1)
4013 .addImm(Imm)
4014 .add(*Src2);
4015 updateLiveVariables(LV, MI, *MIB);
4016 if (LIS)
4017 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4018 if (DefMI)
4019 killDef();
4020 return MIB;
4021 }
4022 }
4023 }
4024
4025 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4026 // if VOP3 does not allow a literal operand.
4027 if (Src0Literal && !ST.hasVOP3Literal())
4028 return nullptr;
4029
4030 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4031 : IsF64 ? AMDGPU::V_FMA_F64_e64
4032 : IsLegacy
4033 ? AMDGPU::V_FMA_LEGACY_F32_e64
4034 : AMDGPU::V_FMA_F32_e64
4035 : IsF16 ? AMDGPU::V_MAD_F16_e64
4036 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4037 : AMDGPU::V_MAD_F32_e64;
4038 if (pseudoToMCOpcode(NewOpc) == -1)
4039 return nullptr;
4040
4041 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4042 .add(*Dst)
4043 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4044 .add(*Src0)
4045 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4046 .add(*Src1)
4047 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4048 .add(*Src2)
4049 .addImm(Clamp ? Clamp->getImm() : 0)
4050 .addImm(Omod ? Omod->getImm() : 0);
4051 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4052 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4053 updateLiveVariables(LV, MI, *MIB);
4054 if (LIS)
4055 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4056 return MIB;
4057}
4058
4059// It's not generally safe to move VALU instructions across these since it will
4060// start using the register as a base index rather than directly.
4061// XXX - Why isn't hasSideEffects sufficient for these?
4062static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4063 switch (MI.getOpcode()) {
4064 case AMDGPU::S_SET_GPR_IDX_ON:
4065 case AMDGPU::S_SET_GPR_IDX_MODE:
4066 case AMDGPU::S_SET_GPR_IDX_OFF:
4067 return true;
4068 default:
4069 return false;
4070 }
4071}
4072
4073bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4074 const MachineBasicBlock *MBB,
4075 const MachineFunction &MF) const {
4076 // Skipping the check for SP writes in the base implementation. The reason it
4077 // was added was apparently due to compile time concerns.
4078 //
4079 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4080 // but is probably avoidable.
4081
4082 // Copied from base implementation.
4083 // Terminators and labels can't be scheduled around.
4084 if (MI.isTerminator() || MI.isPosition())
4085 return true;
4086
4087 // INLINEASM_BR can jump to another block
4088 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4089 return true;
4090
4091 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4092 return true;
4093
4094 // Target-independent instructions do not have an implicit-use of EXEC, even
4095 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4096 // boundaries prevents incorrect movements of such instructions.
4097 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4098 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4099 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4100 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4101 changesVGPRIndexingMode(MI);
4102}
4103
4104bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4105 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4106}
4107
4108bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4109 // Skip the full operand and register alias search modifiesRegister
4110 // does. There's only a handful of instructions that touch this, it's only an
4111 // implicit def, and doesn't alias any other registers.
4112 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4113}
4114
4115bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4116 unsigned Opcode = MI.getOpcode();
4117
4118 if (MI.mayStore() && isSMRD(MI))
4119 return true; // scalar store or atomic
4120
4121 // This will terminate the function when other lanes may need to continue.
4122 if (MI.isReturn())
4123 return true;
4124
4125 // These instructions cause shader I/O that may cause hardware lockups
4126 // when executed with an empty EXEC mask.
4127 //
4128 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4129 // EXEC = 0, but checking for that case here seems not worth it
4130 // given the typical code patterns.
4131 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4132 isEXP(Opcode) ||
4133 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
4134 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
4135 return true;
4136
4137 if (MI.isCall() || MI.isInlineAsm())
4138 return true; // conservative assumption
4139
4140 // A mode change is a scalar operation that influences vector instructions.
4141 if (modifiesModeRegister(MI))
4142 return true;
4143
4144 // These are like SALU instructions in terms of effects, so it's questionable
4145 // whether we should return true for those.
4146 //
4147 // However, executing them with EXEC = 0 causes them to operate on undefined
4148 // data, which we avoid by returning true here.
4149 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4150 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4151 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4152 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4153 return true;
4154
4155 return false;
4156}
4157
4158bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4159 const MachineInstr &MI) const {
4160 if (MI.isMetaInstruction())
4161 return false;
4162
4163 // This won't read exec if this is an SGPR->SGPR copy.
4164 if (MI.isCopyLike()) {
4165 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4166 return true;
4167
4168 // Make sure this isn't copying exec as a normal operand
4169 return MI.readsRegister(AMDGPU::EXEC, &RI);
4170 }
4171
4172 // Make a conservative assumption about the callee.
4173 if (MI.isCall())
4174 return true;
4175
4176 // Be conservative with any unhandled generic opcodes.
4177 if (!isTargetSpecificOpcode(MI.getOpcode()))
4178 return true;
4179
4180 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4181}
4182
4183bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4184 switch (Imm.getBitWidth()) {
4185 case 1: // This likely will be a condition code mask.
4186 return true;
4187
4188 case 32:
4189 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4190 ST.hasInv2PiInlineImm());
4191 case 64:
4192 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4193 ST.hasInv2PiInlineImm());
4194 case 16:
4195 return ST.has16BitInsts() &&
4196 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4197 ST.hasInv2PiInlineImm());
4198 default:
4199 llvm_unreachable("invalid bitwidth");
4200 }
4201}
4202
4203bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4204 APInt IntImm = Imm.bitcastToAPInt();
4205 int64_t IntImmVal = IntImm.getSExtValue();
4206 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4207 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4208 default:
4209 llvm_unreachable("invalid fltSemantics");
4210 case APFloatBase::S_IEEEsingle:
4211 case APFloatBase::S_IEEEdouble:
4212 return isInlineConstant(IntImm);
4213 case APFloatBase::S_BFloat:
4214 return ST.has16BitInsts() &&
4215 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4216 case APFloatBase::S_IEEEhalf:
4217 return ST.has16BitInsts() &&
4218 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4219 }
4220}
4221
4222bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4223 uint8_t OperandType) const {
4224 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4225 if (!MO.isImm())
4226 return false;
4227
4228 // MachineOperand provides no way to tell the true operand size, since it only
4229 // records a 64-bit value. We need to know the size to determine if a 32-bit
4230 // floating point immediate bit pattern is legal for an integer immediate. It
4231 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4232
4233 int64_t Imm = MO.getImm();
4234 switch (OperandType) {
4247 int32_t Trunc = static_cast<int32_t>(Imm);
4248 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4249 }
4250 case AMDGPU::OPERAND_REG_IMM_INT64:
4251 case AMDGPU::OPERAND_REG_IMM_FP64:
4252 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4253 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4254 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4255 return AMDGPU::isInlinableLiteral64(MO.getImm(),
4256 ST.hasInv2PiInlineImm());
4257 case AMDGPU::OPERAND_REG_IMM_INT16:
4258 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4259 case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
4260 // We would expect inline immediates to not be concerned with an integer/fp
4261 // distinction. However, in the case of 16-bit integer operations, the
4262 // "floating point" values appear to not work. It seems read the low 16-bits
4263 // of 32-bit immediates, which happens to always work for the integer
4264 // values.
4265 //
4266 // See llvm bugzilla 46302.
4267 //
4268 // TODO: Theoretically we could use op-sel to use the high bits of the
4269 // 32-bit FP values.
4287 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4288 // A few special case instructions have 16-bit operands on subtargets
4289 // where 16-bit instructions are not legal.
4290 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4291 // constants in these cases
4292 int16_t Trunc = static_cast<int16_t>(Imm);
4293 return ST.has16BitInsts() &&
4294 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4295 }
4296
4297 return false;
4298 }
4303 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4304 int16_t Trunc = static_cast<int16_t>(Imm);
4305 return ST.has16BitInsts() &&
4306 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4307 }
4308 return false;
4309 }
4312 return false;
4315 // Always embedded in the instruction for free.
4316 return true;
4326 // Just ignore anything else.
4327 return true;
4328 default:
4329 llvm_unreachable("invalid operand type");
4330 }
4331}
4332
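/// Return true if \p Op0 and \p Op1 are operands of the same kind (register or
/// immediate) holding the same value.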
4333static bool compareMachineOp(const MachineOperand &Op0,
4334 const MachineOperand &Op1) {
4335 if (Op0.getType() != Op1.getType())
4336 return false;
4337
4338 switch (Op0.getType()) {
4339 case MachineOperand::MO_Register:
4340 return Op0.getReg() == Op1.getReg();
4341 case MachineOperand::MO_Immediate:
4342 return Op0.getImm() == Op1.getImm();
4343 default:
4344 llvm_unreachable("Didn't expect to be comparing these operand types");
4345 }
4346}
4347
4348bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4349 const MachineOperand &MO) const {
4350 const MCInstrDesc &InstDesc = MI.getDesc();
4351 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4352
4353 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4354
4356 return true;
4357
4358 if (OpInfo.RegClass < 0)
4359 return false;
4360
4361 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4362 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4363 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4364 AMDGPU::OpName::src2))
4365 return false;
4366 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4367 }
4368
4369 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4370 return false;
4371
4372 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4373 return true;
4374
4375 return ST.hasVOP3Literal();
4376}
4377
4378bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4379 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4380 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4381 return false;
4382
4383 int Op32 = AMDGPU::getVOPe32(Opcode);
4384 if (Op32 == -1)
4385 return false;
4386
4387 return pseudoToMCOpcode(Op32) != -1;
4388}
4389
4390bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4391 // The src0_modifier operand is present on all instructions
4392 // that have modifiers.
4393
4394 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4395}
4396
4397bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4398 unsigned OpName) const {
4399 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4400 return Mods && Mods->getImm();
4401}
4402
4403bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4404 return any_of(ModifierOpNames,
4405 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4406}
4407
4408bool SIInstrInfo::canShrink(const MachineInstr &MI,
4409 const MachineRegisterInfo &MRI) const {
4410 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4411 // Can't shrink instruction with three operands.
4412 if (Src2) {
4413 switch (MI.getOpcode()) {
4414 default: return false;
4415
4416 case AMDGPU::V_ADDC_U32_e64:
4417 case AMDGPU::V_SUBB_U32_e64:
4418 case AMDGPU::V_SUBBREV_U32_e64: {
4419 const MachineOperand *Src1
4420 = getNamedOperand(MI, AMDGPU::OpName::src1);
4421 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4422 return false;
4423 // Additional verification is needed for sdst/src2.
4424 return true;
4425 }
4426 case AMDGPU::V_MAC_F16_e64:
4427 case AMDGPU::V_MAC_F32_e64:
4428 case AMDGPU::V_MAC_LEGACY_F32_e64:
4429 case AMDGPU::V_FMAC_F16_e64:
4430 case AMDGPU::V_FMAC_F16_t16_e64:
4431 case AMDGPU::V_FMAC_F32_e64:
4432 case AMDGPU::V_FMAC_F64_e64:
4433 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4434 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4435 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4436 return false;
4437 break;
4438
4439 case AMDGPU::V_CNDMASK_B32_e64:
4440 break;
4441 }
4442 }
4443
4444 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4445 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4446 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4447 return false;
4448
4449 // We don't need to check src0, all input types are legal, so just make sure
4450 // src0 isn't using any modifiers.
4451 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4452 return false;
4453
4454 // Can it be shrunk to a valid 32 bit opcode?
4455 if (!hasVALU32BitEncoding(MI.getOpcode()))
4456 return false;
4457
4458 // Check output modifiers
4459 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4460 !hasModifiersSet(MI, AMDGPU::OpName::clamp);
4461}
4462
4463// Set VCC operand with all flags from \p Orig, except for setting it as
4464// implicit.
4465static void copyFlagsToImplicitVCC(MachineInstr &MI,
4466 const MachineOperand &Orig) {
4467
4468 for (MachineOperand &Use : MI.implicit_operands()) {
4469 if (Use.isUse() &&
4470 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4471 Use.setIsUndef(Orig.isUndef());
4472 Use.setIsKill(Orig.isKill());
4473 return;
4474 }
4475 }
4476}
4477
4478MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4479 unsigned Op32) const {
4480 MachineBasicBlock *MBB = MI.getParent();
4481 MachineInstrBuilder Inst32 =
4482 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
4483 .setMIFlags(MI.getFlags());
4484
4485 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4486 // For VOPC instructions, this is replaced by an implicit def of vcc.
4487 if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::vdst)) {
4488 // dst
4489 Inst32.add(MI.getOperand(0));
4490 } else if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::sdst)) {
4491 // VOPCX instructions won't be writing to an explicit dst, so this should
4492 // not fail for these instructions.
4493 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
4494 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
4495 "Unexpected case");
4496 }
4497
4498 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
4499
4500 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4501 if (Src1)
4502 Inst32.add(*Src1);
4503
4504 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4505
4506 if (Src2) {
4507 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
4508 if (Op32Src2Idx != -1) {
4509 Inst32.add(*Src2);
4510 } else {
4511 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4512 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4513 // of vcc was already added during the initial BuildMI, but we
4514 // 1) may need to change vcc to vcc_lo to preserve the original register
4515 // 2) have to preserve the original flags.
4516 fixImplicitOperands(*Inst32);
4517 copyFlagsToImplicitVCC(*Inst32, *Src2);
4518 }
4519 }
4520
4521 return Inst32;
4522}
4523
4524bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4525 const MachineOperand &MO,
4526 const MCOperandInfo &OpInfo) const {
4527 // Literal constants use the constant bus.
4528 if (!MO.isReg())
4529 return !isInlineConstant(MO, OpInfo);
4530
4531 if (!MO.isUse())
4532 return false;
4533
4534 if (MO.getReg().isVirtual())
4535 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4536
4537 // Null is free
4538 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4539 return false;
4540
4541 // SGPRs use the constant bus
4542 if (MO.isImplicit()) {
4543 return MO.getReg() == AMDGPU::M0 ||
4544 MO.getReg() == AMDGPU::VCC ||
4545 MO.getReg() == AMDGPU::VCC_LO;
4546 } else {
4547 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4548 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4549 }
4550}
4551
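/// Return the first of VCC, VCC_LO, VCC_HI, M0 or FLAT_SCR that \p MI
/// implicitly reads, or an invalid Register if there is none.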
4552static Register findImplicitSGPRRead(const MachineInstr &MI) {
4553 for (const MachineOperand &MO : MI.implicit_operands()) {
4554 // We only care about reads.
4555 if (MO.isDef())
4556 continue;
4557
4558 switch (MO.getReg()) {
4559 case AMDGPU::VCC:
4560 case AMDGPU::VCC_LO:
4561 case AMDGPU::VCC_HI:
4562 case AMDGPU::M0:
4563 case AMDGPU::FLAT_SCR:
4564 return MO.getReg();
4565
4566 default:
4567 break;
4568 }
4569 }
4570
4571 return Register();
4572}
4573
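/// Return true if the verifier should expect \p MI to carry an implicit use of
/// the EXEC mask.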
4574static bool shouldReadExec(const MachineInstr &MI) {
4575 if (SIInstrInfo::isVALU(MI)) {
4576 switch (MI.getOpcode()) {
4577 case AMDGPU::V_READLANE_B32:
4578 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4579 case AMDGPU::V_WRITELANE_B32:
4580 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4581 return false;
4582 }
4583
4584 return true;
4585 }
4586
4587 if (MI.isPreISelOpcode() ||
4588 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4589 SIInstrInfo::isSALU(MI) ||
4590 SIInstrInfo::isSMRD(MI))
4591 return false;
4592
4593 return true;
4594}
4595
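/// Return true if \p SubReg refers to a sub-register of the register (tuple)
/// described by \p SuperVec.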
4596static bool isSubRegOf(const SIRegisterInfo &TRI,
4597 const MachineOperand &SuperVec,
4598 const MachineOperand &SubReg) {
4599 if (SubReg.getReg().isPhysical())
4600 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4601
4602 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4603 SubReg.getReg() == SuperVec.getReg();
4604}
4605
4606bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4607 StringRef &ErrInfo) const {
4608 uint16_t Opcode = MI.getOpcode();
4609 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4610 return true;
4611
4612 const MachineFunction *MF = MI.getParent()->getParent();
4613 const MachineRegisterInfo &MRI = MF->getRegInfo();
4614
4615 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4616 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4617 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4618 int Src3Idx = -1;
4619 if (Src0Idx == -1) {
4620 // VOPD V_DUAL_* instructions use different operand names.
4621 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4622 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4623 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4624 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4625 }
4626
4627 // Make sure the number of operands is correct.
4628 const MCInstrDesc &Desc = get(Opcode);
4629 if (!Desc.isVariadic() &&
4630 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4631 ErrInfo = "Instruction has wrong number of operands.";
4632 return false;
4633 }
4634
4635 if (MI.isInlineAsm()) {
4636 // Verify register classes for inlineasm constraints.
4637 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4638 I != E; ++I) {
4639 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4640 if (!RC)
4641 continue;
4642
4643 const MachineOperand &Op = MI.getOperand(I);
4644 if (!Op.isReg())
4645 continue;
4646
4647 Register Reg = Op.getReg();
4648 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4649 ErrInfo = "inlineasm operand has incorrect register class.";
4650 return false;
4651 }
4652 }
4653
4654 return true;
4655 }
4656
4657 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4658 ErrInfo = "missing memory operand from image instruction.";
4659 return false;
4660 }
4661
4662 // Make sure the register classes are correct.
4663 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4664 const MachineOperand &MO = MI.getOperand(i);
4665 if (MO.isFPImm()) {
4666 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4667 "all fp values to integers.";
4668 return false;
4669 }
4670
4671 int RegClass = Desc.operands()[i].RegClass;
4672
4673 switch (Desc.operands()[i].OperandType) {
4674 case MCOI::OPERAND_REGISTER:
4675 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4676 ErrInfo = "Illegal immediate value for operand.";
4677 return false;
4678 }
4679 break;
4684 break;
4696 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4697 ErrInfo = "Illegal immediate value for operand.";
4698 return false;
4699 }
4700 break;
4701 }
4703 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4704 ErrInfo = "Expected inline constant for operand.";
4705 return false;
4706 }
4707 break;
4708 case MCOI::OPERAND_IMMEDIATE:
4709 case AMDGPU::OPERAND_KIMM32:
4710 // Check if this operand is an immediate.
4711 // FrameIndex operands will be replaced by immediates, so they are
4712 // allowed.
4713 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4714 ErrInfo = "Expected immediate, but got non-immediate";
4715 return false;
4716 }
4717 [[fallthrough]];
4718 default:
4719 continue;
4720 }
4721
4722 if (!MO.isReg())
4723 continue;
4724 Register Reg = MO.getReg();
4725 if (!Reg)
4726 continue;
4727
4728 // FIXME: Ideally we would have separate instruction definitions with the
4729 // aligned register constraint.
4730 // FIXME: We do not verify inline asm operands, but custom inline asm
4731 // verification is broken anyway
4732 if (ST.needsAlignedVGPRs()) {
4733 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4734 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4735 const TargetRegisterClass *SubRC =
4736 RI.getSubRegisterClass(RC, MO.getSubReg());
4737 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4738 if (RC)
4739 RC = SubRC;
4740 }
4741
4742 // Check that this is the aligned version of the class.
4743 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4744 ErrInfo = "Subtarget requires even aligned vector registers";
4745 return false;
4746 }
4747 }
4748
4749 if (RegClass != -1) {
4750 if (Reg.isVirtual())
4751 continue;
4752
4753 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4754 if (!RC->contains(Reg)) {
4755 ErrInfo = "Operand has incorrect register class.";
4756 return false;
4757 }
4758 }
4759 }
4760
4761 // Verify SDWA
4762 if (isSDWA(MI)) {
4763 if (!ST.hasSDWA()) {
4764 ErrInfo = "SDWA is not supported on this target";
4765 return false;
4766 }
4767
4768 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4769
4770 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4771 if (OpIdx == -1)
4772 continue;
4773 const MachineOperand &MO = MI.getOperand(OpIdx);
4774
4775 if (!ST.hasSDWAScalar()) {
4776 // Only VGPRS on VI
4777 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4778 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4779 return false;
4780 }
4781 } else {
4782 // No immediates on GFX9
4783 if (!MO.isReg()) {
4784 ErrInfo =
4785 "Only reg allowed as operands in SDWA instructions on GFX9+";
4786 return false;
4787 }
4788 }
4789 }
4790
4791 if (!ST.hasSDWAOmod()) {
4792 // No omod allowed on VI
4793 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4794 if (OMod != nullptr &&
4795 (!OMod->isImm() || OMod->getImm() != 0)) {
4796 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4797 return false;
4798 }
4799 }
4800
4801 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4802 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4803 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4804 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4805 const MachineOperand *Src0ModsMO =
4806 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4807 unsigned Mods = Src0ModsMO->getImm();
4808 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4809 Mods & SISrcMods::SEXT) {
4810 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4811 return false;
4812 }
4813 }
4814
4815 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4816 if (isVOPC(BasicOpcode)) {
4817 if (!ST.hasSDWASdst() && DstIdx != -1) {
4818 // Only vcc allowed as dst on VI for VOPC
4819 const MachineOperand &Dst = MI.getOperand(DstIdx);
4820 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4821 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4822 return false;
4823 }
4824 } else if (!ST.hasSDWAOutModsVOPC()) {
4825 // No clamp allowed on GFX9 for VOPC
4826 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4827 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4828 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4829 return false;
4830 }
4831
4832 // No omod allowed on GFX9 for VOPC
4833 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4834 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4835 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4836 return false;
4837 }
4838 }
4839 }
4840
4841 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4842 if (DstUnused && DstUnused->isImm() &&
4843 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4844 const MachineOperand &Dst = MI.getOperand(DstIdx);
4845 if (!Dst.isReg() || !Dst.isTied()) {
4846 ErrInfo = "Dst register should have tied register";
4847 return false;
4848 }
4849
4850 const MachineOperand &TiedMO =
4851 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4852 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4853 ErrInfo =
4854 "Dst register should be tied to implicit use of preserved register";
4855 return false;
4856 } else if (TiedMO.getReg().isPhysical() &&
4857 Dst.getReg() != TiedMO.getReg()) {
4858 ErrInfo = "Dst register should use same physical register as preserved";
4859 return false;
4860 }
4861 }
4862 }
4863
4864 // Verify MIMG / VIMAGE / VSAMPLE
4865 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4866 // Ensure that the return type used is large enough for all the options
4867 // being used; TFE and LWE require an extra result register.
4868 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4869 if (DMask) {
4870 uint64_t DMaskImm = DMask->getImm();
4871 uint32_t RegCount =
4872 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4873 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4874 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4875 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4876
4877 // Adjust for packed 16 bit values
4878 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4879 RegCount = divideCeil(RegCount, 2);
4880
4881 // Adjust if using LWE or TFE
4882 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4883 RegCount += 1;
4884
4885 const uint32_t DstIdx =
4886 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4887 const MachineOperand &Dst = MI.getOperand(DstIdx);
4888 if (Dst.isReg()) {
4889 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4890 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4891 if (RegCount > DstSize) {
4892 ErrInfo = "Image instruction returns too many registers for dst "
4893 "register class";
4894 return false;
4895 }
4896 }
4897 }
4898 }
4899
4900 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4901 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4902 unsigned ConstantBusCount = 0;
4903 bool UsesLiteral = false;
4904 const MachineOperand *LiteralVal = nullptr;
4905
4906 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4907 if (ImmIdx != -1) {
4908 ++ConstantBusCount;
4909 UsesLiteral = true;
4910 LiteralVal = &MI.getOperand(ImmIdx);
4911 }
4912
4913 SmallVector<Register, 2> SGPRsUsed;
4914 Register SGPRUsed;
4915
4916 // Only look at the true operands. Only a real operand can use the constant
4917 // bus, and we don't want to check pseudo-operands like the source modifier
4918 // flags.
4919 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4920 if (OpIdx == -1)
4921 continue;
4922 const MachineOperand &MO = MI.getOperand(OpIdx);
4923 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4924 if (MO.isReg()) {
4925 SGPRUsed = MO.getReg();
4926 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4927 ++ConstantBusCount;
4928 SGPRsUsed.push_back(SGPRUsed);
4929 }
4930 } else {
4931 if (!UsesLiteral) {
4932 ++ConstantBusCount;
4933 UsesLiteral = true;
4934 LiteralVal = &MO;
4935 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4936 assert(isVOP2(MI) || isVOP3(MI));
4937 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4938 return false;
4939 }
4940 }
4941 }
4942 }
4943
4944 SGPRUsed = findImplicitSGPRRead(MI);
4945 if (SGPRUsed) {
4946 // Implicit uses may safely overlap true operands
4947 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4948 return !RI.regsOverlap(SGPRUsed, SGPR);
4949 })) {
4950 ++ConstantBusCount;
4951 SGPRsUsed.push_back(SGPRUsed);
4952 }
4953 }
4954
4955 // v_writelane_b32 is an exception from constant bus restriction:
4956 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
4957 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4958 Opcode != AMDGPU::V_WRITELANE_B32) {
4959 ErrInfo = "VOP* instruction violates constant bus restriction";
4960 return false;
4961 }
4962
4963 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4964 ErrInfo = "VOP3 instruction uses literal";
4965 return false;
4966 }
4967 }
4968
4969 // Special case for writelane - this can break the multiple constant bus rule,
4970 // but still can't use more than one SGPR register
4971 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4972 unsigned SGPRCount = 0;
4973 Register SGPRUsed;
4974
4975 for (int OpIdx : {Src0Idx, Src1Idx}) {
4976 if (OpIdx == -1)
4977 break;
4978
4979 const MachineOperand &MO = MI.getOperand(OpIdx);
4980
4981 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4982 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4983 if (MO.getReg() != SGPRUsed)
4984 ++SGPRCount;
4985 SGPRUsed = MO.getReg();
4986 }
4987 }
4988 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4989 ErrInfo = "WRITELANE instruction violates constant bus restriction";
4990 return false;
4991 }
4992 }
4993 }
4994
4995 // Verify misc. restrictions on specific instructions.
4996 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
4997 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
4998 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4999 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5000 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5001 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5002 if (!compareMachineOp(Src0, Src1) &&
5003 !compareMachineOp(Src0, Src2)) {
5004 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5005 return false;
5006 }
5007 }
5008 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5009 SISrcMods::ABS) ||
5010 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5011 SISrcMods::ABS) ||
5012 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5013 SISrcMods::ABS)) {
5014 ErrInfo = "ABS not allowed in VOP3B instructions";
5015 return false;
5016 }
5017 }
5018
5019 if (isSOP2(MI) || isSOPC(MI)) {
5020 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5021 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5022
5023 if (!Src0.isReg() && !Src1.isReg() &&
5024 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5025 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5026 !Src0.isIdenticalTo(Src1)) {
5027 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5028 return false;
5029 }
5030 }
5031
5032 if (isSOPK(MI)) {
5033 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5034 if (Desc.isBranch()) {
5035 if (!Op->isMBB()) {
5036 ErrInfo = "invalid branch target for SOPK instruction";
5037 return false;
5038 }
5039 } else {
5040 uint64_t Imm = Op->getImm();
5041 if (sopkIsZext(Opcode)) {
5042 if (!isUInt<16>(Imm)) {
5043 ErrInfo = "invalid immediate for SOPK instruction";
5044 return false;
5045 }
5046 } else {
5047 if (!isInt<16>(Imm)) {
5048 ErrInfo = "invalid immediate for SOPK instruction";
5049 return false;
5050 }
5051 }
5052 }
5053 }
5054
5055 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5056 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5057 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5058 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5059 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5060 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5061
5062 const unsigned StaticNumOps =
5063 Desc.getNumOperands() + Desc.implicit_uses().size();
5064 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5065
5066 // Allow additional implicit operands. This allows a fixup done by the post
5067 // RA scheduler where the main implicit operand is killed and implicit-defs
5068 // are added for sub-registers that remain live after this instruction.
5069 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5070 ErrInfo = "missing implicit register operands";
5071 return false;
5072 }
5073
5074 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5075 if (IsDst) {
5076 if (!Dst->isUse()) {
5077 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5078 return false;
5079 }
5080
5081 unsigned UseOpIdx;
5082 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5083 UseOpIdx != StaticNumOps + 1) {
5084 ErrInfo = "movrel implicit operands should be tied";
5085 return false;
5086 }
5087 }
5088
5089 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5090 const MachineOperand &ImpUse
5091 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5092 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5093 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5094 ErrInfo = "src0 should be subreg of implicit vector use";
5095 return false;
5096 }
5097 }
5098
5099 // Make sure we aren't losing exec uses in the td files. This mostly requires
5100 // being careful when using let Uses to try to add other use registers.
5101 if (shouldReadExec(MI)) {
5102 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5103 ErrInfo = "VALU instruction does not implicitly read exec mask";
5104 return false;
5105 }
5106 }
5107
5108 if (isSMRD(MI)) {
5109 if (MI.mayStore() &&
5110 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5111 // The register offset form of scalar stores may only use m0 as the
5112 // soffset register.
5113 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5114 if (Soff && Soff->getReg() != AMDGPU::M0) {
5115 ErrInfo = "scalar stores must use m0 as offset register";
5116 return false;
5117 }
5118 }
5119 }
5120
5121 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5122 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5123 if (Offset->getImm() != 0) {
5124 ErrInfo = "subtarget does not support offsets in flat instructions";
5125 return false;
5126 }
5127 }
5128
5129 if (isDS(MI) && !ST.hasGDS()) {
5130 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5131 if (GDSOp && GDSOp->getImm() != 0) {
5132 ErrInfo = "GDS is not supported on this subtarget";
5133 return false;
5134 }
5135 }
5136
5137 if (isImage(MI)) {
5138 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5139 if (DimOp) {
5140 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5141 AMDGPU::OpName::vaddr0);
5142 int RSrcOpName =
5143 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5144 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5145 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5146 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5147 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5148 const AMDGPU::MIMGDimInfo *Dim =
5149 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5150
5151 if (!Dim) {
5152 ErrInfo = "dim is out of range";
5153 return false;
5154 }
5155
5156 bool IsA16 = false;
5157 if (ST.hasR128A16()) {
5158 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5159 IsA16 = R128A16->getImm() != 0;
5160 } else if (ST.hasA16()) {
5161 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5162 IsA16 = A16->getImm() != 0;
5163 }
5164
5165 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5166
5167 unsigned AddrWords =
5168 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5169
5170 unsigned VAddrWords;
5171 if (IsNSA) {
5172 VAddrWords = RsrcIdx - VAddr0Idx;
5173 if (ST.hasPartialNSAEncoding() &&
5174 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5175 unsigned LastVAddrIdx = RsrcIdx - 1;
5176 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5177 }
5178 } else {
5179 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5180 if (AddrWords > 12)
5181 AddrWords = 16;
5182 }
5183
5184 if (VAddrWords != AddrWords) {
5185 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5186 << " but got " << VAddrWords << "\n");
5187 ErrInfo = "bad vaddr size";
5188 return false;
5189 }
5190 }
5191 }
5192
5193 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5194 if (DppCt) {
5195 using namespace AMDGPU::DPP;
5196
5197 unsigned DC = DppCt->getImm();
5198 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5199 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5200 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5201 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5202 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5203 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5204 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5205 ErrInfo = "Invalid dpp_ctrl value";
5206 return false;
5207 }
5208 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5210 ErrInfo = "Invalid dpp_ctrl value: "
5211 "wavefront shifts are not supported on GFX10+";
5212 return false;
5213 }
5214 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5216 ErrInfo = "Invalid dpp_ctrl value: "
5217 "broadcasts are not supported on GFX10+";
5218 return false;
5219 }
5220 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5221 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5222 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5223 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5224 !ST.hasGFX90AInsts()) {
5225 ErrInfo = "Invalid dpp_ctrl value: "
5226 "row_newbroadcast/row_share is not supported before "
5227 "GFX90A/GFX10";
5228 return false;
5229 } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5230 ErrInfo = "Invalid dpp_ctrl value: "
5231 "row_share and row_xmask are not supported before GFX10";
5232 return false;
5233 }
5234 }
5235
5236 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5238 ErrInfo = "Invalid dpp_ctrl value: "
5239 "DP ALU dpp only support row_newbcast";
5240 return false;
5241 }
5242 }
5243
5244 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5245 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5246 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5247 : AMDGPU::OpName::vdata;
5248 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5249 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5250 if (Data && !Data->isReg())
5251 Data = nullptr;
5252
5253 if (ST.hasGFX90AInsts()) {
5254 if (Dst && Data &&
5255 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5256 ErrInfo = "Invalid register class: "
5257 "vdata and vdst should be both VGPR or AGPR";
5258 return false;
5259 }
5260 if (Data && Data2 &&
5261 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5262 ErrInfo = "Invalid register class: "
5263 "both data operands should be VGPR or AGPR";
5264 return false;
5265 }
5266 } else {
5267 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5268 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5269 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5270 ErrInfo = "Invalid register class: "
5271 "agpr loads and stores not supported on this GPU";
5272 return false;
5273 }
5274 }
5275 }
5276
5277 if (ST.needsAlignedVGPRs()) {
5278 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5279 const MachineOperand *Op = getNamedOperand(MI, OpName);
5280 if (!Op)
5281 return true;
5282 Register Reg = Op->getReg();
5283 if (Reg.isPhysical())
5284 return !(RI.getHWRegIndex(Reg) & 1);
5285 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5286 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5287 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5288 };
5289
5290 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5291 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5292 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5293
5294 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5295 ErrInfo = "Subtarget requires even aligned vector registers "
5296 "for DS_GWS instructions";
5297 return false;
5298 }
5299 }
5300
5301 if (isMIMG(MI)) {
5302 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5303 ErrInfo = "Subtarget requires even aligned vector registers "
5304 "for vaddr operand of image instructions";
5305 return false;
5306 }
5307 }
5308 }
5309
5310 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5311 !ST.hasGFX90AInsts()) {
5312 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5313 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5314 ErrInfo = "Invalid register class: "
5315 "v_accvgpr_write with an SGPR is not supported on this GPU";
5316 return false;
5317 }
5318 }
5319
5320 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5321 const MachineOperand &SrcOp = MI.getOperand(1);
5322 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5323 ErrInfo = "pseudo expects only physical SGPRs";
5324 return false;
5325 }
5326 }
5327
5328 return true;
5329}
5330
5331// It is more readable to list mapped opcodes on the same line.
5332// clang-format off
5333
5334unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5335 switch (MI.getOpcode()) {
5336 default: return AMDGPU::INSTRUCTION_LIST_END;
5337 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5338 case AMDGPU::COPY: return AMDGPU::COPY;
5339 case AMDGPU::PHI: return AMDGPU::PHI;
5340 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5341 case AMDGPU::WQM: return AMDGPU::WQM;
5342 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5343 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5344 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5345 case AMDGPU::S_MOV_B32: {
5346 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5347 return MI.getOperand(1).isReg() ||
5348 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5349 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5350 }
5351 case AMDGPU::S_ADD_I32:
5352 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5353 case AMDGPU::S_ADDC_U32:
5354 return AMDGPU::V_ADDC_U32_e32;
5355 case AMDGPU::S_SUB_I32:
5356 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5357 // FIXME: These are not consistently handled, and selected when the carry is
5358 // used.
5359 case AMDGPU::S_ADD_U32:
5360 return AMDGPU::V_ADD_CO_U32_e32;
5361 case AMDGPU::S_SUB_U32:
5362 return AMDGPU::V_SUB_CO_U32_e32;
5363 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5364 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5365 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5366 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5367 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5368 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5369 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5370 case AMDGPU::S_XNOR_B32:
5371 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5372 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5373 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5374 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5375 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5376 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5377 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5378 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5379 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5380 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5381 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5382 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5383 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5384 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5385 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5386 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5387 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5388 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5389 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5390 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5391 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5392 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5393 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5394 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5395 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5396 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5397 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5398 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5399 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5400 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5401 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5402 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5403 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5404 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5405 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5406 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5407 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5408 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5409 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5410 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5411 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5412 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5413 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5414 case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5415 case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5416 case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
5417 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5418 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5419 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5420 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5421 case AMDGPU::S_CEIL_F16:
5422 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5423 : AMDGPU::V_CEIL_F16_fake16_e64;
5424 case AMDGPU::S_FLOOR_F16:
5425 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5426 : AMDGPU::V_FLOOR_F16_fake16_e64;
5427 case AMDGPU::S_TRUNC_F16:
5428 return AMDGPU::V_TRUNC_F16_fake16_e64;
5429 case AMDGPU::S_RNDNE_F16:
5430 return AMDGPU::V_RNDNE_F16_fake16_e64;
5431 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5432 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5433 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5434 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5435 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5436 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5437 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5438 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5439 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5440 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5441 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5442 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5443 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5444 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5445 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5446 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5447 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
5448 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5449 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5450 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5451 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5452 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5453 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5454 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5455 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5456 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5457 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5458 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5459 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5460 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5461 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5462 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5463 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5464 case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
5465 case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
5466 case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
5467 case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
5468 case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
5469 case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
5470 case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
5471 case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
5472 case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
5473 case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
5474 case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
5475 case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
5476 case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
5477 case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
5478 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5479 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5480 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5481 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5482 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5483 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5484 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5485 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5486 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5487 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5488 }
5490 "Unexpected scalar opcode without corresponding vector one!");
5491}
5492
5493// clang-format on
5494
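// Save the current value of EXEC into \p Reg and then enable all lanes, either
// with a single S_OR_SAVEEXEC (which clobbers SCC) or, when SCC is live, with a
// pair of S_MOVs that leave SCC untouched.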
5495 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5496 MachineBasicBlock &MBB,
5497 MachineBasicBlock::iterator MBBI,
5498 const DebugLoc &DL, Register Reg,
5499 bool IsSCCLive,
5500 SlotIndexes *Indexes) const {
5501 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5502 const SIInstrInfo *TII = ST.getInstrInfo();
5503 bool IsWave32 = ST.isWave32();
5504 if (IsSCCLive) {
5505 // Insert two move instructions, one to save the original value of EXEC and
5506 // the other to turn on all bits in EXEC. This is required as we can't use
5507 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5508 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5509 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5510 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5511 .addReg(Exec, RegState::Kill);
5512 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5513 if (Indexes) {
5514 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5515 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5516 }
5517 } else {
5518 const unsigned OrSaveExec =
5519 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5520 auto SaveExec =
5521 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5522 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5523 if (Indexes)
5524 Indexes->insertMachineInstrInMaps(*SaveExec);
5525 }
5526}
5527
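// Copy \p Reg back into EXEC, restoring the mask saved by insertScratchExecCopy.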
5528 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5529 MachineBasicBlock::iterator MBBI,
5530 const DebugLoc &DL, Register Reg,
5531 SlotIndexes *Indexes) const {
5532 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5533 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5534 auto ExecRestoreMI =
5535 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5536 if (Indexes)
5537 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5538}
5539
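// Narrow an AV_* (AGPR-or-VGPR) register class ID down to its VGPR-only
// counterpart for memory, DS and MIMG operands when AGPRs cannot (or should
// not) be used for them on this subtarget, and return the properly aligned
// register class.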
5540static const TargetRegisterClass *
5541 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5542 const MachineRegisterInfo &MRI,
5543 const MCInstrDesc &TID, unsigned RCID,
5544 bool IsAllocatable) {
5545 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5546 (((TID.mayLoad() || TID.mayStore()) &&
5547 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5548 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5549 switch (RCID) {
5550 case AMDGPU::AV_32RegClassID:
5551 RCID = AMDGPU::VGPR_32RegClassID;
5552 break;
5553 case AMDGPU::AV_64RegClassID:
5554 RCID = AMDGPU::VReg_64RegClassID;
5555 break;
5556 case AMDGPU::AV_96RegClassID:
5557 RCID = AMDGPU::VReg_96RegClassID;
5558 break;
5559 case AMDGPU::AV_128RegClassID:
5560 RCID = AMDGPU::VReg_128RegClassID;
5561 break;
5562 case AMDGPU::AV_160RegClassID:
5563 RCID = AMDGPU::VReg_160RegClassID;
5564 break;
5565 case AMDGPU::AV_512RegClassID:
5566 RCID = AMDGPU::VReg_512RegClassID;
5567 break;
5568 default:
5569 break;
5570 }
5571 }
5572
5573 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5574}
5575
5576 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5577 unsigned OpNum, const TargetRegisterInfo *TRI,
5578 const MachineFunction &MF)
5579 const {
5580 if (OpNum >= TID.getNumOperands())
5581 return nullptr;
5582 auto RegClass = TID.operands()[OpNum].RegClass;
5583 bool IsAllocatable = false;
5584 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5585 // vdst and vdata should both be VGPR or AGPR; the same holds for DS
5586 // instructions with two data operands. Request a register class constrained
5587 // to VGPR only if both operands are present, as Machine Copy Propagation
5588 // cannot check this constraint (and possibly other passes cannot either).
5589 //
5590 // The check is limited to FLAT and DS because atomics in non-flat encoding
5591 // have their vdst and vdata tied to be the same register.
5592 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5593 AMDGPU::OpName::vdst);
5594 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5595 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5596 : AMDGPU::OpName::vdata);
5597 if (DataIdx != -1) {
5598 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5599 TID.Opcode, AMDGPU::OpName::data1);
5600 }
5601 }
5602 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5603 IsAllocatable);
5604}
5605
5607 unsigned OpNo) const {
5608 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5609 const MCInstrDesc &Desc = get(MI.getOpcode());
5610 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5611 Desc.operands()[OpNo].RegClass == -1) {
5612 Register Reg = MI.getOperand(OpNo).getReg();
5613
5614 if (Reg.isVirtual())
5615 return MRI.getRegClass(Reg);
5616 return RI.getPhysRegBaseClass(Reg);
5617 }
5618
5619 unsigned RCID = Desc.operands()[OpNo].RegClass;
5620 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5621}
5622
5623 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5624 MachineBasicBlock::iterator I = MI;
5625 MachineBasicBlock *MBB = MI.getParent();
5626 MachineOperand &MO = MI.getOperand(OpIdx);
5627 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5628 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5629 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5630 unsigned Size = RI.getRegSizeInBits(*RC);
5631 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
5632 if (MO.isReg())
5633 Opcode = AMDGPU::COPY;
5634 else if (RI.isSGPRClass(RC))
5635 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5636
5637 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5638 Register Reg = MRI.createVirtualRegister(VRC);
5639 const DebugLoc &DL = MBB->findDebugLoc(I);
5640 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5641 MO.ChangeToRegister(Reg, false);
5642}
5643
5644 unsigned SIInstrInfo::buildExtractSubReg(
5645 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
5646 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5647 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5648 MachineBasicBlock *MBB = MI->getParent();
5649 DebugLoc DL = MI->getDebugLoc();
5650 Register SubReg = MRI.createVirtualRegister(SubRC);
5651
5652 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
5653 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5654 .addReg(SuperReg.getReg(), 0, SubIdx);
5655 return SubReg;
5656 }
5657
5658 // Just in case the super register is itself a sub-register, copy it to a new
5659 // value so we don't need to worry about merging its subreg index with the
5660 // SubIdx passed to this function. The register coalescer should be able to
5661 // eliminate this extra copy.
5662 Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
5663
5664 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
5665 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
5666
5667 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5668 .addReg(NewSuperReg, 0, SubIdx);
5669
5670 return SubReg;
5671}
5672
5673 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
5674 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
5675 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5676 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5677 if (Op.isImm()) {
5678 if (SubIdx == AMDGPU::sub0)
5679 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5680 if (SubIdx == AMDGPU::sub1)
5681 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5682
5683 llvm_unreachable("Unhandled register index for immediate");
5684 }
5685
5686 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5687 SubIdx, SubRC);
5688 return MachineOperand::CreateReg(SubReg, false);
5689}
5690
5691// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5692void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5693 assert(Inst.getNumExplicitOperands() == 3);
5694 MachineOperand Op1 = Inst.getOperand(1);
5695 Inst.removeOperand(1);
5696 Inst.addOperand(Op1);
5697}
5698
5699 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5700 const MCOperandInfo &OpInfo,
5701 const MachineOperand &MO) const {
5702 if (!MO.isReg())
5703 return false;
5704
5705 Register Reg = MO.getReg();
5706
5707 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5708 if (Reg.isPhysical())
5709 return DRC->contains(Reg);
5710
5711 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5712
5713 if (MO.getSubReg()) {
5714 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5715 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5716 if (!SuperRC)
5717 return false;
5718
5719 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5720 if (!DRC)
5721 return false;
5722 }
5723 return RC->hasSuperClassEq(DRC);
5724}
5725
5726 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
5727 const MCOperandInfo &OpInfo,
5728 const MachineOperand &MO) const {
5729 if (MO.isReg())
5730 return isLegalRegOperand(MRI, OpInfo, MO);
5731
5732 // Handle non-register types that are treated like immediates.
5733 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5734 return true;
5735}
5736
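// Check whether \p MO (or, if \p MO is null, the operand currently at
// \p OpIdx) would be legal at \p OpIdx of \p MI, honouring the subtarget's
// constant bus and literal limits.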
5737bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5738 const MachineOperand *MO) const {
5739 const MachineFunction &MF = *MI.getParent()->getParent();
5740 const MachineRegisterInfo &MRI = MF.getRegInfo();
5741 const MCInstrDesc &InstDesc = MI.getDesc();
5742 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5743 const TargetRegisterClass *DefinedRC =
5744 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5745 if (!MO)
5746 MO = &MI.getOperand(OpIdx);
5747
5748 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5749 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5750 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5751 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5752 return false;
5753
5754 SmallDenseSet<RegSubRegPair> SGPRsUsed;
5755 if (MO->isReg())
5756 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5757
5758 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5759 if (i == OpIdx)
5760 continue;
5761 const MachineOperand &Op = MI.getOperand(i);
5762 if (Op.isReg()) {
5763 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5764 if (!SGPRsUsed.count(SGPR) &&
5765 // FIXME: This can access off the end of the operands() array.
5766 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5767 if (--ConstantBusLimit <= 0)
5768 return false;
5769 SGPRsUsed.insert(SGPR);
5770 }
5771 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5772 !isInlineConstant(Op, InstDesc.operands()[i])) {
5773 if (!LiteralLimit--)
5774 return false;
5775 if (--ConstantBusLimit <= 0)
5776 return false;
5777 }
5778 }
5779 }
5780
5781 if (MO->isReg()) {
5782 if (!DefinedRC)
5783 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5784 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5785 return false;
5786 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5787 if (IsAGPR && !ST.hasMAIInsts())
5788 return false;
5789 unsigned Opc = MI.getOpcode();
5790 if (IsAGPR &&
5791 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5792 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5793 return false;
5794 // Atomics should have both vdst and vdata either vgpr or agpr.
5795 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5796 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5797 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5798 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5799 MI.getOperand(DataIdx).isReg() &&
5800 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5801 return false;
5802 if ((int)OpIdx == DataIdx) {
5803 if (VDstIdx != -1 &&
5804 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5805 return false;
5806 // DS instructions with 2 src operands also must have tied RC.
5807 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5808 AMDGPU::OpName::data1);
5809 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5810 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5811 return false;
5812 }
5813 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5814 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5815 RI.isSGPRReg(MRI, MO->getReg()))
5816 return false;
5817 return true;
5818 }
5819
5820 if (MO->isImm()) {
5821 uint64_t Imm = MO->getImm();
5822 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5823 bool Is64BitOp = Is64BitFPOp ||
5824 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
5825 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
5826 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
5827 if (Is64BitOp &&
5828 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
5829 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5830 return false;
5831
5832 // FIXME: We can use sign extended 64-bit literals, but only for signed
5833 // operands. At the moment we do not know if an operand is signed.
5834 // Such an operand will be encoded as its low 32 bits and then either
5835 // correctly sign extended or incorrectly zero extended by HW.
5836 if (!Is64BitFPOp && (int32_t)Imm < 0)
5837 return false;
5838 }
5839 }
5840
5841 // Handle non-register types that are treated like immediates.
5842 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5843
5844 if (!DefinedRC) {
5845 // This operand expects an immediate.
5846 return true;
5847 }
5848
5849 return isImmOperandLegal(MI, OpIdx, *MO);
5850}
5851
5852 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
5853 MachineInstr &MI) const {
5854 unsigned Opc = MI.getOpcode();
5855 const MCInstrDesc &InstrDesc = get(Opc);
5856
5857 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5858 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5859
5860 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5861 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5862
5863 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
5864 // we need to only have one constant bus use before GFX10.
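 // For example, V_ADDC_U32_e32 implicitly reads VCC; an SGPR in src0 would then
 // be a second constant bus read on such targets, so src0 is moved to a VGPR
 // below.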
5865 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5866 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5867 RI.isSGPRReg(MRI, Src0.getReg()))
5868 legalizeOpWithMove(MI, Src0Idx);
5869
5870 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5871 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5872 // src0/src1 with V_READFIRSTLANE.
5873 if (Opc == AMDGPU::V_WRITELANE_B32) {
5874 const DebugLoc &DL = MI.getDebugLoc();
5875 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5876 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5877 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5878 .add(Src0);
5879 Src0.ChangeToRegister(Reg, false);
5880 }
5881 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5882 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5883 const DebugLoc &DL = MI.getDebugLoc();
5884 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5885 .add(Src1);
5886 Src1.ChangeToRegister(Reg, false);
5887 }
5888 return;
5889 }
5890
5891 // No VOP2 instructions support AGPRs.
5892 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5893 legalizeOpWithMove(MI, Src0Idx);
5894
5895 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5896 legalizeOpWithMove(MI, Src1Idx);
5897
5898 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5899 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5900 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5901 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5902 legalizeOpWithMove(MI, Src2Idx);
5903 }
5904
5905 // VOP2 instructions accept any operand type in src0, so we don't need to
5906 // check its legality. If src1 is already legal, we don't need to do anything.
5907 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5908 return;
5909
5910 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5911 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5912 // select is uniform.
5913 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5914 RI.isVGPR(MRI, Src1.getReg())) {
5915 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5916 const DebugLoc &DL = MI.getDebugLoc();
5917 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5918 .add(Src1);
5919 Src1.ChangeToRegister(Reg, false);
5920 return;
5921 }
5922
5923 // We do not use commuteInstruction here because it is too aggressive and will
5924 // commute if it is possible. We only want to commute here if it improves
5925 // legality. This can be called a fairly large number of times so don't waste
5926 // compile time pointlessly swapping and checking legality again.
5927 if (HasImplicitSGPR || !MI.isCommutable()) {
5928 legalizeOpWithMove(MI, Src1Idx);
5929 return;
5930 }
5931
5932 // If src0 can be used as src1, commuting will make the operands legal.
5933 // Otherwise we have to give up and insert a move.
5934 //
5935 // TODO: Other immediate-like operand kinds could be commuted if there was a
5936 // MachineOperand::ChangeTo* for them.
5937 if ((!Src1.isImm() && !Src1.isReg()) ||
5938 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
5939 legalizeOpWithMove(MI, Src1Idx);
5940 return;
5941 }
5942
5943 int CommutedOpc = commuteOpcode(MI);
5944 if (CommutedOpc == -1) {
5945 legalizeOpWithMove(MI, Src1Idx);
5946 return;
5947 }
5948
5949 MI.setDesc(get(CommutedOpc));
5950
5951 Register Src0Reg = Src0.getReg();
5952 unsigned Src0SubReg = Src0.getSubReg();
5953 bool Src0Kill = Src0.isKill();
5954
5955 if (Src1.isImm())
5956 Src0.ChangeToImmediate(Src1.getImm());
5957 else if (Src1.isReg()) {
5958 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5959 Src0.setSubReg(Src1.getSubReg());
5960 } else
5961 llvm_unreachable("Should only have register or immediate operands");
5962
5963 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5964 Src1.setSubReg(Src0SubReg);
5965 fixImplicitOperands(MI);
5966}
5967
5968 // Legalize VOP3 operands. All operand types are supported for any operand,
5969 // but only one literal constant is allowed, and only starting from GFX10.
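// Prior to GFX10 at most one SGPR may be read through the constant bus; GFX10+
// raises the limit to two and additionally permits a VOP3 literal (which also
// counts against the bus). Operands over the limit are copied into VGPRs with
// legalizeOpWithMove().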
5970 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
5971 MachineInstr &MI) const {
5972 unsigned Opc = MI.getOpcode();
5973
5974 int VOP3Idx[3] = {
5975 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5976 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5977 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5978 };
5979
5980 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5981 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5982 // src1 and src2 must be scalar
5983 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5984 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5985 const DebugLoc &DL = MI.getDebugLoc();
5986 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5987 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5988 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5989 .add(Src1);
5990 Src1.ChangeToRegister(Reg, false);
5991 }
5992 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5993 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5994 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5995 .add(Src2);
5996 Src2.ChangeToRegister(Reg, false);
5997 }
5998 }
5999
6000 // Find the one SGPR operand we are allowed to use.
6001 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6002 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6003 SmallDenseSet<unsigned> SGPRsUsed;
6004 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6005 if (SGPRReg) {
6006 SGPRsUsed.insert(SGPRReg);
6007 --ConstantBusLimit;
6008 }
6009
6010 for (int Idx : VOP3Idx) {
6011 if (Idx == -1)
6012 break;
6013 MachineOperand &MO = MI.getOperand(Idx);
6014
6015 if (!MO.isReg()) {
6016 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6017 continue;
6018
6019 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6020 --LiteralLimit;
6021 --ConstantBusLimit;
6022 continue;
6023 }
6024
6025 --LiteralLimit;
6026 --ConstantBusLimit;
6027 legalizeOpWithMove(MI, Idx);
6028 continue;
6029 }
6030
6031 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6032 !isOperandLegal(MI, Idx, &MO)) {
6033 legalizeOpWithMove(MI, Idx);
6034 continue;
6035 }
6036
6037 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6038 continue; // VGPRs are legal
6039
6040 // We can use one SGPR in each VOP3 instruction prior to GFX10
6041 // and two starting from GFX10.
6042 if (SGPRsUsed.count(MO.getReg()))
6043 continue;
6044 if (ConstantBusLimit > 0) {
6045 SGPRsUsed.insert(MO.getReg());
6046 --ConstantBusLimit;
6047 continue;
6048 }
6049
6050 // If we make it this far, then the operand is not legal and we must
6051 // legalize it.
6052 legalizeOpWithMove(MI, Idx);
6053 }
6054
6055 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6056 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6057 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6058 legalizeOpWithMove(MI, VOP3Idx[2]);
6059}
6060
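// Copy the (assumed uniform) value of the vector register \p SrcReg into a new
// SGPR of equivalent width: one V_READFIRSTLANE_B32 per 32-bit piece,
// reassembled with REG_SEQUENCE for multi-dword values.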
6061 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
6062 MachineRegisterInfo &MRI) const {
6063 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6064 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6065 Register DstReg = MRI.createVirtualRegister(SRC);
6066 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6067
6068 if (RI.hasAGPRs(VRC)) {
6069 VRC = RI.getEquivalentVGPRClass(VRC);
6070 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6071 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6072 get(TargetOpcode::COPY), NewSrcReg)
6073 .addReg(SrcReg);
6074 SrcReg = NewSrcReg;
6075 }
6076
6077 if (SubRegs == 1) {
6078 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6079 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6080 .addReg(SrcReg);
6081 return DstReg;
6082 }
6083
6084 SmallVector<Register, 8> SRegs;
6085 for (unsigned i = 0; i < SubRegs; ++i) {
6086 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6087 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6088 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6089 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6090 SRegs.push_back(SGPR);
6091 }
6092
6093 MachineInstrBuilder MIB =
6094 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6095 get(AMDGPU::REG_SEQUENCE), DstReg);
6096 for (unsigned i = 0; i < SubRegs; ++i) {
6097 MIB.addReg(SRegs[i]);
6098 MIB.addImm(RI.getSubRegFromChannel(i));
6099 }
6100 return DstReg;
6101}
6102
6103 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6104 MachineInstr &MI) const {
6105
6106 // If the pointer is stored in VGPRs, then we need to move it to
6107 // SGPRs using v_readfirstlane. This is safe because we only select
6108 // loads with uniform pointers to SMRD instructions, so we know the
6109 // pointer value is uniform.
6110 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6111 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6112 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6113 SBase->setReg(SGPR);
6114 }
6115 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6116 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6117 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6118 SOff->setReg(SGPR);
6119 }
6120}
6121
6122 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6123 unsigned Opc = Inst.getOpcode();
6124 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6125 if (OldSAddrIdx < 0)
6126 return false;
6127
6128 assert(isSegmentSpecificFLAT(Inst));
6129
6130 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6131 if (NewOpc < 0)
6132 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6133 if (NewOpc < 0)
6134 return false;
6135
6136 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6137 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6138 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6139 return false;
6140
6141 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6142 if (NewVAddrIdx < 0)
6143 return false;
6144
6145 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6146
6147 // Check vaddr, it shall be zero or absent.
6148 MachineInstr *VAddrDef = nullptr;
6149 if (OldVAddrIdx >= 0) {
6150 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6151 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6152 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6153 !VAddrDef->getOperand(1).isImm() ||
6154 VAddrDef->getOperand(1).getImm() != 0)
6155 return false;
6156 }
6157
6158 const MCInstrDesc &NewDesc = get(NewOpc);
6159 Inst.setDesc(NewDesc);
6160
6161 // Callers expect iterator to be valid after this call, so modify the
6162 // instruction in place.
6163 if (OldVAddrIdx == NewVAddrIdx) {
6164 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6165 // Clear use list from the old vaddr holding a zero register.
6166 MRI.removeRegOperandFromUseList(&NewVAddr);
6167 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6168 Inst.removeOperand(OldSAddrIdx);
6169 // Update the use list with the pointer we have just moved from vaddr to
6170 // saddr position. Otherwise new vaddr will be missing from the use list.
6171 MRI.removeRegOperandFromUseList(&NewVAddr);
6172 MRI.addRegOperandToUseList(&NewVAddr);
6173 } else {
6174 assert(OldSAddrIdx == NewVAddrIdx);
6175
6176 if (OldVAddrIdx >= 0) {
6177 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6178 AMDGPU::OpName::vdst_in);
6179
6180 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6181 // it asserts. Untie the operands for now and retie them afterwards.
6182 if (NewVDstIn != -1) {
6183 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6184 Inst.untieRegOperand(OldVDstIn);
6185 }
6186
6187 Inst.removeOperand(OldVAddrIdx);
6188
6189 if (NewVDstIn != -1) {
6190 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6191 Inst.tieOperands(NewVDst, NewVDstIn);
6192 }
6193 }
6194 }
6195
6196 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6197 VAddrDef->eraseFromParent();
6198
6199 return true;
6200}
6201
6202// FIXME: Remove this when SelectionDAG is obsoleted.
6203 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6204 MachineInstr &MI) const {
6205 if (!isSegmentSpecificFLAT(MI))
6206 return;
6207
6208 // Fix up SGPR operands that ended up in VGPRs. We only select these when the
6209 // DAG divergence analysis thinks they are uniform, so a readfirstlane should be valid.
6210 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6211 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6212 return;
6213
6214 if (moveFlatAddrToVGPR(MI))
6215 return;
6216
6217 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6218 SAddr->setReg(ToSGPR);
6219}
6220
6221 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6222 MachineBasicBlock::iterator I,
6223 const TargetRegisterClass *DstRC,
6224 MachineOperand &Op,
6225 MachineRegisterInfo &MRI,
6226 const DebugLoc &DL) const {
6227 Register OpReg = Op.getReg();
6228 unsigned OpSubReg = Op.getSubReg();
6229
6230 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6231 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6232
6233 // Check if operand is already the correct register class.
6234 if (DstRC == OpRC)
6235 return;
6236
6237 Register DstReg = MRI.createVirtualRegister(DstRC);
6238 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
6239
6240 Op.setReg(DstReg);
6241 Op.setSubReg(0);
6242
6243 MachineInstr *Def = MRI.getVRegDef(OpReg);
6244 if (!Def)
6245 return;
6246
6247 // Try to eliminate the copy if it is copying an immediate value.
6248 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6249 foldImmediate(*Copy, *Def, OpReg, &MRI);
6250
6251 bool ImpDef = Def->isImplicitDef();
6252 while (!ImpDef && Def && Def->isCopy()) {
6253 if (Def->getOperand(1).getReg().isPhysical())
6254 break;
6255 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6256 ImpDef = Def && Def->isImplicitDef();
6257 }
6258 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6259 !ImpDef)
6260 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6261}
6262
6263// Emit the actual waterfall loop, executing the wrapped instruction for each
6264// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6265// iteration, in the worst case we execute 64 (once per lane).
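// Roughly, the emitted structure is:
//
//   LoopBB:
//     sN    = V_READFIRSTLANE_B32 of each 32-bit piece of the VGPR operand
//     cond  = V_CMP_EQ(sN, vgpr_operand), ANDed across all ScalarOps
//     saved = EXEC; EXEC &= cond          ; S_AND_SAVEEXEC, matching lanes only
//   BodyBB:
//     ... the original instruction, now reading the SGPR copies ...
//     EXEC ^= saved                       ; retire the lanes just handled
//     SI_WATERFALL_LOOP LoopBB            ; loop while any lane remains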
6266 static void emitLoadScalarOpsFromVGPRLoop(
6267 const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB,
6268 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
6269 ArrayRef<MachineOperand *> ScalarOps) {
6270 MachineFunction &MF = *OrigBB.getParent();
6271 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6272 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6273 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6274 unsigned SaveExecOpc =
6275 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6276 unsigned XorTermOpc =
6277 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6278 unsigned AndOpc =
6279 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6280 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6281
6282 MachineBasicBlock::iterator I = LoopBB.begin();
6283
6284 SmallVector<Register, 8> ReadlanePieces;
6285 Register CondReg;
6286
6287 for (MachineOperand *ScalarOp : ScalarOps) {
6288 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6289 unsigned NumSubRegs = RegSize / 32;
6290 Register VScalarOp = ScalarOp->getReg();
6291
6292 if (NumSubRegs == 1) {
6293 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6294
6295 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6296 .addReg(VScalarOp);
6297
6298 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6299
6300 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6301 .addReg(CurReg)
6302 .addReg(VScalarOp);
6303
6304 // Combine the comparison results with AND.
6305 if (!CondReg) // First.
6306 CondReg = NewCondReg;
6307 else { // If not the first, we create an AND.
6308 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6309 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6310 .addReg(CondReg)
6311 .addReg(NewCondReg);
6312 CondReg = AndReg;
6313 }
6314
6315 // Update ScalarOp operand to use the SGPR ScalarOp.
6316 ScalarOp->setReg(CurReg);
6317 ScalarOp->setIsKill();
6318 } else {
6319 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6320 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6321 "Unhandled register size");
6322
6323 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6324 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6325 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6326
6327 // Read the next variant <- also loop target.
6328 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6329 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6330
6331 // Read the next variant <- also loop target.
6332 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6333 .addReg(VScalarOp, VScalarOpUndef,
6334 TRI->getSubRegFromChannel(Idx + 1));
6335
6336 ReadlanePieces.push_back(CurRegLo);
6337 ReadlanePieces.push_back(CurRegHi);
6338
6339 // Comparison is to be done as 64-bit.
6340 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6341 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6342 .addReg(CurRegLo)
6343 .addImm(AMDGPU::sub0)
6344 .addReg(CurRegHi)
6345 .addImm(AMDGPU::sub1);
6346
6347 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6348 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6349 NewCondReg)
6350 .addReg(CurReg);
6351 if (NumSubRegs <= 2)
6352 Cmp.addReg(VScalarOp);
6353 else
6354 Cmp.addReg(VScalarOp, VScalarOpUndef,
6355 TRI->getSubRegFromChannel(Idx, 2));
6356
6357 // Combine the comparison results with AND.
6358 if (!CondReg) // First.
6359 CondReg = NewCondReg;
6360 else { // If not the first, we create an AND.
6361 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6362 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6363 .addReg(CondReg)
6364 .addReg(NewCondReg);
6365 CondReg = AndReg;
6366 }
6367 } // End for loop.
6368
6369 auto SScalarOpRC =
6370 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6371 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6372
6373 // Build scalar ScalarOp.
6374 auto Merge =
6375 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6376 unsigned Channel = 0;
6377 for (Register Piece : ReadlanePieces) {
6378 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6379 }
6380
6381 // Update ScalarOp operand to use the SGPR ScalarOp.
6382 ScalarOp->setReg(SScalarOp);
6383 ScalarOp->setIsKill();
6384 }
6385 }
6386
6387 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6388 MRI.setSimpleHint(SaveExec, CondReg);
6389
6390 // Update EXEC to matching lanes, saving original to SaveExec.
6391 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6392 .addReg(CondReg, RegState::Kill);
6393
6394 // The original instruction is here; we insert the terminators after it.
6395 I = BodyBB.end();
6396
6397 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6398 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6399 .addReg(Exec)
6400 .addReg(SaveExec);
6401
6402 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6403}
6404
6405// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6406// with SGPRs by iterating over all unique values across all lanes.
6407// Returns the loop basic block that now contains \p MI.
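// EXEC (and SCC, if it is live here) is saved before the loop and restored in
// the block that follows it.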
6408static MachineBasicBlock *
6409 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6410 ArrayRef<MachineOperand *> ScalarOps,
6411 MachineDominatorTree *MDT,
6412 MachineBasicBlock::iterator Begin = nullptr,
6413 MachineBasicBlock::iterator End = nullptr) {
6414 MachineBasicBlock &MBB = *MI.getParent();
6415 MachineFunction &MF = *MBB.getParent();
6416 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6417 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6418 MachineRegisterInfo &MRI = MF.getRegInfo();
6419 if (!Begin.isValid())
6420 Begin = &MI;
6421 if (!End.isValid()) {
6422 End = &MI;
6423 ++End;
6424 }
6425 const DebugLoc &DL = MI.getDebugLoc();
6426 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6427 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6428 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6429
6430 // Save SCC. Waterfall Loop may overwrite SCC.
6431 Register SaveSCCReg;
6432 bool SCCNotDead = (MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI, 30) !=
6433 MachineBasicBlock::LQR_Dead);
6434 if (SCCNotDead) {
6435 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6436 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6437 .addImm(1)
6438 .addImm(0);
6439 }
6440
6441 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6442
6443 // Save the EXEC mask
6444 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6445
6446 // Killed uses in the instruction we are waterfalling around will be
6447 // incorrect due to the added control-flow.
6448 MachineBasicBlock::iterator AfterMI = MI;
6449 ++AfterMI;
6450 for (auto I = Begin; I != AfterMI; I++) {
6451 for (auto &MO : I->all_uses())
6452 MRI.clearKillFlags(MO.getReg());
6453 }
6454
6455 // To insert the loop we need to split the block. Move everything after this
6456 // point to a new block, and insert a new empty block between the two.
6457 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6458 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6459 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6460 MachineFunction::iterator MBBI(MBB);
6461 ++MBBI;
6462
6463 MF.insert(MBBI, LoopBB);
6464 MF.insert(MBBI, BodyBB);
6465 MF.insert(MBBI, RemainderBB);
6466
6467 LoopBB->addSuccessor(BodyBB);
6468 BodyBB->addSuccessor(LoopBB);
6469 BodyBB->addSuccessor(RemainderBB);
6470
6471 // Move the instructions from Begin to MI into BodyBB, and the remainder of
6472 // the block into RemainderBB.
6473 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6474 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6475 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6476
6477 MBB.addSuccessor(LoopBB);
6478
6479 // Update dominators. We know that MBB immediately dominates LoopBB, that
6480 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6481 // RemainderBB. RemainderBB immediately dominates all of the successors
6482 // transferred to it from MBB that MBB used to properly dominate.
6483 if (MDT) {
6484 MDT->addNewBlock(LoopBB, &MBB);
6485 MDT->addNewBlock(BodyBB, LoopBB);
6486 MDT->addNewBlock(RemainderBB, BodyBB);
6487 for (auto &Succ : RemainderBB->successors()) {
6488 if (MDT->properlyDominates(&MBB, Succ)) {
6489 MDT->changeImmediateDominator(Succ, RemainderBB);
6490 }
6491 }
6492 }
6493
6494 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
6495
6496 MachineBasicBlock::iterator First = RemainderBB->begin();
6497 // Restore SCC
6498 if (SCCNotDead) {
6499 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6500 .addReg(SaveSCCReg, RegState::Kill)
6501 .addImm(0);
6502 }
6503
6504 // Restore the EXEC mask
6505 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6506 return BodyBB;
6507}
6508
6509// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
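// The replacement descriptor has a zero base address (the pointer now travels
// in VADDR) and carries only the subtarget's default RSRC data format bits.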
6510static std::tuple<unsigned, unsigned>
6511 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
6512 MachineBasicBlock &MBB = *MI.getParent();
6513 MachineFunction &MF = *MBB.getParent();
6514 MachineRegisterInfo &MRI = MF.getRegInfo();
6515
6516 // Extract the ptr from the resource descriptor.
6517 unsigned RsrcPtr =
6518 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6519 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6520
6521 // Create an empty resource descriptor
6522 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6523 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6524 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6525 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6526 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6527
6528 // Zero64 = 0
6529 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6530 .addImm(0);
6531
6532 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6533 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6534 .addImm(RsrcDataFormat & 0xFFFFFFFF);
6535
6536 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6537 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6538 .addImm(RsrcDataFormat >> 32);
6539
6540 // NewSRsrc = {Zero64, SRsrcFormat}
6541 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6542 .addReg(Zero64)
6543 .addImm(AMDGPU::sub0_sub1)
6544 .addReg(SRsrcFormatLo)
6545 .addImm(AMDGPU::sub2)
6546 .addReg(SRsrcFormatHi)
6547 .addImm(AMDGPU::sub3);
6548
6549 return std::tuple(RsrcPtr, NewSRsrc);
6550}
6551
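// Rewrite the operands of \p MI so that each one is legal for its encoding:
// operands that must be scalar but live in VGPRs are copied to SGPRs with
// v_readfirstlane, SGPR sources of vector instructions are copied to VGPRs,
// and resource descriptors held in VGPRs may force a waterfall loop. Returns
// the newly created block containing \p MI, if the block had to be split.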
6552 MachineBasicBlock *
6553 SIInstrInfo::legalizeOperands(MachineInstr &MI,
6554 MachineDominatorTree *MDT) const {
6555 MachineFunction &MF = *MI.getParent()->getParent();
6556 MachineRegisterInfo &MRI = MF.getRegInfo();
6557 MachineBasicBlock *CreatedBB = nullptr;
6558
6559 // Legalize VOP2
6560 if (isVOP2(MI) || isVOPC(MI)) {
6561 legalizeOperandsVOP2(MRI, MI);
6562 return CreatedBB;
6563 }
6564
6565 // Legalize VOP3
6566 if (isVOP3(MI)) {
6567 legalizeOperandsVOP3(MRI, MI);
6568 return CreatedBB;
6569 }
6570
6571 // Legalize SMRD
6572 if (isSMRD(MI)) {
6573 legalizeOperandsSMRD(MRI, MI);
6574 return CreatedBB;
6575 }
6576
6577 // Legalize FLAT
6578 if (isFLAT(MI)) {
6579 legalizeOperandsFLAT(MRI, MI);
6580 return CreatedBB;
6581 }
6582
6583 // Legalize REG_SEQUENCE and PHI
6584 // The register class of the operands must be the same as the register
6585 // class of the output.
6586 if (MI.getOpcode() == AMDGPU::PHI) {
6587 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6588 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6589 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6590 continue;
6591 const TargetRegisterClass *OpRC =
6592 MRI.getRegClass(MI.getOperand(i).getReg());
6593 if (RI.hasVectorRegisters(OpRC)) {
6594 VRC = OpRC;
6595 } else {
6596 SRC = OpRC;
6597 }
6598 }
6599
6600 // If any of the operands are VGPR registers, then they all must be VGPRs;
6601 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6602 // them.
6603 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6604 if (!VRC) {
6605 assert(SRC);
6606 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6607 VRC = &AMDGPU::VReg_1RegClass;
6608 } else
6609 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6610 ? RI.getEquivalentAGPRClass(SRC)
6611 : RI.getEquivalentVGPRClass(SRC);
6612 } else {
6613 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6614 ? RI.getEquivalentAGPRClass(VRC)
6615 : RI.getEquivalentVGPRClass(VRC);
6616 }
6617 RC = VRC;
6618 } else {
6619 RC = SRC;
6620 }
6621
6622 // Update all the operands so they have the same type.
6623 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6624 MachineOperand &Op = MI.getOperand(I);
6625 if (!Op.isReg() || !Op.getReg().isVirtual())
6626 continue;
6627
6628 // MI is a PHI instruction.
6629 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6630 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6631
6632 // Avoid creating no-op copies with the same src and dst reg class. These
6633 // confuse some of the machine passes.
6634 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6635 }
6636 }
6637
6638 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6639 // VGPR dest type and SGPR sources, insert copies so all operands are
6640 // VGPRs. This seems to help operand folding / the register coalescer.
6641 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6642 MachineBasicBlock *MBB = MI.getParent();
6643 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6644 if (RI.hasVGPRs(DstRC)) {
6645 // Update all the operands so they are VGPR register classes. These may
6646 // not be the same register class because REG_SEQUENCE supports mixing
6647 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6648 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6649 MachineOperand &Op = MI.getOperand(I);
6650 if (!Op.isReg() || !Op.getReg().isVirtual())
6651 continue;
6652
6653 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6654 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6655 if (VRC == OpRC)
6656 continue;
6657
6658 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6659 Op.setIsKill();
6660 }
6661 }
6662
6663 return CreatedBB;
6664 }
6665
6666 // Legalize INSERT_SUBREG
6667 // src0 must have the same register class as dst
6668 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6669 Register Dst = MI.getOperand(0).getReg();
6670 Register Src0 = MI.getOperand(1).getReg();
6671 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6672 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6673 if (DstRC != Src0RC) {
6674 MachineBasicBlock *MBB = MI.getParent();
6675 MachineOperand &Op = MI.getOperand(1);
6676 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6677 }
6678 return CreatedBB;
6679 }
6680
6681 // Legalize SI_INIT_M0
6682 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6683 MachineOperand &Src = MI.getOperand(0);
6684 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6685 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6686 return CreatedBB;
6687 }
6688
6689 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6690 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6691 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6692 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6693 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6694 MI.getOpcode() == AMDGPU::S_WQM_B64) {
6695 MachineOperand &Src = MI.getOperand(1);
6696 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6697 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6698 return CreatedBB;
6699 }
6700
6701 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6702 //
6703 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6704 // scratch memory access. In both cases, the legalization never involves
6705 // conversion to the addr64 form.
6706 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
6707 (isMUBUF(MI) || isMTBUF(MI)))) {
6708 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6709 : AMDGPU::OpName::srsrc;
6710 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6711 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6712 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6713
6714 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6715 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6716 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6717 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6718
6719 return CreatedBB;
6720 }
6721
6722 // Legalize SI_CALL
6723 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6724 MachineOperand *Dest = &MI.getOperand(0);
6725 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6726 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, together
6727 // with the following copies; copies from and to physical registers also
6728 // need to be moved into the loop block.
6729 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6730 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6731
6732 // Also move the copies to physical registers into the loop block
6733 MachineBasicBlock &MBB = *MI.getParent();
6734 MachineBasicBlock::iterator Start(&MI);
6735 while (Start->getOpcode() != FrameSetupOpcode)
6736 --Start;
6737 MachineBasicBlock::iterator End(&MI);
6738 while (End->getOpcode() != FrameDestroyOpcode)
6739 ++End;
6740 // Also include following copies of the return value
6741 ++End;
6742 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6743 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6744 ++End;
6745 CreatedBB =
6746 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6747 }
6748 }
6749
6750 // Legalize s_sleep_var.
6751 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6752 const DebugLoc &DL = MI.getDebugLoc();
6753 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6754 int Src0Idx =
6755 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6756 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6757 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6758 .add(Src0);
6759 Src0.ChangeToRegister(Reg, false);
6760 return nullptr;
6761 }
6762
6763 // Legalize MUBUF instructions.
6764 bool isSoffsetLegal = true;
6765 int SoffsetIdx =
6766 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6767 if (SoffsetIdx != -1) {
6768 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6769 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6770 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6771 isSoffsetLegal = false;
6772 }
6773 }
6774
6775 bool isRsrcLegal = true;
6776 int RsrcIdx =
6777 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6778 if (RsrcIdx != -1) {
6779 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6780 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6781 isRsrcLegal = false;
6782 }
6783 }
6784
6785 // The operands are legal.
6786 if (isRsrcLegal && isSoffsetLegal)
6787 return CreatedBB;
6788
6789 if (!isRsrcLegal) {
6790 // Legalize a VGPR Rsrc
6791 //
6792 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6793 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6794 // a zero-value SRsrc.
6795 //
6796 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6797 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6798 // above.
6799 //
6800 // Otherwise we are on non-ADDR64 hardware, and/or we have
6801 // idxen/offen/bothen and we fall back to a waterfall loop.
6802
6803 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6804 MachineBasicBlock &MBB = *MI.getParent();
6805
6806 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6807 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6808 // This is already an ADDR64 instruction so we need to add the pointer
6809 // extracted from the resource descriptor to the current value of VAddr.
6810 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6811 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6812 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6813
6814 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6815 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6816 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6817
6818 unsigned RsrcPtr, NewSRsrc;
6819 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6820
6821 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6822 const DebugLoc &DL = MI.getDebugLoc();
6823 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6824 .addDef(CondReg0)
6825 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6826 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6827 .addImm(0);
6828
6829 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6830 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6831 .addDef(CondReg1, RegState::Dead)
6832 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6833 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6834 .addReg(CondReg0, RegState::Kill)
6835 .addImm(0);
6836
6837 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6838 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6839 .addReg(NewVAddrLo)
6840 .addImm(AMDGPU::sub0)
6841 .addReg(NewVAddrHi)
6842 .addImm(AMDGPU::sub1);
6843
6844 VAddr->setReg(NewVAddr);
6845 Rsrc->setReg(NewSRsrc);
6846 } else if (!VAddr && ST.hasAddr64()) {
6847 // This instruction is the _OFFSET variant, so we need to convert it to
6848 // ADDR64.
6850 "FIXME: Need to emit flat atomics here");
6851
6852 unsigned RsrcPtr, NewSRsrc;
6853 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6854
6855 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6856 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6857 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6858 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6859 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6860
6861 // Atomics with return have an additional tied operand and are
6862 // missing some of the special bits.
6863 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6864 MachineInstr *Addr64;
6865
6866 if (!VDataIn) {
6867 // Regular buffer load / store.
6868 MachineInstrBuilder MIB =
6869 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6870 .add(*VData)
6871 .addReg(NewVAddr)
6872 .addReg(NewSRsrc)
6873 .add(*SOffset)
6874 .add(*Offset);
6875
6876 if (const MachineOperand *CPol =
6877 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6878 MIB.addImm(CPol->getImm());
6879 }
6880
6881 if (const MachineOperand *TFE =
6882 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6883 MIB.addImm(TFE->getImm());
6884 }
6885
6886 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6887
6888 MIB.cloneMemRefs(MI);
6889 Addr64 = MIB;
6890 } else {
6891 // Atomics with return.
6892 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6893 .add(*VData)
6894 .add(*VDataIn)
6895 .addReg(NewVAddr)
6896 .addReg(NewSRsrc)
6897 .add(*SOffset)
6898 .add(*Offset)
6899 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6900 .cloneMemRefs(MI);
6901 }
6902
6903 MI.removeFromParent();
6904
6905 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6906 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6907 NewVAddr)
6908 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6909 .addImm(AMDGPU::sub0)
6910 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6911 .addImm(AMDGPU::sub1);
6912 } else {
6913 // Legalize a VGPR Rsrc and soffset together.
6914 if (!isSoffsetLegal) {
6915 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6916 CreatedBB =
6917 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6918 return CreatedBB;
6919 }
6920 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
6921 return CreatedBB;
6922 }
6923 }
6924
6925 // Legalize a VGPR soffset.
6926 if (!isSoffsetLegal) {
6927 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6928 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
6929 return CreatedBB;
6930 }
6931 return CreatedBB;
6932}
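// --- Illustrative sketch (not from SIInstrInfo.cpp) -------------------------
// A minimal integer model of the _ADDR64 legalization above, assuming a
// simplified MUBUF addressing rule of base(rsrc) + vaddr + offset. The rewrite
// folds the descriptor's 64-bit base pointer into vaddr and leaves a zero-base
// descriptor (see extractRsrcPtr above), so the effective address is unchanged.
// All names below are hypothetical and exist only for illustration.
#include <cstdint>
namespace addr64_model {
struct Rsrc {
  uint64_t BasePtr;    // bits 0..63 of the 128-bit descriptor
  uint64_t OtherWords; // stride / num-records / flags, ignored here
};
inline uint64_t effectiveAddr(const Rsrc &R, uint64_t VAddr, uint32_t Offset) {
  return R.BasePtr + VAddr + Offset; // simplified addressing model
}
inline void legalizeAddr64(Rsrc &R, uint64_t &VAddr) {
  VAddr += R.BasePtr; // the 64-bit add done above with V_ADD_CO / V_ADDC
  R.BasePtr = 0;      // the rewritten descriptor keeps a zero base in SGPRs
}
} // namespace addr64_model
// effectiveAddr(NewRsrc, NewVAddr, Off) == effectiveAddr(OldRsrc, OldVAddr, Off)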
6933
6934void SIInstrWorklist::insert(MachineInstr *MI) {
6935 InstrList.insert(MI);
6936 // Add MBUF instructions to deferred list.
6937 int RsrcIdx =
6938 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6939 if (RsrcIdx != -1) {
6940 DeferredList.insert(MI);
6941 }
6942}
6943
6944bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
6945 return DeferredList.contains(MI);
6946}
6947
6948void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
6949 MachineDominatorTree *MDT) const {
6950
6951 while (!Worklist.empty()) {
6952 MachineInstr &Inst = *Worklist.top();
6953 Worklist.erase_top();
6954 // Skip MachineInstr in the deferred list.
6955 if (Worklist.isDeferred(&Inst))
6956 continue;
6957 moveToVALUImpl(Worklist, MDT, Inst);
6958 }
6959
6960 // The deferred list of instructions is processed once all the
6961 // MachineInstrs in the worklist are done.
6962 for (MachineInstr *Inst : Worklist.getDeferredList()) {
6963 moveToVALUImpl(Worklist, MDT, *Inst);
6964 assert(Worklist.empty() &&
6965 "Deferred MachineInstr are not supposed to re-populate worklist");
6966 }
6967}
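// --- Illustrative sketch (not from SIInstrInfo.cpp) -------------------------
// The processing order used by moveToVALU above, reduced to a generic pattern:
// drain the main worklist first, skipping deferred items, then visit the
// deferred items once the main list is empty. A rough sketch only; the real
// SIInstrWorklist stores MachineInstr pointers and keeps MBUF instructions
// (those with an srsrc operand) on the deferred list.
#include <set>
#include <vector>
template <typename T, typename Visitor>
void processWithDeferred(std::vector<T> Work, const std::set<T> &Deferred,
                         Visitor Visit) {
  while (!Work.empty()) {
    T Item = Work.back();
    Work.pop_back();
    if (Deferred.count(Item)) // deferred items are skipped for now
      continue;
    Visit(Item);
  }
  for (const T &Item : Deferred) // handled only after the main list drains
    Visit(Item);
}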
6968
6969void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
6970 MachineDominatorTree *MDT,
6971 MachineInstr &Inst) const {
6972
6973 MachineBasicBlock *MBB = Inst.getParent();
6974 if (!MBB)
6975 return;
6976 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6977 unsigned Opcode = Inst.getOpcode();
6978 unsigned NewOpcode = getVALUOp(Inst);
6979 // Handle some special cases
6980 switch (Opcode) {
6981 default:
6982 break;
6983 case AMDGPU::S_ADD_U64_PSEUDO:
6984 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
6985 break;
6986 case AMDGPU::S_SUB_U64_PSEUDO:
6987 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
6988 break;
6989 case AMDGPU::S_ADD_I32:
6990 case AMDGPU::S_SUB_I32: {
6991 // FIXME: The u32 versions currently selected use the carry.
6992 bool Changed;
6993 MachineBasicBlock *CreatedBBTmp = nullptr;
6994 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
6995 if (Changed)
6996 return;
6997
6998 // Default handling
6999 break;
7000 }
7001
7002 case AMDGPU::S_MUL_U64:
7003 // Split s_mul_u64 in 32-bit vector multiplications.
7004 splitScalarSMulU64(Worklist, Inst, MDT);
7005 Inst.eraseFromParent();
7006 return;
7007
7008 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7009 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7010 // This is a special case of s_mul_u64 where all the operands are either
7011 // zero extended or sign extended.
7012 splitScalarSMulPseudo(Worklist, Inst, MDT);
7013 Inst.eraseFromParent();
7014 return;
7015
7016 case AMDGPU::S_AND_B64:
7017 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7018 Inst.eraseFromParent();
7019 return;
7020
7021 case AMDGPU::S_OR_B64:
7022 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7023 Inst.eraseFromParent();
7024 return;
7025
7026 case AMDGPU::S_XOR_B64:
7027 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7028 Inst.eraseFromParent();
7029 return;
7030
7031 case AMDGPU::S_NAND_B64:
7032 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7033 Inst.eraseFromParent();
7034 return;
7035
7036 case AMDGPU::S_NOR_B64:
7037 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7038 Inst.eraseFromParent();
7039 return;
7040
7041 case AMDGPU::S_XNOR_B64:
7042 if (ST.hasDLInsts())
7043 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7044 else
7045 splitScalar64BitXnor(Worklist, Inst, MDT);
7046 Inst.eraseFromParent();
7047 return;
7048
7049 case AMDGPU::S_ANDN2_B64:
7050 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7051 Inst.eraseFromParent();
7052 return;
7053
7054 case AMDGPU::S_ORN2_B64:
7055 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7056 Inst.eraseFromParent();
7057 return;
7058
7059 case AMDGPU::S_BREV_B64:
7060 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7061 Inst.eraseFromParent();
7062 return;
7063
7064 case AMDGPU::S_NOT_B64:
7065 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7066 Inst.eraseFromParent();
7067 return;
7068
7069 case AMDGPU::S_BCNT1_I32_B64:
7070 splitScalar64BitBCNT(Worklist, Inst);
7071 Inst.eraseFromParent();
7072 return;
7073
7074 case AMDGPU::S_BFE_I64:
7075 splitScalar64BitBFE(Worklist, Inst);
7076 Inst.eraseFromParent();
7077 return;
7078
7079 case AMDGPU::S_FLBIT_I32_B64:
7080 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7081 Inst.eraseFromParent();
7082 return;
7083 case AMDGPU::S_FF1_I32_B64:
7084 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7085 Inst.eraseFromParent();
7086 return;
7087
7088 case AMDGPU::S_LSHL_B32:
7089 if (ST.hasOnlyRevVALUShifts()) {
7090 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7091 swapOperands(Inst);
7092 }
7093 break;
7094 case AMDGPU::S_ASHR_I32:
7095 if (ST.hasOnlyRevVALUShifts()) {
7096 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7097 swapOperands(Inst);
7098 }
7099 break;
7100 case AMDGPU::S_LSHR_B32:
7101 if (ST.hasOnlyRevVALUShifts()) {
7102 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7103 swapOperands(Inst);
7104 }
7105 break;
7106 case AMDGPU::S_LSHL_B64:
7107 if (ST.hasOnlyRevVALUShifts()) {
7108 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7109 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7110 : AMDGPU::V_LSHLREV_B64_e64;
7111 swapOperands(Inst);
7112 }
7113 break;
7114 case AMDGPU::S_ASHR_I64:
7115 if (ST.hasOnlyRevVALUShifts()) {
7116 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7117 swapOperands(Inst);
7118 }
7119 break;
7120 case AMDGPU::S_LSHR_B64:
7121 if (ST.hasOnlyRevVALUShifts()) {
7122 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7123 swapOperands(Inst);
7124 }
7125 break;
7126
7127 case AMDGPU::S_ABS_I32:
7128 lowerScalarAbs(Worklist, Inst);
7129 Inst.eraseFromParent();
7130 return;
7131
7132 case AMDGPU::S_CBRANCH_SCC0:
7133 case AMDGPU::S_CBRANCH_SCC1: {
7134 // Clear unused bits of vcc
7135 Register CondReg = Inst.getOperand(1).getReg();
7136 bool IsSCC = CondReg == AMDGPU::SCC;
7137 Register VCC = RI.getVCC();
7138 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7139 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7140 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7141 .addReg(EXEC)
7142 .addReg(IsSCC ? VCC : CondReg);
7143 Inst.removeOperand(1);
7144 } break;
7145
7146 case AMDGPU::S_BFE_U64:
7147 case AMDGPU::S_BFM_B64:
7148 llvm_unreachable("Moving this op to VALU not implemented");
7149
7150 case AMDGPU::S_PACK_LL_B32_B16:
7151 case AMDGPU::S_PACK_LH_B32_B16:
7152 case AMDGPU::S_PACK_HL_B32_B16:
7153 case AMDGPU::S_PACK_HH_B32_B16:
7154 movePackToVALU(Worklist, MRI, Inst);
7155 Inst.eraseFromParent();
7156 return;
7157
7158 case AMDGPU::S_XNOR_B32:
7159 lowerScalarXnor(Worklist, Inst);
7160 Inst.eraseFromParent();
7161 return;
7162
7163 case AMDGPU::S_NAND_B32:
7164 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7165 Inst.eraseFromParent();
7166 return;
7167
7168 case AMDGPU::S_NOR_B32:
7169 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7170 Inst.eraseFromParent();
7171 return;
7172
7173 case AMDGPU::S_ANDN2_B32:
7174 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7175 Inst.eraseFromParent();
7176 return;
7177
7178 case AMDGPU::S_ORN2_B32:
7179 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7180 Inst.eraseFromParent();
7181 return;
7182
7183 // TODO: remove as soon as everything is ready
7184 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7185 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7186 // can only be selected from the uniform SDNode.
7187 case AMDGPU::S_ADD_CO_PSEUDO:
7188 case AMDGPU::S_SUB_CO_PSEUDO: {
7189 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7190 ? AMDGPU::V_ADDC_U32_e64
7191 : AMDGPU::V_SUBB_U32_e64;
7192 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7193
7194 Register CarryInReg = Inst.getOperand(4).getReg();
7195 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7196 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7197 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7198 .addReg(CarryInReg);
7199 }
7200
7201 Register CarryOutReg = Inst.getOperand(1).getReg();
7202
7203 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7204 MRI.getRegClass(Inst.getOperand(0).getReg())));
7205 MachineInstr *CarryOp =
7206 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7207 .addReg(CarryOutReg, RegState::Define)
7208 .add(Inst.getOperand(2))
7209 .add(Inst.getOperand(3))
7210 .addReg(CarryInReg)
7211 .addImm(0);
7212 legalizeOperands(*CarryOp);
7213 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7214 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7215 Inst.eraseFromParent();
7216 }
7217 return;
7218 case AMDGPU::S_UADDO_PSEUDO:
7219 case AMDGPU::S_USUBO_PSEUDO: {
7220 const DebugLoc &DL = Inst.getDebugLoc();
7221 MachineOperand &Dest0 = Inst.getOperand(0);
7222 MachineOperand &Dest1 = Inst.getOperand(1);
7223 MachineOperand &Src0 = Inst.getOperand(2);
7224 MachineOperand &Src1 = Inst.getOperand(3);
7225
7226 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7227 ? AMDGPU::V_ADD_CO_U32_e64
7228 : AMDGPU::V_SUB_CO_U32_e64;
7229 const TargetRegisterClass *NewRC =
7230 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7231 Register DestReg = MRI.createVirtualRegister(NewRC);
7232 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7233 .addReg(Dest1.getReg(), RegState::Define)
7234 .add(Src0)
7235 .add(Src1)
7236 .addImm(0); // clamp bit
7237
7238 legalizeOperands(*NewInstr, MDT);
7239 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7240 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7241 Worklist);
7242 Inst.eraseFromParent();
7243 }
7244 return;
7245
7246 case AMDGPU::S_CSELECT_B32:
7247 case AMDGPU::S_CSELECT_B64:
7248 lowerSelect(Worklist, Inst, MDT);
7249 Inst.eraseFromParent();
7250 return;
7251 case AMDGPU::S_CMP_EQ_I32:
7252 case AMDGPU::S_CMP_LG_I32:
7253 case AMDGPU::S_CMP_GT_I32:
7254 case AMDGPU::S_CMP_GE_I32:
7255 case AMDGPU::S_CMP_LT_I32:
7256 case AMDGPU::S_CMP_LE_I32:
7257 case AMDGPU::S_CMP_EQ_U32:
7258 case AMDGPU::S_CMP_LG_U32:
7259 case AMDGPU::S_CMP_GT_U32:
7260 case AMDGPU::S_CMP_GE_U32:
7261 case AMDGPU::S_CMP_LT_U32:
7262 case AMDGPU::S_CMP_LE_U32:
7263 case AMDGPU::S_CMP_EQ_U64:
7264 case AMDGPU::S_CMP_LG_U64:
7265 case AMDGPU::S_CMP_LT_F32:
7266 case AMDGPU::S_CMP_EQ_F32:
7267 case AMDGPU::S_CMP_LE_F32:
7268 case AMDGPU::S_CMP_GT_F32:
7269 case AMDGPU::S_CMP_LG_F32:
7270 case AMDGPU::S_CMP_GE_F32:
7271 case AMDGPU::S_CMP_O_F32:
7272 case AMDGPU::S_CMP_U_F32:
7273 case AMDGPU::S_CMP_NGE_F32:
7274 case AMDGPU::S_CMP_NLG_F32:
7275 case AMDGPU::S_CMP_NGT_F32:
7276 case AMDGPU::S_CMP_NLE_F32:
7277 case AMDGPU::S_CMP_NEQ_F32:
7278 case AMDGPU::S_CMP_NLT_F32:
7279 case AMDGPU::S_CMP_LT_F16:
7280 case AMDGPU::S_CMP_EQ_F16:
7281 case AMDGPU::S_CMP_LE_F16:
7282 case AMDGPU::S_CMP_GT_F16:
7283 case AMDGPU::S_CMP_LG_F16:
7284 case AMDGPU::S_CMP_GE_F16:
7285 case AMDGPU::S_CMP_O_F16:
7286 case AMDGPU::S_CMP_U_F16:
7287 case AMDGPU::S_CMP_NGE_F16:
7288 case AMDGPU::S_CMP_NLG_F16:
7289 case AMDGPU::S_CMP_NGT_F16:
7290 case AMDGPU::S_CMP_NLE_F16:
7291 case AMDGPU::S_CMP_NEQ_F16:
7292 case AMDGPU::S_CMP_NLT_F16: {
7293 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7294 auto NewInstr =
7295 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7296 .setMIFlags(Inst.getFlags());
7297 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7298 AMDGPU::OpName::src0_modifiers) >= 0) {
7299 NewInstr
7300 .addImm(0) // src0_modifiers
7301 .add(Inst.getOperand(0)) // src0
7302 .addImm(0) // src1_modifiers
7303 .add(Inst.getOperand(1)) // src1
7304 .addImm(0); // clamp
7305 } else {
7306 NewInstr
7307 .add(Inst.getOperand(0))
7308 .add(Inst.getOperand(1));
7309 }
7310 legalizeOperands(*NewInstr, MDT);
7311 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7312 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7313 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7314 Inst.eraseFromParent();
7315 return;
7316 }
7317 case AMDGPU::S_CVT_HI_F32_F16: {
7318 const DebugLoc &DL = Inst.getDebugLoc();
7319 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7320 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7321 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7322 .addImm(16)
7323 .add(Inst.getOperand(1));
7324 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7325 .addImm(0) // src0_modifiers
7326 .addReg(TmpReg)
7327 .addImm(0) // clamp
7328 .addImm(0); // omod
7329
7330 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7331 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7332 Inst.eraseFromParent();
7333 return;
7334 }
7335 case AMDGPU::S_MINIMUM_F32:
7336 case AMDGPU::S_MAXIMUM_F32:
7337 case AMDGPU::S_MINIMUM_F16:
7338 case AMDGPU::S_MAXIMUM_F16: {
7339 const DebugLoc &DL = Inst.getDebugLoc();
7340 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7341 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7342 .addImm(0) // src0_modifiers
7343 .add(Inst.getOperand(1))
7344 .addImm(0) // src1_modifiers
7345 .add(Inst.getOperand(2))
7346 .addImm(0) // clamp
7347 .addImm(0); // omod
7348 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7349
7350 legalizeOperands(*NewInstr, MDT);
7351 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7352 Inst.eraseFromParent();
7353 return;
7354 }
7355 }
7356
7357 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7358 // We cannot move this instruction to the VALU, so we should try to
7359 // legalize its operands instead.
7360 legalizeOperands(Inst, MDT);
7361 return;
7362 }
7363 // Handle converting generic instructions like COPY-to-SGPR into
7364 // COPY-to-VGPR.
7365 if (NewOpcode == Opcode) {
7366 Register DstReg = Inst.getOperand(0).getReg();
7367 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7368
7369 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7370 // hope for the best.
7371 if (Inst.isCopy() && DstReg.isPhysical() &&
7372 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7373 // TODO: Only works for 32 bit registers.
7374 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7375 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7376 .add(Inst.getOperand(1));
7377 Inst.eraseFromParent();
7378 return;
7379 }
7380
7381 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7382 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7383 // Instead of creating a copy where src and dst are the same register
7384 // class, we just replace all uses of dst with src. These kinds of
7385 // copies interfere with the heuristics MachineSink uses to decide
7386 // whether or not to split a critical edge, since the pass assumes
7387 // that copies will end up as machine instructions and not be
7388 // eliminated.
7389 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7390 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7391 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7392 Inst.getOperand(0).setReg(DstReg);
7393 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7394 // these are deleted later, but at -O0 it would leave a suspicious
7395 // looking illegal copy of an undef register.
7396 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7397 Inst.removeOperand(I);
7398 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7399 return;
7400 }
7401 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7402 MRI.replaceRegWith(DstReg, NewDstReg);
7403 legalizeOperands(Inst, MDT);
7404 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7405 return;
7406 }
7407
7408 // Use the new VALU Opcode.
7409 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7410 .setMIFlags(Inst.getFlags());
7411 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7412 // Intersperse VOP3 modifiers among the SALU operands.
7413 NewInstr->addOperand(Inst.getOperand(0));
7414 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7415 AMDGPU::OpName::src0_modifiers) >= 0)
7416 NewInstr.addImm(0);
7417 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7418 MachineOperand Src = Inst.getOperand(1);
7419 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7420 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7421 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7422 else
7423 NewInstr->addOperand(Src);
7424 }
7425
7426 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7427 // We are converting these to a BFE, so we need to add the missing
7428 // operands for the size and offset.
7429 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7430 NewInstr.addImm(0);
7431 NewInstr.addImm(Size);
7432 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7433 // The VALU version adds the second operand to the result, so insert an
7434 // extra 0 operand.
7435 NewInstr.addImm(0);
7436 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7437 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7438 // If we need to move this to VGPRs, we need to unpack the second
7439 // operand back into the 2 separate ones for bit offset and width.
7440 assert(OffsetWidthOp.isImm() &&
7441 "Scalar BFE is only implemented for constant width and offset");
7442 uint32_t Imm = OffsetWidthOp.getImm();
7443
7444 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7445 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
7446 NewInstr.addImm(Offset);
7447 NewInstr.addImm(BitWidth);
7448 } else {
7449 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7450 AMDGPU::OpName::src1_modifiers) >= 0)
7451 NewInstr.addImm(0);
7452 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7453 NewInstr->addOperand(Inst.getOperand(2));
7454 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7455 AMDGPU::OpName::src2_modifiers) >= 0)
7456 NewInstr.addImm(0);
7457 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7458 NewInstr->addOperand(Inst.getOperand(3));
7459 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7460 NewInstr.addImm(0);
7461 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7462 NewInstr.addImm(0);
7463 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7464 NewInstr.addImm(0);
7465 }
7466 } else {
7467 // Just copy the SALU operands.
7468 for (const MachineOperand &Op : Inst.explicit_operands())
7469 NewInstr->addOperand(Op);
7470 }
7471
7472 // Remove any references to SCC. Vector instructions can't read from it, and
7473 // we're just about to add the implicit use / defs of VCC, and we don't want
7474 // both.
7475 for (MachineOperand &Op : Inst.implicit_operands()) {
7476 if (Op.getReg() == AMDGPU::SCC) {
7477 // Only propagate through live-def of SCC.
7478 if (Op.isDef() && !Op.isDead())
7479 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7480 if (Op.isUse())
7481 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7482 }
7483 }
7484 Inst.eraseFromParent();
7485 Register NewDstReg;
7486 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7487 Register DstReg = NewInstr->getOperand(0).getReg();
7488 assert(DstReg.isVirtual());
7489 // Update the destination register class.
7490 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7491 assert(NewDstRC);
7492 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7493 MRI.replaceRegWith(DstReg, NewDstReg);
7494 }
7495 fixImplicitOperands(*NewInstr);
7496 // Legalize the operands
7497 legalizeOperands(*NewInstr, MDT);
7498 if (NewDstReg)
7499 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7500}
7501
7502// Add/sub require special handling to deal with carry outs.
7503std::pair<bool, MachineBasicBlock *>
7504SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7505 MachineDominatorTree *MDT) const {
7506 if (ST.hasAddNoCarry()) {
7507 // Assume there is no user of scc since we don't select this in that case.
7508 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7509 // is used.
7510
7511 MachineBasicBlock &MBB = *Inst.getParent();
7512 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7513
7514 Register OldDstReg = Inst.getOperand(0).getReg();
7515 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7516
7517 unsigned Opc = Inst.getOpcode();
7518 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7519
7520 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7521 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7522
7523 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7524 Inst.removeOperand(3);
7525
7526 Inst.setDesc(get(NewOpc));
7527 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7528 Inst.addImplicitDefUseOperands(*MBB.getParent());
7529 MRI.replaceRegWith(OldDstReg, ResultReg);
7530 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7531
7532 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7533 return std::pair(true, NewBB);
7534 }
7535
7536 return std::pair(false, nullptr);
7537}
7538
7539void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7540 MachineDominatorTree *MDT) const {
7541
7542 MachineBasicBlock &MBB = *Inst.getParent();
7543 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7544 MachineBasicBlock::iterator MII = Inst;
7545 DebugLoc DL = Inst.getDebugLoc();
7546
7547 MachineOperand &Dest = Inst.getOperand(0);
7548 MachineOperand &Src0 = Inst.getOperand(1);
7549 MachineOperand &Src1 = Inst.getOperand(2);
7550 MachineOperand &Cond = Inst.getOperand(3);
7551
7552 Register CondReg = Cond.getReg();
7553 bool IsSCC = (CondReg == AMDGPU::SCC);
7554
7555 // If this is a trivial select where the condition is effectively not SCC
7556 // (CondReg is a source of copy to SCC), then the select is semantically
7557 // equivalent to copying CondReg. Hence, there is no need to create
7558 // V_CNDMASK; we can just reuse CondReg and bail out.
7559 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7560 (Src1.getImm() == 0)) {
7561 MRI.replaceRegWith(Dest.getReg(), CondReg);
7562 return;
7563 }
7564
7565 Register NewCondReg = CondReg;
7566 if (IsSCC) {
7567 const TargetRegisterClass *TC =
7568 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7569 NewCondReg = MRI.createVirtualRegister(TC);
7570
7571 // Now look for the closest SCC def if it is a copy
7572 // replacing the CondReg with the COPY source register
7573 bool CopyFound = false;
7574 for (MachineInstr &CandI :
7575 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7576 Inst.getParent()->rend())) {
7577 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7578 -1) {
7579 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7580 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7581 .addReg(CandI.getOperand(1).getReg());
7582 CopyFound = true;
7583 }
7584 break;
7585 }
7586 }
7587 if (!CopyFound) {
7588 // SCC def is not a copy
7589 // Insert a trivial select instead of creating a copy, because a copy from
7590 // SCC would semantically mean just copying a single bit, but we may need
7591 // the result to be a vector condition mask that needs preserving.
7592 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
7593 : AMDGPU::S_CSELECT_B32;
7594 auto NewSelect =
7595 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7596 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7597 }
7598 }
7599
7600 Register NewDestReg = MRI.createVirtualRegister(
7601 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7602 MachineInstr *NewInst;
7603 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7604 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7605 .addImm(0)
7606 .add(Src1) // False
7607 .addImm(0)
7608 .add(Src0) // True
7609 .addReg(NewCondReg);
7610 } else {
7611 NewInst =
7612 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7613 .add(Src1) // False
7614 .add(Src0) // True
7615 .addReg(NewCondReg);
7616 }
7617 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7618 legalizeOperands(*NewInst, MDT);
7619 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7620}
7621
7622void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7623 MachineInstr &Inst) const {
7624 MachineBasicBlock &MBB = *Inst.getParent();
7625 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7626 MachineBasicBlock::iterator MII = Inst;
7627 DebugLoc DL = Inst.getDebugLoc();
7628
7629 MachineOperand &Dest = Inst.getOperand(0);
7630 MachineOperand &Src = Inst.getOperand(1);
7631 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7632 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7633
7634 unsigned SubOp = ST.hasAddNoCarry() ?
7635 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7636
7637 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7638 .addImm(0)
7639 .addReg(Src.getReg());
7640
7641 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7642 .addReg(Src.getReg())
7643 .addReg(TmpReg);
7644
7645 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7646 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7647}
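// --- Illustrative sketch (not from SIInstrInfo.cpp) -------------------------
// What lowerScalarAbs above computes, on plain 32-bit integers:
// abs(x) == max(x, 0 - x), which is exactly the V_SUB / V_MAX_I32 pair.
// (For x == INT32_MIN the result wraps back to INT32_MIN, matching the
// unsigned subtraction used here.)
#include <algorithm>
#include <cstdint>
inline int32_t absViaSubMax(int32_t X) {
  int32_t Neg = static_cast<int32_t>(0u - static_cast<uint32_t>(X)); // 0 - x
  return std::max(X, Neg);                                           // max(x, -x)
}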
7648
7649void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7650 MachineInstr &Inst) const {
7651 MachineBasicBlock &MBB = *Inst.getParent();
7652 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7653 MachineBasicBlock::iterator MII = Inst;
7654 const DebugLoc &DL = Inst.getDebugLoc();
7655
7656 MachineOperand &Dest = Inst.getOperand(0);
7657 MachineOperand &Src0 = Inst.getOperand(1);
7658 MachineOperand &Src1 = Inst.getOperand(2);
7659
7660 if (ST.hasDLInsts()) {
7661 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7662 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7663 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7664
7665 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7666 .add(Src0)
7667 .add(Src1);
7668
7669 MRI.replaceRegWith(Dest.getReg(), NewDest);
7670 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7671 } else {
7672 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7673 // invert either source and then perform the XOR. If either source is a
7674 // scalar register, then we can leave the inversion on the scalar unit to
7675 // achieve a better distribution of scalar and vector instructions.
7676 bool Src0IsSGPR = Src0.isReg() &&
7677 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7678 bool Src1IsSGPR = Src1.isReg() &&
7679 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7680 MachineInstr *Xor;
7681 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7682 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7683
7684 // Build a pair of scalar instructions and add them to the work list.
7685 // The next iteration over the work list will lower these to the vector
7686 // unit as necessary.
7687 if (Src0IsSGPR) {
7688 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7689 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7690 .addReg(Temp)
7691 .add(Src1);
7692 } else if (Src1IsSGPR) {
7693 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7694 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7695 .add(Src0)
7696 .addReg(Temp);
7697 } else {
7698 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7699 .add(Src0)
7700 .add(Src1);
7701 MachineInstr *Not =
7702 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7703 Worklist.insert(Not);
7704 }
7705
7706 MRI.replaceRegWith(Dest.getReg(), NewDest);
7707
7708 Worklist.insert(Xor);
7709
7710 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7711 }
7712}
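// --- Illustrative sketch (not from SIInstrInfo.cpp) -------------------------
// The identity used by lowerScalarXnor above: ~(x ^ y) == (~x) ^ y == x ^ (~y).
// Any of the three forms may be chosen, which is why the NOT can be left on
// whichever operand already lives in an SGPR.
#include <cstdint>
inline uint32_t xnorDirect(uint32_t X, uint32_t Y) { return ~(X ^ Y); }
inline uint32_t xnorNotX(uint32_t X, uint32_t Y) { return (~X) ^ Y; }
inline uint32_t xnorNotY(uint32_t X, uint32_t Y) { return X ^ (~Y); }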
7713
7714void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7715 MachineInstr &Inst,
7716 unsigned Opcode) const {
7717 MachineBasicBlock &MBB = *Inst.getParent();
7718 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7719 MachineBasicBlock::iterator MII = Inst;
7720 const DebugLoc &DL = Inst.getDebugLoc();
7721
7722 MachineOperand &Dest = Inst.getOperand(0);
7723 MachineOperand &Src0 = Inst.getOperand(1);
7724 MachineOperand &Src1 = Inst.getOperand(2);
7725
7726 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7727 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7728
7729 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7730 .add(Src0)
7731 .add(Src1);
7732
7733 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7734 .addReg(Interm);
7735
7736 Worklist.insert(&Op);
7737 Worklist.insert(&Not);
7738
7739 MRI.replaceRegWith(Dest.getReg(), NewDest);
7740 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7741}
7742
7743void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7744 MachineInstr &Inst,
7745 unsigned Opcode) const {
7746 MachineBasicBlock &MBB = *Inst.getParent();
7747 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7748 MachineBasicBlock::iterator MII = Inst;
7749 const DebugLoc &DL = Inst.getDebugLoc();
7750
7751 MachineOperand &Dest = Inst.getOperand(0);
7752 MachineOperand &Src0 = Inst.getOperand(1);
7753 MachineOperand &Src1 = Inst.getOperand(2);
7754
7755 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7756 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7757
7758 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7759 .add(Src1);
7760
7761 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7762 .add(Src0)
7763 .addReg(Interm);
7764
7765 Worklist.insert(&Not);
7766 Worklist.insert(&Op);
7767
7768 MRI.replaceRegWith(Dest.getReg(), NewDest);
7769 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7770}
7771
7772void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7773 MachineInstr &Inst, unsigned Opcode,
7774 bool Swap) const {
7775 MachineBasicBlock &MBB = *Inst.getParent();
7776 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7777
7778 MachineOperand &Dest = Inst.getOperand(0);
7779 MachineOperand &Src0 = Inst.getOperand(1);
7780 DebugLoc DL = Inst.getDebugLoc();
7781
7782 MachineBasicBlock::iterator MII = Inst;
7783
7784 const MCInstrDesc &InstDesc = get(Opcode);
7785 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7786 MRI.getRegClass(Src0.getReg()) :
7787 &AMDGPU::SGPR_32RegClass;
7788
7789 const TargetRegisterClass *Src0SubRC =
7790 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7791
7792 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7793 AMDGPU::sub0, Src0SubRC);
7794
7795 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7796 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7797 const TargetRegisterClass *NewDestSubRC =
7798 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7799
7800 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7801 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7802
7803 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7804 AMDGPU::sub1, Src0SubRC);
7805
7806 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7807 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7808
7809 if (Swap)
7810 std::swap(DestSub0, DestSub1);
7811
7812 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7813 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7814 .addReg(DestSub0)
7815 .addImm(AMDGPU::sub0)
7816 .addReg(DestSub1)
7817 .addImm(AMDGPU::sub1);
7818
7819 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7820
7821 Worklist.insert(&LoHalf);
7822 Worklist.insert(&HiHalf);
7823
7824 // We don't need to legalizeOperands here because for a single operand, src0
7825 // will support any kind of input.
7826
7827 // Move all users of this moved value.
7828 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7829}
7830
7831// There is not a vector equivalent of s_mul_u64. For this reason, we need to
7832// split the s_mul_u64 in 32-bit vector multiplications.
7833void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7834 MachineInstr &Inst,
7835 MachineDominatorTree *MDT) const {
7836 MachineBasicBlock &MBB = *Inst.getParent();
7837 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7838
7839 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7840 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7841 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7842
7843 MachineOperand &Dest = Inst.getOperand(0);
7844 MachineOperand &Src0 = Inst.getOperand(1);
7845 MachineOperand &Src1 = Inst.getOperand(2);
7846 const DebugLoc &DL = Inst.getDebugLoc();
7847 MachineBasicBlock::iterator MII = Inst;
7848
7849 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7850 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7851 const TargetRegisterClass *Src0SubRC =
7852 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7853 if (RI.isSGPRClass(Src0SubRC))
7854 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7855 const TargetRegisterClass *Src1SubRC =
7856 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7857 if (RI.isSGPRClass(Src1SubRC))
7858 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7859
7860 // First, we extract the low 32-bit and high 32-bit values from each of the
7861 // operands.
7862 MachineOperand Op0L =
7863 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7864 MachineOperand Op1L =
7865 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7866 MachineOperand Op0H =
7867 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7868 MachineOperand Op1H =
7869 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7870
7871 // The multiplication is done as follows:
7872 //
7873 // Op1H Op1L
7874 // * Op0H Op0L
7875 // --------------------
7876 // Op1H*Op0L Op1L*Op0L
7877 // + Op1H*Op0H Op1L*Op0H
7878 // -----------------------------------------
7879 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7880 //
7881 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7882 // value and that would overflow.
7883 // The low 32-bit value is Op1L*Op0L.
7884 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
7885
7886 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7887 MachineInstr *Op1L_Op0H =
7888 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
7889 .add(Op1L)
7890 .add(Op0H);
7891
7892 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7893 MachineInstr *Op1H_Op0L =
7894 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
7895 .add(Op1H)
7896 .add(Op0L);
7897
7898 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7899 MachineInstr *Carry =
7900 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
7901 .add(Op1L)
7902 .add(Op0L);
7903
7904 MachineInstr *LoHalf =
7905 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7906 .add(Op1L)
7907 .add(Op0L);
7908
7909 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7910 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
7911 .addReg(Op1L_Op0H_Reg)
7912 .addReg(Op1H_Op0L_Reg);
7913
7914 MachineInstr *HiHalf =
7915 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
7916 .addReg(AddReg)
7917 .addReg(CarryReg);
7918
7919 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7920 .addReg(DestSub0)
7921 .addImm(AMDGPU::sub0)
7922 .addReg(DestSub1)
7923 .addImm(AMDGPU::sub1);
7924
7925 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7926
7927 // Try to legalize the operands in case we need to swap the order to keep it
7928 // valid.
7929 legalizeOperands(*Op1L_Op0H, MDT);
7930 legalizeOperands(*Op1H_Op0L, MDT);
7931 legalizeOperands(*Carry, MDT);
7932 legalizeOperands(*LoHalf, MDT);
7933 legalizeOperands(*Add, MDT);
7934 legalizeOperands(*HiHalf, MDT);
7935
7936 // Move all users of this moved value.
7937 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7938}
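// --- Illustrative sketch (not from SIInstrInfo.cpp) -------------------------
// The decomposition performed by splitScalarSMulU64 above, on plain integers.
// With a = aH:aL and b = bH:bL, the low 64 bits of a*b are
//   lo32 = lo32(aL*bL)
//   hi32 = lo32(aL*bH) + lo32(aH*bL) + hi32(aL*bL)   (sums taken modulo 2^32)
// and aH*bH is dropped because it only affects bits above 63.
#include <cstdint>
inline uint64_t mul64From32(uint64_t A, uint64_t B) {
  uint32_t AL = static_cast<uint32_t>(A), AH = static_cast<uint32_t>(A >> 32);
  uint32_t BL = static_cast<uint32_t>(B), BH = static_cast<uint32_t>(B >> 32);
  uint64_t LoProd = static_cast<uint64_t>(AL) * BL; // V_MUL_LO + V_MUL_HI of aL*bL
  uint32_t Lo = static_cast<uint32_t>(LoProd);
  uint32_t Carry = static_cast<uint32_t>(LoProd >> 32);
  uint32_t Hi = static_cast<uint32_t>(static_cast<uint64_t>(AL) * BH +
                                      static_cast<uint64_t>(AH) * BL + Carry);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}
// splitScalarSMulPseudo below uses the same split but keeps only the V_MUL_HI
// term for the high half, because its operands are known zero/sign extended.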
7939
7940// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
7941// multiplications.
7942void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
7943 MachineInstr &Inst,
7944 MachineDominatorTree *MDT) const {
7945 MachineBasicBlock &MBB = *Inst.getParent();
7946 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7947
7948 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7949 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7950 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7951
7952 MachineOperand &Dest = Inst.getOperand(0);
7953 MachineOperand &Src0 = Inst.getOperand(1);
7954 MachineOperand &Src1 = Inst.getOperand(2);
7955 const DebugLoc &DL = Inst.getDebugLoc();
7956 MachineBasicBlock::iterator MII = Inst;
7957
7958 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7959 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7960 const TargetRegisterClass *Src0SubRC =
7961 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7962 if (RI.isSGPRClass(Src0SubRC))
7963 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7964 const TargetRegisterClass *Src1SubRC =
7965 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7966 if (RI.isSGPRClass(Src1SubRC))
7967 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7968
7969 // First, we extract the low 32-bit and high 32-bit values from each of the
7970 // operands.
7971 MachineOperand Op0L =
7972 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7973 MachineOperand Op1L =
7974 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7975
7976 unsigned Opc = Inst.getOpcode();
7977 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
7978 ? AMDGPU::V_MUL_HI_U32_e64
7979 : AMDGPU::V_MUL_HI_I32_e64;
7980 MachineInstr *HiHalf =
7981 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
7982
7983 MachineInstr *LoHalf =
7984 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7985 .add(Op1L)
7986 .add(Op0L);
7987
7988 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7989 .addReg(DestSub0)
7990 .addImm(AMDGPU::sub0)
7991 .addReg(DestSub1)
7992 .addImm(AMDGPU::sub1);
7993
7994 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7995
7996 // Try to legalize the operands in case we need to swap the order to keep it
7997 // valid.
7998 legalizeOperands(*HiHalf, MDT);
7999 legalizeOperands(*LoHalf, MDT);
8000
8001 // Move all users of this moved value.
8002 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8003}
8004
8005void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8006 MachineInstr &Inst, unsigned Opcode,
8007 MachineDominatorTree *MDT) const {
8008 MachineBasicBlock &MBB = *Inst.getParent();
8009 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8010
8011 MachineOperand &Dest = Inst.getOperand(0);
8012 MachineOperand &Src0 = Inst.getOperand(1);
8013 MachineOperand &Src1 = Inst.getOperand(2);
8014 DebugLoc DL = Inst.getDebugLoc();
8015
8016 MachineBasicBlock::iterator MII = Inst;
8017
8018 const MCInstrDesc &InstDesc = get(Opcode);
8019 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8020 MRI.getRegClass(Src0.getReg()) :
8021 &AMDGPU::SGPR_32RegClass;
8022
8023 const TargetRegisterClass *Src0SubRC =
8024 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8025 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8026 MRI.getRegClass(Src1.getReg()) :
8027 &AMDGPU::SGPR_32RegClass;
8028
8029 const TargetRegisterClass *Src1SubRC =
8030 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8031
8032 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8033 AMDGPU::sub0, Src0SubRC);
8034 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8035 AMDGPU::sub0, Src1SubRC);
8036 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8037 AMDGPU::sub1, Src0SubRC);
8038 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8039 AMDGPU::sub1, Src1SubRC);
8040
8041 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8042 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8043 const TargetRegisterClass *NewDestSubRC =
8044 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8045
8046 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8047 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8048 .add(SrcReg0Sub0)
8049 .add(SrcReg1Sub0);
8050
8051 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8052 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8053 .add(SrcReg0Sub1)
8054 .add(SrcReg1Sub1);
8055
8056 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8057 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8058 .addReg(DestSub0)
8059 .addImm(AMDGPU::sub0)
8060 .addReg(DestSub1)
8061 .addImm(AMDGPU::sub1);
8062
8063 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8064
8065 Worklist.insert(&LoHalf);
8066 Worklist.insert(&HiHalf);
8067
8068 // Move all users of this moved value.
8069 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8070}
8071
8072void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8073 MachineInstr &Inst,
8074 MachineDominatorTree *MDT) const {
8075 MachineBasicBlock &MBB = *Inst.getParent();
8076 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8077
8078 MachineOperand &Dest = Inst.getOperand(0);
8079 MachineOperand &Src0 = Inst.getOperand(1);
8080 MachineOperand &Src1 = Inst.getOperand(2);
8081 const DebugLoc &DL = Inst.getDebugLoc();
8082
8083 MachineBasicBlock::iterator MII = Inst;
8084
8085 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8086
8087 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8088
8089 MachineOperand* Op0;
8090 MachineOperand* Op1;
8091
8092 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8093 Op0 = &Src0;
8094 Op1 = &Src1;
8095 } else {
8096 Op0 = &Src1;
8097 Op1 = &Src0;
8098 }
8099
8100 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8101 .add(*Op0);
8102
8103 Register NewDest = MRI.createVirtualRegister(DestRC);
8104
8105 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8106 .addReg(Interm)
8107 .add(*Op1);
8108
8109 MRI.replaceRegWith(Dest.getReg(), NewDest);
8110
8111 Worklist.insert(&Xor);
8112}
8113
8114void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8115 MachineInstr &Inst) const {
8116 MachineBasicBlock &MBB = *Inst.getParent();
8117 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8118
8119 MachineBasicBlock::iterator MII = Inst;
8120 const DebugLoc &DL = Inst.getDebugLoc();
8121
8122 MachineOperand &Dest = Inst.getOperand(0);
8123 MachineOperand &Src = Inst.getOperand(1);
8124
8125 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8126 const TargetRegisterClass *SrcRC = Src.isReg() ?
8127 MRI.getRegClass(Src.getReg()) :
8128 &AMDGPU::SGPR_32RegClass;
8129
8130 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8131 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8132
8133 const TargetRegisterClass *SrcSubRC =
8134 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8135
8136 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8137 AMDGPU::sub0, SrcSubRC);
8138 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8139 AMDGPU::sub1, SrcSubRC);
8140
8141 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8142
8143 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8144
8145 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8146
8147 // We don't need to legalize operands here. src0 for either instruction can be
8148 // an SGPR, and the second input is unused or determined here.
8149 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8150}
8151
8152void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8153 MachineInstr &Inst) const {
8154 MachineBasicBlock &MBB = *Inst.getParent();
8155 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8156 MachineBasicBlock::iterator MII = Inst;
8157 const DebugLoc &DL = Inst.getDebugLoc();
8158
8159 MachineOperand &Dest = Inst.getOperand(0);
8160 uint32_t Imm = Inst.getOperand(2).getImm();
8161 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8162 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8163
8164 (void) Offset;
8165
8166 // Only sext_inreg cases handled.
8167 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8168 Offset == 0 && "Not implemented");
8169
8170 if (BitWidth < 32) {
8171 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8172 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8173 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8174
8175 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8176 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8177 .addImm(0)
8178 .addImm(BitWidth);
8179
8180 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8181 .addImm(31)
8182 .addReg(MidRegLo);
8183
8184 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8185 .addReg(MidRegLo)
8186 .addImm(AMDGPU::sub0)
8187 .addReg(MidRegHi)
8188 .addImm(AMDGPU::sub1);
8189
8190 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8191 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8192 return;
8193 }
8194
8195 MachineOperand &Src = Inst.getOperand(1);
8196 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8197 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8198
8199 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8200 .addImm(31)
8201 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8202
8203 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8204 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8205 .addImm(AMDGPU::sub0)
8206 .addReg(TmpReg)
8207 .addImm(AMDGPU::sub1);
8208
8209 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8210 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8211}
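// --- Illustrative sketch (not from SIInstrInfo.cpp) -------------------------
// What splitScalar64BitBFE above computes for the case it handles (S_BFE_I64
// with offset 0 and width 1..32): sign-extend the low `Width` bits of the low
// half, then broadcast that value's sign bit into the high half.
#include <cstdint>
inline uint64_t sextInReg64(uint64_t Src, unsigned Width) { // assumes 1 <= Width <= 32
  uint32_t Lo = static_cast<uint32_t>(Src);
  if (Width < 32) {
    unsigned Sh = 32 - Width; // V_BFE_I32(lo, 0, Width)
    Lo = static_cast<uint32_t>(static_cast<int32_t>(Lo << Sh) >> Sh);
  }
  uint32_t Hi = static_cast<uint32_t>(static_cast<int32_t>(Lo) >> 31); // V_ASHRREV_I32 by 31
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}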
8212
8213void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8214 MachineInstr &Inst, unsigned Opcode,
8215 MachineDominatorTree *MDT) const {
8216 // (S_FLBIT_I32_B64 hi:lo) ->
8217 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8218 // (S_FF1_I32_B64 hi:lo) ->
8219 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
8220
8221 MachineBasicBlock &MBB = *Inst.getParent();
8222 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8223 MachineBasicBlock::iterator MII = Inst;
8224 const DebugLoc &DL = Inst.getDebugLoc();
8225
8226 MachineOperand &Dest = Inst.getOperand(0);
8227 MachineOperand &Src = Inst.getOperand(1);
8228
8229 const MCInstrDesc &InstDesc = get(Opcode);
8230
8231 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8232 unsigned OpcodeAdd =
8233 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8234
8235 const TargetRegisterClass *SrcRC =
8236 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8237 const TargetRegisterClass *SrcSubRC =
8238 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8239
8240 MachineOperand SrcRegSub0 =
8241 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8242 MachineOperand SrcRegSub1 =
8243 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8244
8245 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8246 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8247 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8248 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8249
8250 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8251
8252 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8253
8254 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8255 .addReg(IsCtlz ? MidReg1 : MidReg2)
8256 .addImm(32)
8257 .addImm(1); // enable clamp
8258
8259 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8260 .addReg(MidReg3)
8261 .addReg(IsCtlz ? MidReg2 : MidReg1);
8262
8263 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8264
8265 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8266}
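// --- Illustrative sketch (not from SIInstrInfo.cpp) -------------------------
// The two expansions shown in the comment above, modelled on plain integers.
// ffbh/ffbl return 0xffffffff for a zero input (as V_FFBH_U32 / V_FFBL_B32 do),
// and the +32 on the "wrong" half saturates, so a zero half never wins the min
// against a non-zero half. For a zero 64-bit input the result stays 0xffffffff.
#include <algorithm>
#include <cstdint>
namespace count_model {
inline uint32_t ffbh(uint32_t V) { // leading-zero count, 0xffffffff if V == 0
  if (!V)
    return 0xffffffffu;
  uint32_t N = 0;
  while (!(V & 0x80000000u)) {
    V <<= 1;
    ++N;
  }
  return N;
}
inline uint32_t ffbl(uint32_t V) { // trailing-zero count, 0xffffffff if V == 0
  if (!V)
    return 0xffffffffu;
  uint32_t N = 0;
  while (!(V & 1u)) {
    V >>= 1;
    ++N;
  }
  return N;
}
inline uint32_t uaddsat(uint32_t A, uint32_t B) {
  uint64_t S = static_cast<uint64_t>(A) + B;
  return S > 0xffffffffu ? 0xffffffffu : static_cast<uint32_t>(S);
}
inline uint32_t ctlz64(uint64_t V) { // S_FLBIT_I32_B64 expansion
  uint32_t Hi = static_cast<uint32_t>(V >> 32), Lo = static_cast<uint32_t>(V);
  return std::min(ffbh(Hi), uaddsat(ffbh(Lo), 32));
}
inline uint32_t cttz64(uint64_t V) { // S_FF1_I32_B64 expansion
  uint32_t Hi = static_cast<uint32_t>(V >> 32), Lo = static_cast<uint32_t>(V);
  return std::min(uaddsat(ffbl(Hi), 32), ffbl(Lo));
}
} // namespace count_model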
8267
8268void SIInstrInfo::addUsersToMoveToVALUWorklist(
8269 Register DstReg, MachineRegisterInfo &MRI,
8270 SIInstrWorklist &Worklist) const {
8271 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8272 E = MRI.use_end(); I != E;) {
8273 MachineInstr &UseMI = *I->getParent();
8274
8275 unsigned OpNo = 0;
8276
8277 switch (UseMI.getOpcode()) {
8278 case AMDGPU::COPY:
8279 case AMDGPU::WQM:
8280 case AMDGPU::SOFT_WQM:
8281 case AMDGPU::STRICT_WWM:
8282 case AMDGPU::STRICT_WQM:
8283 case AMDGPU::REG_SEQUENCE:
8284 case AMDGPU::PHI:
8285 case AMDGPU::INSERT_SUBREG:
8286 break;
8287 default:
8288 OpNo = I.getOperandNo();
8289 break;
8290 }
8291
8292 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8293 Worklist.insert(&UseMI);
8294
8295 do {
8296 ++I;
8297 } while (I != E && I->getParent() == &UseMI);
8298 } else {
8299 ++I;
8300 }
8301 }
8302}
8303
8304void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8305 MachineRegisterInfo &MRI,
8306 MachineInstr &Inst) const {
8307 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8308 MachineBasicBlock *MBB = Inst.getParent();
8309 MachineOperand &Src0 = Inst.getOperand(1);
8310 MachineOperand &Src1 = Inst.getOperand(2);
8311 const DebugLoc &DL = Inst.getDebugLoc();
8312
8313 switch (Inst.getOpcode()) {
8314 case AMDGPU::S_PACK_LL_B32_B16: {
8315 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8316 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8317
8318 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8319 // 0.
8320 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8321 .addImm(0xffff);
8322
8323 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8324 .addReg(ImmReg, RegState::Kill)
8325 .add(Src0);
8326
8327 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8328 .add(Src1)
8329 .addImm(16)
8330 .addReg(TmpReg, RegState::Kill);
8331 break;
8332 }
8333 case AMDGPU::S_PACK_LH_B32_B16: {
8334 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8335 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8336 .addImm(0xffff);
8337 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8338 .addReg(ImmReg, RegState::Kill)
8339 .add(Src0)
8340 .add(Src1);
8341 break;
8342 }
8343 case AMDGPU::S_PACK_HL_B32_B16: {
8344 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8345 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8346 .addImm(16)
8347 .add(Src0);
8348 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8349 .add(Src1)
8350 .addImm(16)
8351 .addReg(TmpReg, RegState::Kill);
8352 break;
8353 }
8354 case AMDGPU::S_PACK_HH_B32_B16: {
8355 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8356 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8357 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8358 .addImm(16)
8359 .add(Src0);
8360 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8361 .addImm(0xffff0000);
8362 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8363 .add(Src1)
8364 .addReg(ImmReg, RegState::Kill)
8365 .addReg(TmpReg, RegState::Kill);
8366 break;
8367 }
8368 default:
8369 llvm_unreachable("unhandled s_pack_* instruction");
8370 }
8371
8372 MachineOperand &Dest = Inst.getOperand(0);
8373 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8374 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8375}
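// --- Illustrative sketch (not from SIInstrInfo.cpp) -------------------------
// The semantics of the four s_pack_* variants lowered above: pick one 16-bit
// half from each source and pack them as (src1_half << 16) | src0_half, which
// is what the AND / BFI / LSHR / LSHL_OR sequences compute.
#include <cstdint>
inline uint32_t lo16(uint32_t V) { return V & 0xffffu; }
inline uint32_t hi16(uint32_t V) { return V >> 16; }
inline uint32_t packLL(uint32_t S0, uint32_t S1) { return lo16(S0) | (lo16(S1) << 16); }
inline uint32_t packLH(uint32_t S0, uint32_t S1) { return lo16(S0) | (hi16(S1) << 16); }
inline uint32_t packHL(uint32_t S0, uint32_t S1) { return hi16(S0) | (lo16(S1) << 16); }
inline uint32_t packHH(uint32_t S0, uint32_t S1) { return hi16(S0) | (hi16(S1) << 16); }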
8376
8377void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8378 MachineInstr &SCCDefInst,
8379 SIInstrWorklist &Worklist,
8380 Register NewCond) const {
8381
8382 // Ensure that def inst defines SCC, which is still live.
8383 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8384 !Op.isDead() && Op.getParent() == &SCCDefInst);
8385 SmallVector<MachineInstr *, 4> CopyToDelete;
8386 // This assumes that all the users of SCC are in the same block
8387 // as the SCC def.
8388 for (MachineInstr &MI : // Skip the def inst itself.
8389 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8390 SCCDefInst.getParent()->end())) {
8391 // Check if SCC is used first.
8392 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8393 if (SCCIdx != -1) {
8394 if (MI.isCopy()) {
8395 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8396 Register DestReg = MI.getOperand(0).getReg();
8397
8398 MRI.replaceRegWith(DestReg, NewCond);
8399 CopyToDelete.push_back(&MI);
8400 } else {
8401
8402 if (NewCond.isValid())
8403 MI.getOperand(SCCIdx).setReg(NewCond);
8404
8405 Worklist.insert(&MI);
8406 }
8407 }
8408 // Exit if we find another SCC def.
8409 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8410 break;
8411 }
8412 for (auto &Copy : CopyToDelete)
8413 Copy->eraseFromParent();
8414}
8415
8416// Instructions that use SCC may be converted to VALU instructions. When that
8417// happens, the SCC register is changed to VCC_LO. The instruction that defines
8418// SCC must be changed to an instruction that defines VCC. This function makes
8419// sure that the instruction that defines SCC is added to the moveToVALU
8420// worklist.
8421void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8422 SIInstrWorklist &Worklist) const {
8423 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8424 // then there is nothing to do because the defining instruction has been
8425 // converted to a VALU already. If SCC then that instruction needs to be
8426 // converted to a VALU.
8427 for (MachineInstr &MI :
8428 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8429 SCCUseInst->getParent()->rend())) {
8430 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8431 break;
8432 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8433 Worklist.insert(&MI);
8434 break;
8435 }
8436 }
8437}
8438
8439const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8440 const MachineInstr &Inst) const {
8441 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8442
8443 switch (Inst.getOpcode()) {
8444 // For target instructions, getOpRegClass just returns the virtual register
8445 // class associated with the operand, so we need to find an equivalent VGPR
8446 // register class in order to move the instruction to the VALU.
8447 case AMDGPU::COPY:
8448 case AMDGPU::PHI:
8449 case AMDGPU::REG_SEQUENCE:
8450 case AMDGPU::INSERT_SUBREG:
8451 case AMDGPU::WQM:
8452 case AMDGPU::SOFT_WQM:
8453 case AMDGPU::STRICT_WWM:
8454 case AMDGPU::STRICT_WQM: {
8455 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8456 if (RI.isAGPRClass(SrcRC)) {
8457 if (RI.isAGPRClass(NewDstRC))
8458 return nullptr;
8459
8460 switch (Inst.getOpcode()) {
8461 case AMDGPU::PHI:
8462 case AMDGPU::REG_SEQUENCE:
8463 case AMDGPU::INSERT_SUBREG:
8464 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8465 break;
8466 default:
8467 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8468 }
8469
8470 if (!NewDstRC)
8471 return nullptr;
8472 } else {
8473 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8474 return nullptr;
8475
8476 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8477 if (!NewDstRC)
8478 return nullptr;
8479 }
8480
8481 return NewDstRC;
8482 }
8483 default:
8484 return NewDstRC;
8485 }
8486}
8487
8488// Find the one SGPR operand we are allowed to use.
8489Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8490 int OpIndices[3]) const {
8491 const MCInstrDesc &Desc = MI.getDesc();
8492
8493 // Find the one SGPR operand we are allowed to use.
8494 //
8495 // First we need to consider the instruction's operand requirements before
8496 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8497 // of VCC, but we are still bound by the constant bus requirement to only use
8498 // one.
8499 //
8500 // If the operand's class is an SGPR, we can never move it.
8501
8502 Register SGPRReg = findImplicitSGPRRead(MI);
8503 if (SGPRReg)
8504 return SGPRReg;
8505
8506 Register UsedSGPRs[3] = {Register()};
8507 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8508
8509 for (unsigned i = 0; i < 3; ++i) {
8510 int Idx = OpIndices[i];
8511 if (Idx == -1)
8512 break;
8513
8514 const MachineOperand &MO = MI.getOperand(Idx);
8515 if (!MO.isReg())
8516 continue;
8517
8518 // Is this operand statically required to be an SGPR based on the operand
8519 // constraints?
8520 const TargetRegisterClass *OpRC =
8521 RI.getRegClass(Desc.operands()[Idx].RegClass);
8522 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8523 if (IsRequiredSGPR)
8524 return MO.getReg();
8525
8526 // If this could be a VGPR or an SGPR, check the dynamic register class.
8527 Register Reg = MO.getReg();
8528 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8529 if (RI.isSGPRClass(RegRC))
8530 UsedSGPRs[i] = Reg;
8531 }
8532
8533 // We don't have a required SGPR operand, so we have a bit more freedom in
8534 // selecting operands to move.
8535
8536 // Try to select the most used SGPR. If an SGPR is equal to one of the
8537 // others, we choose that.
8538 //
8539 // e.g.
8540 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8541 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8542
8543 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8544 // prefer those.
8545
8546 if (UsedSGPRs[0]) {
8547 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8548 SGPRReg = UsedSGPRs[0];
8549 }
8550
8551 if (!SGPRReg && UsedSGPRs[1]) {
8552 if (UsedSGPRs[1] == UsedSGPRs[2])
8553 SGPRReg = UsedSGPRs[1];
8554 }
8555
8556 return SGPRReg;
8557}
8558
8560 unsigned OperandName) const {
8561 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8562 if (Idx == -1)
8563 return nullptr;
8564
8565 return &MI.getOperand(Idx);
8566}
8567
8573 return (Format << 44) |
8574 (1ULL << 56) | // RESOURCE_LEVEL = 1
8575 (3ULL << 60); // OOB_SELECT = 3
8576 }
8577
8578 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8579 if (ST.isAmdHsaOS()) {
8580 // Set ATC = 1. GFX9 doesn't have this bit.
8582 RsrcDataFormat |= (1ULL << 56);
8583
8584 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8585 // Note that this also disables TC L2 and therefore decreases performance.
8587 RsrcDataFormat |= (2ULL << 59);
8588 }
8589
8590 return RsrcDataFormat;
8591}
8592
8596 0xffffffff; // Size;
8597
8598 // GFX9 doesn't have ELEMENT_SIZE.
8600 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8601 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8602 }
8603
8604 // IndexStride = 64 for wave64, 32 for wave32 (encoded as 3 and 2).
8605 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
8606 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8607
8608 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8609 // Clear them unless we want a huge stride.
8612 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8613
8614 return Rsrc23;
8615}
8616
8618 unsigned Opc = MI.getOpcode();
8619
8620 return isSMRD(Opc);
8621}
8622
8624 return get(Opc).mayLoad() &&
8625 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8626}
8627
8629 int &FrameIndex) const {
8630 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8631 if (!Addr || !Addr->isFI())
8632 return Register();
8633
8634 assert(!MI.memoperands_empty() &&
8635 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8636
8637 FrameIndex = Addr->getIndex();
8638 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8639}
8640
8642 int &FrameIndex) const {
8643 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8644 assert(Addr && Addr->isFI());
8645 FrameIndex = Addr->getIndex();
8646 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8647}
8648
8650 int &FrameIndex) const {
8651 if (!MI.mayLoad())
8652 return Register();
8653
8654 if (isMUBUF(MI) || isVGPRSpill(MI))
8655 return isStackAccess(MI, FrameIndex);
8656
8657 if (isSGPRSpill(MI))
8658 return isSGPRStackAccess(MI, FrameIndex);
8659
8660 return Register();
8661}
8662
8664 int &FrameIndex) const {
8665 if (!MI.mayStore())
8666 return Register();
8667
8668 if (isMUBUF(MI) || isVGPRSpill(MI))
8669 return isStackAccess(MI, FrameIndex);
8670
8671 if (isSGPRSpill(MI))
8672 return isSGPRStackAccess(MI, FrameIndex);
8673
8674 return Register();
8675}
8676
8678 unsigned Size = 0;
8680 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8681 while (++I != E && I->isInsideBundle()) {
8682 assert(!I->isBundle() && "No nested bundle!");
8684 }
8685
8686 return Size;
8687}
8688
8690 unsigned Opc = MI.getOpcode();
8692 unsigned DescSize = Desc.getSize();
8693
8694 // If we have a definitive size, we can use it. Otherwise we need to inspect
8695 // the operands to know the size.
8696 if (isFixedSize(MI)) {
8697 unsigned Size = DescSize;
8698
8699 // If we hit the buggy offset, an extra nop will be inserted in MC so
8700 // estimate the worst case.
8701 if (MI.isBranch() && ST.hasOffset3fBug())
8702 Size += 4;
8703
8704 return Size;
8705 }
8706
8707 // Instructions may have a 32-bit literal encoded after them. Check
8708 // operands that could ever be literals.
8709 if (isVALU(MI) || isSALU(MI)) {
8710 if (isDPP(MI))
8711 return DescSize;
8712 bool HasLiteral = false;
8713 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8714 const MachineOperand &Op = MI.getOperand(I);
8715 const MCOperandInfo &OpInfo = Desc.operands()[I];
8716 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8717 HasLiteral = true;
8718 break;
8719 }
8720 }
8721 return HasLiteral ? DescSize + 4 : DescSize;
8722 }
8723
8724 // Check whether we have extra NSA words.
8725 if (isMIMG(MI)) {
8726 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8727 if (VAddr0Idx < 0)
8728 return 8;
8729
8730 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8731 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8732 }
8733
8734 switch (Opc) {
8735 case TargetOpcode::BUNDLE:
8736 return getInstBundleSize(MI);
8737 case TargetOpcode::INLINEASM:
8738 case TargetOpcode::INLINEASM_BR: {
8739 const MachineFunction *MF = MI.getParent()->getParent();
8740 const char *AsmStr = MI.getOperand(0).getSymbolName();
8741 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8742 }
8743 default:
8744 if (MI.isMetaInstruction())
8745 return 0;
8746 return DescSize;
8747 }
8748}
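// A small worked example of the size logic above (opcode and literal value
// are illustrative, not tied to a particular subtarget): a 32-bit encoded
// VALU such as V_ADD_F32_e32 has DescSize == 4. If one of its source
// operands is a constant that is not an inline constant (say 0x40490fdb),
// a 32-bit literal dword follows the encoding and the reported size becomes
// DescSize + 4 == 8. DPP forms return DescSize unchanged, as handled above.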
8749
8751 if (!isFLAT(MI))
8752 return false;
8753
8754 if (MI.memoperands_empty())
8755 return true;
8756
8757 for (const MachineMemOperand *MMO : MI.memoperands()) {
8758 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8759 return true;
8760 }
8761 return false;
8762}
8763
8765 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
8766}
8767
8769 MachineBasicBlock *IfEnd) const {
8771 assert(TI != IfEntry->end());
8772
8773 MachineInstr *Branch = &(*TI);
8774 MachineFunction *MF = IfEntry->getParent();
8776
8777 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8778 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8779 MachineInstr *SIIF =
8780 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8781 .add(Branch->getOperand(0))
8782 .add(Branch->getOperand(1));
8783 MachineInstr *SIEND =
8784 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8785 .addReg(DstReg);
8786
8787 IfEntry->erase(TI);
8788 IfEntry->insert(IfEntry->end(), SIIF);
8789 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8790 }
8791}
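// Rough before/after sketch of the if-region transform above (MIR syntax is
// approximate and %cond/%dst are illustrative names):
//
//   Before, as the terminator of IfEntry:
//     SI_NON_UNIFORM_BRCOND_PSEUDO %cond, %bb.IfEnd
//   After, at the end of IfEntry:
//     %dst = SI_IF %cond, %bb.IfEnd
//   and at the first non-PHI position of IfEnd:
//     SI_END_CF %dst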
8792
8794 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
8796 // We expect 2 terminators, one conditional and one unconditional.
8797 assert(TI != LoopEnd->end());
8798
8799 MachineInstr *Branch = &(*TI);
8800 MachineFunction *MF = LoopEnd->getParent();
8802
8803 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8804
8805 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8806 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
8807 MachineInstrBuilder HeaderPHIBuilder =
8808 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8809 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8810 if (PMBB == LoopEnd) {
8811 HeaderPHIBuilder.addReg(BackEdgeReg);
8812 } else {
8813 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
8814 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8815 ZeroReg, 0);
8816 HeaderPHIBuilder.addReg(ZeroReg);
8817 }
8818 HeaderPHIBuilder.addMBB(PMBB);
8819 }
8820 MachineInstr *HeaderPhi = HeaderPHIBuilder;
8821 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8822 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
8823 .addReg(DstReg)
8824 .add(Branch->getOperand(0));
8825 MachineInstr *SILOOP =
8826 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8827 .addReg(BackEdgeReg)
8828 .addMBB(LoopEntry);
8829
8830 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8831 LoopEnd->erase(TI);
8832 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8833 LoopEnd->insert(LoopEnd->end(), SILOOP);
8834 }
8835}
8836
8839 static const std::pair<int, const char *> TargetIndices[] = {
8840 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8841 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8842 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8843 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8844 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8845 return ArrayRef(TargetIndices);
8846}
8847
8848/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
8849/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8852 const ScheduleDAG *DAG) const {
8853 return new GCNHazardRecognizer(DAG->MF);
8854}
8855
8856/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8857/// pass.
8860 return new GCNHazardRecognizer(MF);
8861}
8862
8863// Called during:
8864// - pre-RA scheduling and post-RA scheduling
8867 const ScheduleDAGMI *DAG) const {
8868 // Borrowed from Arm Target
8869 // We would like to restrict this hazard recognizer to only
8870 // post-RA scheduling; we can tell that we're post-RA because we don't
8871 // track VRegLiveness.
8872 if (!DAG->hasVRegLiveness())
8873 return new GCNHazardRecognizer(DAG->MF);
8875}
8876
8877std::pair<unsigned, unsigned>
8879 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8880}
8881
8884 static const std::pair<unsigned, const char *> TargetFlags[] = {
8885 { MO_GOTPCREL, "amdgpu-gotprel" },
8886 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8887 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8888 { MO_REL32_LO, "amdgpu-rel32-lo" },
8889 { MO_REL32_HI, "amdgpu-rel32-hi" },
8890 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8891 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8892 };
8893
8894 return ArrayRef(TargetFlags);
8895}
8896
8899 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8900 {
8901 {MONoClobber, "amdgpu-noclobber"},
8902 {MOLastUse, "amdgpu-last-use"},
8903 };
8904
8905 return ArrayRef(TargetFlags);
8906}
8907
8909 const MachineFunction &MF) const {
8911 assert(SrcReg.isVirtual());
8912 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8913 return AMDGPU::WWM_COPY;
8914
8915 return AMDGPU::COPY;
8916}
8917
8919 Register Reg) const {
8920 // We need to handle instructions which may be inserted during register
8921 // allocation to handle the prolog. The initial prolog instruction may have
8922 // been separated from the start of the block by spills and copies inserted
8923 // as needed by the prolog. However, the insertions for scalar registers can
8924 // always be placed at the BB top as they are independent of the exec mask
8925 // value.
8926 bool IsNullOrVectorRegister = true;
8927 if (Reg) {
8928 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8929 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8930 }
8931
8932 uint16_t Opcode = MI.getOpcode();
8933 // FIXME: Copies inserted in the block prolog for live-range split should also
8934 // be included.
8935 return IsNullOrVectorRegister &&
8936 (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8937 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8938}
8939
8943 const DebugLoc &DL,
8944 Register DestReg) const {
8945 if (ST.hasAddNoCarry())
8946 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8947
8949 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8950 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8951
8952 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8953 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8954}
8955
8958 const DebugLoc &DL,
8959 Register DestReg,
8960 RegScavenger &RS) const {
8961 if (ST.hasAddNoCarry())
8962 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
8963
8964 // If available, prefer to use vcc.
8965 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
8966 ? Register(RI.getVCC())
8968 *RI.getBoolRC(), I, /* RestoreAfter */ false,
8969 0, /* AllowSpill */ false);
8970
8971 // TODO: Users need to deal with this.
8972 if (!UnusedCarry.isValid())
8973 return MachineInstrBuilder();
8974
8975 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8976 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8977}
8978
8979bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
8980 switch (Opcode) {
8981 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
8982 case AMDGPU::SI_KILL_I1_TERMINATOR:
8983 return true;
8984 default:
8985 return false;
8986 }
8987}
8988
8990 switch (Opcode) {
8991 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
8992 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
8993 case AMDGPU::SI_KILL_I1_PSEUDO:
8994 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
8995 default:
8996 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
8997 }
8998}
8999
9000bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9001 return Imm <= getMaxMUBUFImmOffset(ST);
9002}
9003
9005 // On GFX12 the field is a 24-bit signed byte offset, but only non-negative values are usable here, hence 23 bits.
9006 const unsigned OffsetBits =
9007 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9008 return (1 << OffsetBits) - 1;
9009}
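// Concrete values implied by the code above: pre-GFX12 the immediate offset
// field is 12 bits wide, so the maximum is (1 << 12) - 1 == 4095; on GFX12
// the usable non-negative part of the field is 23 bits, giving
// (1 << 23) - 1 == 8388607.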
9010
9012 if (!ST.isWave32())
9013 return;
9014
9015 if (MI.isInlineAsm())
9016 return;
9017
9018 for (auto &Op : MI.implicit_operands()) {
9019 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9020 Op.setReg(AMDGPU::VCC_LO);
9021 }
9022}
9023
9025 if (!isSMRD(MI))
9026 return false;
9027
9028 // Check that it is using a buffer resource.
9029 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9030 if (Idx == -1) // e.g. s_memtime
9031 return false;
9032
9033 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9034 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9035}
9036
9037// Given Imm, split it into the values to put into the SOffset and ImmOffset
9038// fields in an MUBUF instruction. Return false if it is not possible (due to a
9039// hardware bug needing a workaround).
9040//
9041// The required alignment ensures that individual address components remain
9042// aligned if they are aligned to begin with. It also ensures that additional
9043// offsets within the given alignment can be added to the resulting ImmOffset.
9045 uint32_t &ImmOffset, Align Alignment) const {
9046 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9047 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9048 uint32_t Overflow = 0;
9049
9050 if (Imm > MaxImm) {
9051 if (Imm <= MaxImm + 64) {
9052 // Use an SOffset inline constant for 4..64
9053 Overflow = Imm - MaxImm;
9054 Imm = MaxImm;
9055 } else {
9056 // Try to keep the same value in SOffset for adjacent loads, so that
9057 // the corresponding register contents can be re-used.
9058 //
9059 // Load values with all low-bits (except for alignment bits) set into
9060 // SOffset, so that a larger range of values can be covered using
9061 // s_movk_i32.
9062 //
9063 // Atomic operations fail to work correctly when individual address
9064 // components are unaligned, even if their sum is aligned.
9065 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9066 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9067 Imm = Low;
9068 Overflow = High - Alignment.value();
9069 }
9070 }
9071
9072 if (Overflow > 0) {
9073 // There is a hardware bug in SI and CI which prevents address clamping in
9074 // MUBUF instructions from working correctly with SOffsets. The immediate
9075 // offset is unaffected.
9077 return false;
9078
9079 // It is not possible to set immediate in SOffset field on some targets.
9080 if (ST.hasRestrictedSOffset())
9081 return false;
9082 }
9083
9084 ImmOffset = Imm;
9085 SOffset = Overflow;
9086 return true;
9087}
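// Worked example for the split above, assuming a pre-GFX12 maximum immediate
// offset of 4095, Alignment == 4 (so MaxImm == 4092), and a target where a
// non-zero SOffset is usable:
//   Imm == 4100: 4100 <= MaxImm + 64, so ImmOffset = 4092 and SOffset = 8
//                (an SOffset inline constant).
//   Imm == 5000: High = (5000 + 4) & ~4095 == 4096 and
//                Low = (5000 + 4) & 4095 == 908, so ImmOffset = 908 and
//                SOffset = 4096 - 4 = 4092; their sum is still 5000.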
9088
9089 // Depending on the address space and instruction used, some immediate offsets
9090 // are allowed and some are not.
9091 // Pre-GFX12, flat instruction offsets can only be non-negative, while global
9092 // and scratch instruction offsets can also be negative. On GFX12, offsets can be
9093// negative for all variants.
9094//
9095// There are several bugs related to these offsets:
9096// On gfx10.1, flat instructions that go into the global address space cannot
9097// use an offset.
9098//
9099// For scratch instructions, the address can be either an SGPR or a VGPR.
9100// The following offsets can be used, depending on the architecture (x means
9101// cannot be used):
9102// +----------------------------+------+------+
9103// | Address-Mode | SGPR | VGPR |
9104// +----------------------------+------+------+
9105// | gfx9 | | |
9106// | negative, 4-aligned offset | x | ok |
9107// | negative, unaligned offset | x | ok |
9108// +----------------------------+------+------+
9109// | gfx10 | | |
9110// | negative, 4-aligned offset | ok | ok |
9111// | negative, unaligned offset | ok | x |
9112// +----------------------------+------+------+
9113// | gfx10.3 | | |
9114// | negative, 4-aligned offset | ok | ok |
9115// | negative, unaligned offset | ok | ok |
9116// +----------------------------+------+------+
9117//
9118// This function ignores the addressing mode, so if an offset cannot be used in
9119// one addressing mode, it is considered illegal.
9120bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9121 uint64_t FlatVariant) const {
9122 // TODO: Should 0 be special cased?
9123 if (!ST.hasFlatInstOffsets())
9124 return false;
9125
9126 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9127 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9128 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9129 return false;
9130
9132 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9133 (Offset % 4) != 0) {
9134 return false;
9135 }
9136
9137 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9138 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9139 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9140}
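// Example of the rules above: on a pre-GFX12 target with flat instruction
// offsets, a plain FLAT access (FlatVariant == SIInstrFlags::FLAT) does not
// allow negative offsets, so an offset of -16 is rejected even though it
// fits in the offset field, while the same -16 is accepted for a global
// (FlatGlobal) access as long as it lies within the signed range given by
// AMDGPU::getNumFlatOffsetBits().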
9141
9142 // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
9143std::pair<int64_t, int64_t>
9144SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9145 uint64_t FlatVariant) const {
9146 int64_t RemainderOffset = COffsetVal;
9147 int64_t ImmField = 0;
9148
9149 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9150 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9151
9152 if (AllowNegative) {
9153 // Use signed division by a power of two to truncate towards 0.
9154 int64_t D = 1LL << NumBits;
9155 RemainderOffset = (COffsetVal / D) * D;
9156 ImmField = COffsetVal - RemainderOffset;
9157
9159 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9160 (ImmField % 4) != 0) {
9161 // Make ImmField a multiple of 4
9162 RemainderOffset += ImmField % 4;
9163 ImmField -= ImmField % 4;
9164 }
9165 } else if (COffsetVal >= 0) {
9166 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9167 RemainderOffset = COffsetVal - ImmField;
9168 }
9169
9170 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9171 assert(RemainderOffset + ImmField == COffsetVal);
9172 return {ImmField, RemainderOffset};
9173}
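// Worked example for the split above, assuming a hypothetical 13-bit signed
// offset field, i.e. NumBits == 12 and a granularity D == 4096, with
// negative offsets allowed:
//   COffsetVal == -5000: RemainderOffset = (-5000 / 4096) * 4096 = -4096 and
//                        ImmField = -5000 - (-4096) = -904.
// If negative offsets are not allowed, COffsetVal == 5000 instead gives
//   ImmField = 5000 & 4095 = 904 and RemainderOffset = 4096.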
9174
9176 if (ST.hasNegativeScratchOffsetBug() &&
9177 FlatVariant == SIInstrFlags::FlatScratch)
9178 return false;
9179
9180 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9181}
9182
9183static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9184 switch (ST.getGeneration()) {
9185 default:
9186 break;
9189 return SIEncodingFamily::SI;
9192 return SIEncodingFamily::VI;
9199 }
9200 llvm_unreachable("Unknown subtarget generation!");
9201}
9202
9203bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9204 switch(MCOp) {
9205 // These opcodes use indirect register addressing so
9206 // they need special handling by codegen (currently missing).
9207 // Therefore it is too risky to allow these opcodes
9208 // to be selected by dpp combiner or sdwa peepholer.
9209 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9210 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9211 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9212 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9213 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9214 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9215 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9216 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9217 return true;
9218 default:
9219 return false;
9220 }
9221}
9222
9223int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9224 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9225
9226 unsigned Gen = subtargetEncodingFamily(ST);
9227
9228 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
9231
9232 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9233 // subtarget has UnpackedD16VMem feature.
9234 // TODO: remove this when we discard GFX80 encoding.
9235 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9237
9238 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9239 switch (ST.getGeneration()) {
9240 default:
9242 break;
9245 break;
9248 break;
9249 }
9250 }
9251
9252 if (isMAI(Opcode)) {
9253 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9254 if (MFMAOp != -1)
9255 Opcode = MFMAOp;
9256 }
9257
9258 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9259
9260 // -1 means that Opcode is already a native instruction.
9261 if (MCOp == -1)
9262 return Opcode;
9263
9264 if (ST.hasGFX90AInsts()) {
9265 uint16_t NMCOp = (uint16_t)-1;
9266 if (ST.hasGFX940Insts())
9268 if (NMCOp == (uint16_t)-1)
9270 if (NMCOp == (uint16_t)-1)
9272 if (NMCOp != (uint16_t)-1)
9273 MCOp = NMCOp;
9274 }
9275
9276 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9277 // no encoding in the given subtarget generation.
9278 if (MCOp == (uint16_t)-1)
9279 return -1;
9280
9281 if (isAsmOnlyOpcode(MCOp))
9282 return -1;
9283
9284 return MCOp;
9285}
9286
9287static
9289 assert(RegOpnd.isReg());
9290 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9291 getRegSubRegPair(RegOpnd);
9292}
9293
9296 assert(MI.isRegSequence());
9297 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9298 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9299 auto &RegOp = MI.getOperand(1 + 2 * I);
9300 return getRegOrUndef(RegOp);
9301 }
9303}
9304
9305// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9306// Following a subreg of reg:subreg isn't supported
9309 if (!RSR.SubReg)
9310 return false;
9311 switch (MI.getOpcode()) {
9312 default: break;
9313 case AMDGPU::REG_SEQUENCE:
9314 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9315 return true;
9316 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
9317 case AMDGPU::INSERT_SUBREG:
9318 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9319 // inserted the subreg we're looking for
9320 RSR = getRegOrUndef(MI.getOperand(2));
9321 else { // the subreg in the rest of the reg
9322 auto R1 = getRegOrUndef(MI.getOperand(1));
9323 if (R1.SubReg) // subreg of subreg isn't supported
9324 return false;
9325 RSR.Reg = R1.Reg;
9326 }
9327 return true;
9328 }
9329 return false;
9330}
9331
9334 assert(MRI.isSSA());
9335 if (!P.Reg.isVirtual())
9336 return nullptr;
9337
9338 auto RSR = P;
9339 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9340 while (auto *MI = DefInst) {
9341 DefInst = nullptr;
9342 switch (MI->getOpcode()) {
9343 case AMDGPU::COPY:
9344 case AMDGPU::V_MOV_B32_e32: {
9345 auto &Op1 = MI->getOperand(1);
9346 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9347 if (Op1.isUndef())
9348 return nullptr;
9349 RSR = getRegSubRegPair(Op1);
9350 DefInst = MRI.getVRegDef(RSR.Reg);
9351 }
9352 break;
9353 }
9354 default:
9355 if (followSubRegDef(*MI, RSR)) {
9356 if (!RSR.Reg)
9357 return nullptr;
9358 DefInst = MRI.getVRegDef(RSR.Reg);
9359 }
9360 }
9361 if (!DefInst)
9362 return MI;
9363 }
9364 return nullptr;
9365}
9366
9368 Register VReg,
9369 const MachineInstr &DefMI,
9370 const MachineInstr &UseMI) {
9371 assert(MRI.isSSA() && "Must be run on SSA");
9372
9373 auto *TRI = MRI.getTargetRegisterInfo();
9374 auto *DefBB = DefMI.getParent();
9375
9376 // Don't bother searching between blocks, although it is possible this block
9377 // doesn't modify exec.
9378 if (UseMI.getParent() != DefBB)
9379 return true;
9380
9381 const int MaxInstScan = 20;
9382 int NumInst = 0;
9383
9384 // Stop scan at the use.
9385 auto E = UseMI.getIterator();
9386 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9387 if (I->isDebugInstr())
9388 continue;
9389
9390 if (++NumInst > MaxInstScan)
9391 return true;
9392
9393 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9394 return true;
9395 }
9396
9397 return false;
9398}
9399
9401 Register VReg,
9402 const MachineInstr &DefMI) {
9403 assert(MRI.isSSA() && "Must be run on SSA");
9404
9405 auto *TRI = MRI.getTargetRegisterInfo();
9406 auto *DefBB = DefMI.getParent();
9407
9408 const int MaxUseScan = 10;
9409 int NumUse = 0;
9410
9411 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9412 auto &UseInst = *Use.getParent();
9413 // Don't bother searching between blocks, although it is possible this block
9414 // doesn't modify exec.
9415 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9416 return true;
9417
9418 if (++NumUse > MaxUseScan)
9419 return true;
9420 }
9421
9422 if (NumUse == 0)
9423 return false;
9424
9425 const int MaxInstScan = 20;
9426 int NumInst = 0;
9427
9428 // Stop scan when we have seen all the uses.
9429 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9430 assert(I != DefBB->end());
9431
9432 if (I->isDebugInstr())
9433 continue;
9434
9435 if (++NumInst > MaxInstScan)
9436 return true;
9437
9438 for (const MachineOperand &Op : I->operands()) {
9439 // We don't check reg masks here as they're used only on calls:
9440 // 1. EXEC is only considered const within one BB
9441 // 2. Call should be a terminator instruction if present in a BB
9442
9443 if (!Op.isReg())
9444 continue;
9445
9446 Register Reg = Op.getReg();
9447 if (Op.isUse()) {
9448 if (Reg == VReg && --NumUse == 0)
9449 return false;
9450 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9451 return true;
9452 }
9453 }
9454}
9455
9458 const DebugLoc &DL, Register Src, Register Dst) const {
9459 auto Cur = MBB.begin();
9460 if (Cur != MBB.end())
9461 do {
9462 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9463 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9464 ++Cur;
9465 } while (Cur != MBB.end() && Cur != LastPHIIt);
9466
9467 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9468 Dst);
9469}
9470
9473 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9474 if (InsPt != MBB.end() &&
9475 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9476 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9477 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9478 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9479 InsPt++;
9480 return BuildMI(MBB, InsPt, DL,
9481 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9482 : AMDGPU::S_MOV_B64_term),
9483 Dst)
9484 .addReg(Src, 0, SrcSubReg)
9485 .addReg(AMDGPU::EXEC, RegState::Implicit);
9486 }
9487 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9488 Dst);
9489}
9490
9491bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9492
9495 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9496 VirtRegMap *VRM) const {
9497 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9498 //
9499 // %0:sreg_32 = COPY $m0
9500 //
9501 // We explicitly chose SReg_32 for the virtual register so such a copy might
9502 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9503 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9504 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9505 // TargetInstrInfo::foldMemoryOperand() is going to try.
9506 // A similar issue also exists with spilling and reloading $exec registers.
9507 //
9508 // To prevent that, constrain the %0 register class here.
9509 if (isFullCopyInstr(MI)) {
9510 Register DstReg = MI.getOperand(0).getReg();
9511 Register SrcReg = MI.getOperand(1).getReg();
9512 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9513 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9515 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9516 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9517 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9518 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9519 return nullptr;
9520 } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9521 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9522 return nullptr;
9523 }
9524 }
9525 }
9526
9527 return nullptr;
9528}
9529
9531 const MachineInstr &MI,
9532 unsigned *PredCost) const {
9533 if (MI.isBundle()) {
9535 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9536 unsigned Lat = 0, Count = 0;
9537 for (++I; I != E && I->isBundledWithPred(); ++I) {
9538 ++Count;
9539 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9540 }
9541 return Lat + Count - 1;
9542 }
9543
9544 return SchedModel.computeInstrLatency(&MI);
9545}
9546
9549 unsigned opcode = MI.getOpcode();
9550 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9551 auto IID = GI->getIntrinsicID();
9556
9557 switch (IID) {
9558 case Intrinsic::amdgcn_if:
9559 case Intrinsic::amdgcn_else:
9560 // FIXME: Uniform if second result
9561 break;
9562 }
9563
9565 }
9566
9567 // Loads from the private and flat address spaces are divergent, because
9568 // threads can execute the load instruction with the same inputs and get
9569 // different results.
9570 //
9571 // All other loads are not divergent, because if threads issue loads with the
9572 // same arguments, they will always get the same result.
9573 if (opcode == AMDGPU::G_LOAD) {
9574 if (MI.memoperands_empty())
9575 return InstructionUniformity::NeverUniform; // conservative assumption
9576
9577 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9578 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9579 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9580 })) {
9581 // At least one MMO in a non-global address space.
9583 }
9585 }
9586
9587 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9588 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9589 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9590 AMDGPU::isGenericAtomic(opcode)) {
9592 }
9594}
9595
9598
9599 if (isNeverUniform(MI))
9601
9602 unsigned opcode = MI.getOpcode();
9603 if (opcode == AMDGPU::V_READLANE_B32 ||
9604 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9605 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9607
9608 if (isCopyInstr(MI)) {
9609 const MachineOperand &srcOp = MI.getOperand(1);
9610 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9611 const TargetRegisterClass *regClass =
9612 RI.getPhysRegBaseClass(srcOp.getReg());
9615 }
9617 }
9618
9619 // GMIR handling
9620 if (MI.isPreISelOpcode())
9622
9623 // Atomics are divergent because they are executed sequentially: when an
9624 // atomic operation refers to the same address in each thread, then each
9625 // thread after the first sees the value written by the previous thread as
9626 // its original value.
9627
9628 if (isAtomic(MI))
9630
9631 // Loads from the private and flat address spaces are divergent, because
9632 // threads can execute the load instruction with the same inputs and get
9633 // different results.
9634 if (isFLAT(MI) && MI.mayLoad()) {
9635 if (MI.memoperands_empty())
9636 return InstructionUniformity::NeverUniform; // conservative assumption
9637
9638 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9639 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9640 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9641 })) {
9642 // At least one MMO in a non-global address space.
9644 }
9645
9647 }
9648
9649 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9650 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9651
9652 // FIXME: It's conceptually broken to report this for an instruction, and not
9653 // a specific def operand. For inline asm in particular, there could be mixed
9654 // uniform and divergent results.
9655 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9656 const MachineOperand &SrcOp = MI.getOperand(I);
9657 if (!SrcOp.isReg())
9658 continue;
9659
9660 Register Reg = SrcOp.getReg();
9661 if (!Reg || !SrcOp.readsReg())
9662 continue;
9663
9664 // If RegBank is null, this is unassigned or an unallocatable special
9665 // register, which are all scalars.
9666 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9667 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9669 }
9670
9671 // TODO: The uniformity check conditions above can be rearranged for more
9672 // readability.
9673
9674 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9675 // currently turned into no-op COPYs by SelectionDAG ISel and are
9676 // therefore no longer recognizable.
9677
9679}
9680
9682 switch (MF.getFunction().getCallingConv()) {
9684 return 1;
9686 return 2;
9688 return 3;
9692 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9695 case CallingConv::C:
9696 case CallingConv::Fast:
9697 default:
9698 // Assume other calling conventions are various compute callable functions
9699 return 0;
9700 }
9701}
9702
9704 Register &SrcReg2, int64_t &CmpMask,
9705 int64_t &CmpValue) const {
9706 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9707 return false;
9708
9709 switch (MI.getOpcode()) {
9710 default:
9711 break;
9712 case AMDGPU::S_CMP_EQ_U32:
9713 case AMDGPU::S_CMP_EQ_I32:
9714 case AMDGPU::S_CMP_LG_U32:
9715 case AMDGPU::S_CMP_LG_I32:
9716 case AMDGPU::S_CMP_LT_U32:
9717 case AMDGPU::S_CMP_LT_I32:
9718 case AMDGPU::S_CMP_GT_U32:
9719 case AMDGPU::S_CMP_GT_I32:
9720 case AMDGPU::S_CMP_LE_U32:
9721 case AMDGPU::S_CMP_LE_I32:
9722 case AMDGPU::S_CMP_GE_U32:
9723 case AMDGPU::S_CMP_GE_I32:
9724 case AMDGPU::S_CMP_EQ_U64:
9725 case AMDGPU::S_CMP_LG_U64:
9726 SrcReg = MI.getOperand(0).getReg();
9727 if (MI.getOperand(1).isReg()) {
9728 if (MI.getOperand(1).getSubReg())
9729 return false;
9730 SrcReg2 = MI.getOperand(1).getReg();
9731 CmpValue = 0;
9732 } else if (MI.getOperand(1).isImm()) {
9733 SrcReg2 = Register();
9734 CmpValue = MI.getOperand(1).getImm();
9735 } else {
9736 return false;
9737 }
9738 CmpMask = ~0;
9739 return true;
9740 case AMDGPU::S_CMPK_EQ_U32:
9741 case AMDGPU::S_CMPK_EQ_I32:
9742 case AMDGPU::S_CMPK_LG_U32:
9743 case AMDGPU::S_CMPK_LG_I32:
9744 case AMDGPU::S_CMPK_LT_U32:
9745 case AMDGPU::S_CMPK_LT_I32:
9746 case AMDGPU::S_CMPK_GT_U32:
9747 case AMDGPU::S_CMPK_GT_I32:
9748 case AMDGPU::S_CMPK_LE_U32:
9749 case AMDGPU::S_CMPK_LE_I32:
9750 case AMDGPU::S_CMPK_GE_U32:
9751 case AMDGPU::S_CMPK_GE_I32:
9752 SrcReg = MI.getOperand(0).getReg();
9753 SrcReg2 = Register();
9754 CmpValue = MI.getOperand(1).getImm();
9755 CmpMask = ~0;
9756 return true;
9757 }
9758
9759 return false;
9760}
9761
9763 Register SrcReg2, int64_t CmpMask,
9764 int64_t CmpValue,
9765 const MachineRegisterInfo *MRI) const {
9766 if (!SrcReg || SrcReg.isPhysical())
9767 return false;
9768
9769 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9770 return false;
9771
9772 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9773 this](int64_t ExpectedValue, unsigned SrcSize,
9774 bool IsReversible, bool IsSigned) -> bool {
9775 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9776 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9777 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9778 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9779 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9780 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9781 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9782 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9783 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9784 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9785 //
9786 // Signed ge/gt are not used for the sign bit.
9787 //
9788 // If result of the AND is unused except in the compare:
9789 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9790 //
9791 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9792 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9793 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9794 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9795 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9796 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9797
9798 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9799 if (!Def || Def->getParent() != CmpInstr.getParent())
9800 return false;
9801
9802 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9803 Def->getOpcode() != AMDGPU::S_AND_B64)
9804 return false;
9805
9806 int64_t Mask;
9807 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9808 if (MO->isImm())
9809 Mask = MO->getImm();
9810 else if (!getFoldableImm(MO, Mask))
9811 return false;
9812 Mask &= maxUIntN(SrcSize);
9813 return isPowerOf2_64(Mask);
9814 };
9815
9816 MachineOperand *SrcOp = &Def->getOperand(1);
9817 if (isMask(SrcOp))
9818 SrcOp = &Def->getOperand(2);
9819 else if (isMask(&Def->getOperand(2)))
9820 SrcOp = &Def->getOperand(1);
9821 else
9822 return false;
9823
9824 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9825 if (IsSigned && BitNo == SrcSize - 1)
9826 return false;
9827
9828 ExpectedValue <<= BitNo;
9829
9830 bool IsReversedCC = false;
9831 if (CmpValue != ExpectedValue) {
9832 if (!IsReversible)
9833 return false;
9834 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9835 if (!IsReversedCC)
9836 return false;
9837 }
9838
9839 Register DefReg = Def->getOperand(0).getReg();
9840 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9841 return false;
9842
9843 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9844 I != E; ++I) {
9845 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9846 I->killsRegister(AMDGPU::SCC, &RI))
9847 return false;
9848 }
9849
9850 MachineOperand *SccDef =
9851 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9852 SccDef->setIsDead(false);
9853 CmpInstr.eraseFromParent();
9854
9855 if (!MRI->use_nodbg_empty(DefReg)) {
9856 assert(!IsReversedCC);
9857 return true;
9858 }
9859
9860 // Replace AND with unused result with a S_BITCMP.
9861 MachineBasicBlock *MBB = Def->getParent();
9862
9863 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9864 : AMDGPU::S_BITCMP1_B32
9865 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9866 : AMDGPU::S_BITCMP1_B64;
9867
9868 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9869 .add(*SrcOp)
9870 .addImm(BitNo);
9871 Def->eraseFromParent();
9872
9873 return true;
9874 };
9875
9876 switch (CmpInstr.getOpcode()) {
9877 default:
9878 break;
9879 case AMDGPU::S_CMP_EQ_U32:
9880 case AMDGPU::S_CMP_EQ_I32:
9881 case AMDGPU::S_CMPK_EQ_U32:
9882 case AMDGPU::S_CMPK_EQ_I32:
9883 return optimizeCmpAnd(1, 32, true, false);
9884 case AMDGPU::S_CMP_GE_U32:
9885 case AMDGPU::S_CMPK_GE_U32:
9886 return optimizeCmpAnd(1, 32, false, false);
9887 case AMDGPU::S_CMP_GE_I32:
9888 case AMDGPU::S_CMPK_GE_I32:
9889 return optimizeCmpAnd(1, 32, false, true);
9890 case AMDGPU::S_CMP_EQ_U64:
9891 return optimizeCmpAnd(1, 64, true, false);
9892 case AMDGPU::S_CMP_LG_U32:
9893 case AMDGPU::S_CMP_LG_I32:
9894 case AMDGPU::S_CMPK_LG_U32:
9895 case AMDGPU::S_CMPK_LG_I32:
9896 return optimizeCmpAnd(0, 32, true, false);
9897 case AMDGPU::S_CMP_GT_U32:
9898 case AMDGPU::S_CMPK_GT_U32:
9899 return optimizeCmpAnd(0, 32, false, false);
9900 case AMDGPU::S_CMP_GT_I32:
9901 case AMDGPU::S_CMPK_GT_I32:
9902 return optimizeCmpAnd(0, 32, false, true);
9903 case AMDGPU::S_CMP_LG_U64:
9904 return optimizeCmpAnd(0, 64, true, false);
9905 }
9906
9907 return false;
9908}
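// Illustrative example of the rewrite performed above (approximate MIR,
// virtual register names invented for the example), assuming nothing
// clobbers SCC between the two instructions:
//   %1:sreg_32 = S_AND_B32 %0, 4, implicit-def $scc
//   S_CMP_EQ_U32 %1, 4, implicit-def $scc
// The compare is erased and SCC is taken directly from the AND; if %1 has no
// other uses, the AND itself is further replaced by
//   S_BITCMP1_B32 %0, 2, implicit-def $scc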
9909
9911 unsigned OpName) const {
9912 if (!ST.needsAlignedVGPRs())
9913 return;
9914
9915 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
9916 if (OpNo < 0)
9917 return;
9918 MachineOperand &Op = MI.getOperand(OpNo);
9919 if (getOpSize(MI, OpNo) > 4)
9920 return;
9921
9922 // Add implicit aligned super-reg to force alignment on the data operand.
9923 const DebugLoc &DL = MI.getDebugLoc();
9924 MachineBasicBlock *BB = MI.getParent();
9926 Register DataReg = Op.getReg();
9927 bool IsAGPR = RI.isAGPR(MRI, DataReg);
9928 Register Undef = MRI.createVirtualRegister(
9929 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
9930 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
9931 Register NewVR =
9932 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
9933 : &AMDGPU::VReg_64_Align2RegClass);
9934 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
9935 .addReg(DataReg, 0, Op.getSubReg())
9936 .addImm(AMDGPU::sub0)
9937 .addReg(Undef)
9938 .addImm(AMDGPU::sub1);
9939 Op.setReg(NewVR);
9940 Op.setSubReg(AMDGPU::sub0);
9941 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
9942}
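// Rough sketch of the rewrite above (approximate MIR, names illustrative):
// a 32-bit data operand %data:vgpr_32 on a subtarget that requires
// even-aligned VGPR tuples becomes
//   %undef:vgpr_32 = IMPLICIT_DEF
//   %newvr:vreg_64_align2 = REG_SEQUENCE %data, %subreg.sub0,
//                                        %undef, %subreg.sub1
// after which the instruction reads %newvr.sub0 and carries an extra
// implicit use of %newvr, forcing an even-aligned register pair at
// allocation time.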
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
amdgpu AMDGPU Register Bank Select
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:85
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:76
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:76
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:263
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:735
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:739
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:950
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:378
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:614
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:755
bool hasMAIInsts() const
Definition: GCNSubtarget.h:805
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:265
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:285
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:751
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:670
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:743
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:331
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
Generation getGeneration() const
Definition: GCNSubtarget.h:304
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:875
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:722
bool hasAddr64() const
Definition: GCNSubtarget.h:368
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:714
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all of the successor blocks of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:541
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:611
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:621
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:194
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:397
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
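A minimal sketch, not taken from this file, of how the MCBinaryExpr/MCConstantExpr/MCSymbolRefExpr builders above compose a symbolic expression that is then bound to a symbol with setVariableValue; the symbol names, shift amount, and mask are hypothetical.
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
using namespace llvm;

// Bind Dest to the expression ((End - Start) >> 2) & 0xffff.
static void bindOffsetSymbol(MCContext &Ctx, MCSymbol *Start, MCSymbol *End,
                             MCSymbol *Dest) {
  const MCExpr *Diff = MCBinaryExpr::createSub(
      MCSymbolRefExpr::create(End, Ctx), MCSymbolRefExpr::create(Start, Ctx),
      Ctx);
  const MCExpr *Shifted =
      MCBinaryExpr::createAShr(Diff, MCConstantExpr::create(2, Ctx), Ctx);
  const MCExpr *Masked =
      MCBinaryExpr::createAnd(Shifted, MCConstantExpr::create(0xffff, Ctx), Ctx);
  Dest->setVariableValue(Masked);
}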
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitInst.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
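A minimal sketch of using computeRegisterLiveness together with LQR_Dead to test whether a physical register is free just before an insertion point; the helper name and parameters are illustrative only.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;

// True when Reg is known to be fully dead just before the insertion point I.
static bool isDeadBefore(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         MCRegister Reg, const TargetRegisterInfo *TRI) {
  return MBB.computeRegisterLiveness(TRI, Reg, I) ==
         MachineBasicBlock::LQR_Dead;
}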
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB)
addNewBlock - Add a new node to the dominator tree information.
bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:558
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:341
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:561
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:680
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:804
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:789
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:771
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:487
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:688
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:386
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound=false)
We have determined MI defined a register without a use.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
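A small sketch of the MachineOperand predicates listed above; the classification policy expressed by the helper is invented purely for illustration.
#include "llvm/CodeGen/MachineOperand.h"
using namespace llvm;

// Illustrative classification of an operand using the predicates above.
static bool isSimpleSource(const MachineOperand &MO) {
  if (MO.isImm())
    return true;                      // Plain 64-bit immediate.
  if (MO.isReg())
    return MO.getSubReg() == 0;       // Whole-register use or def.
  return MO.isFI() || MO.isGlobal();  // Frame index or global address.
}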
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
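A minimal sketch of the Register predicates above; the helper is hypothetical.
#include "llvm/CodeGen/Register.h"
using namespace llvm;

// Describe which register namespace a Register value belongs to.
static const char *describeReg(Register Reg) {
  if (!Reg.isValid())
    return "no register";
  return Reg.isVirtual() ? "virtual register" : "physical register";
}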
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
bool isNonUniformBranchInstr(MachineInstr &Instr) const
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:924
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
Definition: SIInstrInfo.h:1094
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1222
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
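A hedged sketch of calling splitFlatOffset for the global FLAT variant. It assumes AMDGPUAS::GLOBAL_ADDRESS and SIInstrFlags::FlatGlobal are the appropriate address-space and variant selectors and that "AMDGPU.h" provides the address-space enum.
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include <utility>
using namespace llvm;

// Split Offset into {encodable immediate, remainder for the address} for a
// global FLAT access, per the description of splitFlatOffset above.
static std::pair<int64_t, int64_t>
splitGlobalFlatOffset(const SIInstrInfo *TII, int64_t Offset) {
  return TII->splitFlatOffset(Offset, AMDGPUAS::GLOBAL_ADDRESS,
                              SIInstrFlags::FlatGlobal);
}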
bool isSpill(uint16_t Opcode) const
Definition: SIInstrInfo.h:740
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
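A minimal sketch of looking up an operand by name via getNamedOperand; it assumes AMDGPU::OpName::src0 is the desired key and that a SIInstrInfo pointer is in hand.
#include "SIInstrInfo.h"
using namespace llvm;

// True when the instruction has a src0 operand and it is an immediate.
static bool src0IsImmediate(const SIInstrInfo *TII, MachineInstr &MI) {
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  return Src0 && Src0->isImm();   // getNamedOperand returns nullptr if absent.
}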
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:936
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specified machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:716
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:959
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineBasicBlock *IfEnd) const
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
Whether we must prevent this instruction from executing with EXEC = 0.
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:681
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:863
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:728
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1235
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:880
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:66
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:68
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:240
SlotIndexes pass.
Definition: SlotIndexes.h:300
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:523
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1513
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1514
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1516
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isHi(unsigned Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:409
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:411
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:408
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:410
@ TI_CONSTDATA_START
Definition: AMDGPU.h:407
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1515
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1404
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
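A small sketch of the range form of all_of over a SmallVector, as opposed to passing begin()/end() explicitly; the predicate is arbitrary.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Range form: no explicit begin()/end() arguments.
static bool allPositive(const SmallVectorImpl<int> &Values) {
  return all_of(Values, [](int V) { return V > 0; });
}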
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
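A minimal sketch of the BuildMI builder interface combined with the MachineInstrBuilder methods listed earlier; the opcode, registers, and insertion point are placeholders.
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Emit "DstReg = S_MOV_B32 Imm" before the iterator I.
static void emitMoveImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register DstReg, int64_t Imm) {
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(Imm);   // The destination def is added by the DestReg overload.
}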
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:439
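A standalone sketch exercising several of the arithmetic helpers listed above (divideCeil, alignDown, Hi_32/Lo_32, isPowerOf2_64, Log2_32, isIntN); the values are arbitrary.
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

int main() {
  assert(divideCeil(10, 4) == 3);                      // ceil(10 / 4)
  assert(alignDown(37, 16) == 32);                     // round 37 down to 16
  assert(Hi_32(0x1122334455667788ULL) == 0x11223344u); // high 32 bits
  assert(Lo_32(0x1122334455667788ULL) == 0x55667788u); // low 32 bits
  assert(isPowerOf2_64(64) && !isPowerOf2_64(0));
  assert(Log2_32(32) == 5);
  assert(isIntN(8, 127) && !isIntN(8, 128));           // signed 8-bit range
  return 0;
}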
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
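A minimal sketch mapping the InstructionUniformity kinds above to printable names; the helper is hypothetical.
#include "llvm/ADT/Uniformity.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;

// Map each uniformity kind to a printable name.
static const char *uniformityName(InstructionUniformity IU) {
  switch (IU) {
  case InstructionUniformity::Default:
    return "uniform iff all operands are uniform";
  case InstructionUniformity::AlwaysUniform:
    return "always uniform";
  case InstructionUniformity::NeverUniform:
    return "never uniform";
  }
  llvm_unreachable("covered switch over InstructionUniformity");
}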
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:203
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:216
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:85
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:49
MachineInstr * top() const
Definition: SIInstrInfo.h:54
bool empty() const
Definition: SIInstrInfo.h:64
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:73
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.