SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm {
44namespace AMDGPU {
45#define GET_D16ImageDimIntrinsics_IMPL
46#define GET_ImageDimIntrinsicTable_IMPL
47#define GET_RsrcIntrinsics_IMPL
48#include "AMDGPUGenSearchableTables.inc"
49}
50}
51
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
56static cl::opt<unsigned>
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
64 cl::ReallyHidden);
65
66SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
67 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
109
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally VALU use of exec would block the rematerialization, but that
132 // is OK in this case to have an implicit exec read as all VALU do.
133 // We really want all of the generic logic for this except for this.
134
135 // Another potential implicit use is mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function, otherwise it is safe since mode is not changed.
138
139 // There is difference to generic method which does not allow
140 // rematerialization if there are virtual register uses. We allow this,
141 // therefore this method includes SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152static bool resultDependsOnExec(const MachineInstr &MI) {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
157 Register DstReg = MI.getOperand(0).getReg();
158 if (!DstReg.isVirtual())
159 return true;
160 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
161 switch (Use.getOpcode()) {
162 case AMDGPU::S_AND_SAVEEXEC_B32:
163 case AMDGPU::S_AND_SAVEEXEC_B64:
164 break;
165 case AMDGPU::S_AND_B32:
166 case AMDGPU::S_AND_B64:
167 if (!Use.readsRegister(AMDGPU::EXEC))
168 return true;
169 break;
170 default:
171 return true;
172 }
173 }
174 return false;
175 }
176
177 switch (MI.getOpcode()) {
178 default:
179 break;
180 case AMDGPU::V_READFIRSTLANE_B32:
181 return true;
182 }
183
184 return false;
185}
186
187bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
188 // Any implicit use of exec by VALU is not a real register read.
189 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
190 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
191}
192
193bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
194 MachineBasicBlock *SuccToSinkTo,
195 MachineCycleInfo *CI) const {
196 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
197 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
198 return true;
199
200 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
201 // Check if sinking of MI would create temporal divergent use.
202 for (auto Op : MI.uses()) {
203 if (Op.isReg() && Op.getReg().isVirtual() &&
204 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
205 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
206
207 // SgprDef defined inside cycle
208 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
209 if (FromCycle == nullptr)
210 continue;
211
212 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
213 // Check if there is a FromCycle that contains SgprDef's basic block but
214 // does not contain SuccToSinkTo and also has divergent exit condition.
215 while (FromCycle && !FromCycle->contains(ToCycle)) {
216 // After structurize-cfg, there should be exactly one cycle exit.
217 SmallVector<MachineBasicBlock *, 1> ExitBlocks;
218 FromCycle->getExitBlocks(ExitBlocks);
219 assert(ExitBlocks.size() == 1);
220 assert(ExitBlocks[0]->getSinglePredecessor());
221
222 // FromCycle has divergent exit condition.
223 if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
224 return false;
225 }
226
227 FromCycle = FromCycle->getParentCycle();
228 }
229 }
230 }
231
232 return true;
233}
234
235bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
236 int64_t &Offset0,
237 int64_t &Offset1) const {
238 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
239 return false;
240
241 unsigned Opc0 = Load0->getMachineOpcode();
242 unsigned Opc1 = Load1->getMachineOpcode();
243
244 // Make sure both are actually loads.
245 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
246 return false;
247
248 // A mayLoad instruction without a def is not a load. Likely a prefetch.
249 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
250 return false;
251
252 if (isDS(Opc0) && isDS(Opc1)) {
253
254 // FIXME: Handle this case:
255 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
256 return false;
257
258 // Check base reg.
259 if (Load0->getOperand(0) != Load1->getOperand(0))
260 return false;
261
262 // Skip read2 / write2 variants for simplicity.
263 // TODO: We should report true if the used offsets are adjacent (excluded
264 // st64 versions).
265 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
266 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
267 if (Offset0Idx == -1 || Offset1Idx == -1)
268 return false;
269
270 // XXX - be careful of dataless loads
271 // getNamedOperandIdx returns the index for MachineInstrs. Since they
272 // include the output in the operand list, but SDNodes don't, we need to
273 // subtract the index by one.
274 Offset0Idx -= get(Opc0).NumDefs;
275 Offset1Idx -= get(Opc1).NumDefs;
276 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
277 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
278 return true;
279 }
280
281 if (isSMRD(Opc0) && isSMRD(Opc1)) {
282 // Skip time and cache invalidation instructions.
283 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
284 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
285 return false;
286
287 unsigned NumOps = getNumOperandsNoGlue(Load0);
288 if (NumOps != getNumOperandsNoGlue(Load1))
289 return false;
290
291 // Check base reg.
292 if (Load0->getOperand(0) != Load1->getOperand(0))
293 return false;
294
295 // Match register offsets, if both register and immediate offsets present.
296 assert(NumOps == 4 || NumOps == 5);
297 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
298 return false;
299
300 const ConstantSDNode *Load0Offset =
301 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
302 const ConstantSDNode *Load1Offset =
303 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
304
305 if (!Load0Offset || !Load1Offset)
306 return false;
307
308 Offset0 = Load0Offset->getZExtValue();
309 Offset1 = Load1Offset->getZExtValue();
310 return true;
311 }
312
313 // MUBUF and MTBUF can access the same addresses.
314 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
315
316 // MUBUF and MTBUF have vaddr at different indices.
317 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
318 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
319 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
320 return false;
321
322 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
323 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
324
325 if (OffIdx0 == -1 || OffIdx1 == -1)
326 return false;
327
328 // getNamedOperandIdx returns the index for MachineInstrs. Since they
329 // include the output in the operand list, but SDNodes don't, we need to
330 // subtract the index by one.
331 OffIdx0 -= get(Opc0).NumDefs;
332 OffIdx1 -= get(Opc1).NumDefs;
333
334 SDValue Off0 = Load0->getOperand(OffIdx0);
335 SDValue Off1 = Load1->getOperand(OffIdx1);
336
337 // The offset might be a FrameIndexSDNode.
338 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
339 return false;
340
341 Offset0 = Off0->getAsZExtVal();
342 Offset1 = Off1->getAsZExtVal();
343 return true;
344 }
345
346 return false;
347}
348
349static bool isStride64(unsigned Opc) {
350 switch (Opc) {
351 case AMDGPU::DS_READ2ST64_B32:
352 case AMDGPU::DS_READ2ST64_B64:
353 case AMDGPU::DS_WRITE2ST64_B32:
354 case AMDGPU::DS_WRITE2ST64_B64:
355 return true;
356 default:
357 return false;
358 }
359}
360
361bool SIInstrInfo::getMemOperandsWithOffsetWidth(
362 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
363 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
364 const TargetRegisterInfo *TRI) const {
365 if (!LdSt.mayLoadOrStore())
366 return false;
367
368 unsigned Opc = LdSt.getOpcode();
369 OffsetIsScalable = false;
370 const MachineOperand *BaseOp, *OffsetOp;
371 int DataOpIdx;
372
373 if (isDS(LdSt)) {
374 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
375 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
376 if (OffsetOp) {
377 // Normal, single offset LDS instruction.
378 if (!BaseOp) {
379 // DS_CONSUME/DS_APPEND use M0 for the base address.
380 // TODO: find the implicit use operand for M0 and use that as BaseOp?
381 return false;
382 }
383 BaseOps.push_back(BaseOp);
384 Offset = OffsetOp->getImm();
385 // Get appropriate operand, and compute width accordingly.
386 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
387 if (DataOpIdx == -1)
388 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
389 Width = getOpSize(LdSt, DataOpIdx);
390 } else {
391 // The 2 offset instructions use offset0 and offset1 instead. We can treat
392 // these as a load with a single offset if the 2 offsets are consecutive.
393 // We will use this for some partially aligned loads.
394 const MachineOperand *Offset0Op =
395 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
396 const MachineOperand *Offset1Op =
397 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
398
399 unsigned Offset0 = Offset0Op->getImm() & 0xff;
400 unsigned Offset1 = Offset1Op->getImm() & 0xff;
401 if (Offset0 + 1 != Offset1)
402 return false;
403
404 // Each of these offsets is in element sized units, so we need to convert
405 // to bytes of the individual reads.
406
407 unsigned EltSize;
408 if (LdSt.mayLoad())
409 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
410 else {
411 assert(LdSt.mayStore());
412 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
413 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
414 }
415
416 if (isStride64(Opc))
417 EltSize *= 64;
418
419 BaseOps.push_back(BaseOp);
420 Offset = EltSize * Offset0;
421 // Get appropriate operand(s), and compute width accordingly.
422 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
423 if (DataOpIdx == -1) {
424 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
425 Width = getOpSize(LdSt, DataOpIdx);
426 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
427 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
428 } else {
429 Width = getOpSize(LdSt, DataOpIdx);
430 }
431 }
432 return true;
433 }
434
435 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
436 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
437 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
438 return false;
439 BaseOps.push_back(RSrc);
440 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
441 if (BaseOp && !BaseOp->isFI())
442 BaseOps.push_back(BaseOp);
443 const MachineOperand *OffsetImm =
444 getNamedOperand(LdSt, AMDGPU::OpName::offset);
445 Offset = OffsetImm->getImm();
446 const MachineOperand *SOffset =
447 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
448 if (SOffset) {
449 if (SOffset->isReg())
450 BaseOps.push_back(SOffset);
451 else
452 Offset += SOffset->getImm();
453 }
454 // Get appropriate operand, and compute width accordingly.
455 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
456 if (DataOpIdx == -1)
457 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
458 if (DataOpIdx == -1) // LDS DMA
459 return false;
460 Width = getOpSize(LdSt, DataOpIdx);
461 return true;
462 }
463
464 if (isMIMG(LdSt)) {
465 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
466 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
467 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
468 if (VAddr0Idx >= 0) {
469 // GFX10 possible NSA encoding.
470 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
471 BaseOps.push_back(&LdSt.getOperand(I));
472 } else {
473 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
474 }
475 Offset = 0;
476 // Get appropriate operand, and compute width accordingly.
477 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
478 Width = getOpSize(LdSt, DataOpIdx);
479 return true;
480 }
481
482 if (isSMRD(LdSt)) {
483 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
484 if (!BaseOp) // e.g. S_MEMTIME
485 return false;
486 BaseOps.push_back(BaseOp);
487 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
488 Offset = OffsetOp ? OffsetOp->getImm() : 0;
489 // Get appropriate operand, and compute width accordingly.
490 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
491 if (DataOpIdx == -1)
492 return false;
493 Width = getOpSize(LdSt, DataOpIdx);
494 return true;
495 }
496
497 if (isFLAT(LdSt)) {
498 // Instructions have either vaddr or saddr or both or none.
499 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
500 if (BaseOp)
501 BaseOps.push_back(BaseOp);
502 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
503 if (BaseOp)
504 BaseOps.push_back(BaseOp);
505 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
506 // Get appropriate operand, and compute width accordingly.
507 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
508 if (DataOpIdx == -1)
509 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
510 if (DataOpIdx == -1) // LDS DMA
511 return false;
512 Width = getOpSize(LdSt, DataOpIdx);
513 return true;
514 }
515
516 return false;
517}
518
519static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
520 ArrayRef<const MachineOperand *> BaseOps1,
521 const MachineInstr &MI2,
522 ArrayRef<const MachineOperand *> BaseOps2) {
523 // Only examine the first "base" operand of each instruction, on the
524 // assumption that it represents the real base address of the memory access.
525 // Other operands are typically offsets or indices from this base address.
526 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
527 return true;
528
529 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
530 return false;
531
532 auto MO1 = *MI1.memoperands_begin();
533 auto MO2 = *MI2.memoperands_begin();
534 if (MO1->getAddrSpace() != MO2->getAddrSpace())
535 return false;
536
537 auto Base1 = MO1->getValue();
538 auto Base2 = MO2->getValue();
539 if (!Base1 || !Base2)
540 return false;
541 Base1 = getUnderlyingObject(Base1);
542 Base2 = getUnderlyingObject(Base2);
543
544 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
545 return false;
546
547 return Base1 == Base2;
548}
549
550bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
551 int64_t Offset1, bool OffsetIsScalable1,
552 ArrayRef<const MachineOperand *> BaseOps2,
553 int64_t Offset2, bool OffsetIsScalable2,
554 unsigned ClusterSize,
555 unsigned NumBytes) const {
556 // If the mem ops (to be clustered) do not have the same base ptr, then they
557 // should not be clustered
558 if (!BaseOps1.empty() && !BaseOps2.empty()) {
559 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
560 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
561 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
562 return false;
563 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
564 // If only one base op is empty, they do not have the same base ptr
565 return false;
566 }
567
568 // In order to avoid register pressure, on an average, the number of DWORDS
569 // loaded together by all clustered mem ops should not exceed 8. This is an
570 // empirical value based on certain observations and performance related
571 // experiments.
572 // The good thing about this heuristic is - it avoids clustering of too many
573 // sub-word loads, and also avoids clustering of wide loads. Below is the
574 // brief summary of how the heuristic behaves for various `LoadSize`.
575 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
576 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
577 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
578 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
579 // (5) LoadSize >= 17: do not cluster
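// For example: clustering four 8-byte loads gives LoadSize = 8 and
// NumDWORDs = ((8 + 3) / 4) * 4 = 8, so they are still clustered, while four
// 16-byte loads give NumDWORDs = 16 and are rejected.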
580 const unsigned LoadSize = NumBytes / ClusterSize;
581 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
582 return NumDWORDs <= 8;
583}
584
585// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
586// the first 16 loads will be interleaved with the stores, and the next 16 will
587// be clustered as expected. It should really split into 2 16 store batches.
588//
589// Loads are clustered until this returns false, rather than trying to schedule
590// groups of stores. This also means we have to deal with saying different
591// address space loads should be clustered, and ones which might cause bank
592// conflicts.
593//
594// This might be deprecated so it might not be worth that much effort to fix.
595bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
596 int64_t Offset0, int64_t Offset1,
597 unsigned NumLoads) const {
598 assert(Offset1 > Offset0 &&
599 "Second offset should be larger than first offset!");
600 // If we have less than 16 loads in a row, and the offsets are within 64
601 // bytes, then schedule together.
602
603 // A cacheline is 64 bytes (for global memory).
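// For example, two loads at offsets 0 and 48 are scheduled together, while a
// pair split by 64 bytes or more (or a run of more than 16 loads) is not.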
604 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
605}
606
607static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
608 MachineBasicBlock::iterator MI,
609 const DebugLoc &DL, MCRegister DestReg,
610 MCRegister SrcReg, bool KillSrc,
611 const char *Msg = "illegal VGPR to SGPR copy") {
612 MachineFunction *MF = MBB.getParent();
613 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
614 LLVMContext &C = MF->getFunction().getContext();
615 C.diagnose(IllegalCopy);
616
617 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
618 .addReg(SrcReg, getKillRegState(KillSrc));
619}
620
621/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
622/// possible to have a direct copy in these cases on GFX908, so an intermediate
623/// VGPR copy is required.
624static void indirectCopyToAGPR(const SIInstrInfo &TII,
625 MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 RegScavenger &RS, bool RegsOverlap,
630 Register ImpDefSuperReg = Register(),
631 Register ImpUseSuperReg = Register()) {
632 assert((TII.getSubtarget().hasMAIInsts() &&
633 !TII.getSubtarget().hasGFX90AInsts()) &&
634 "Expected GFX908 subtarget.");
635
636 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
637 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
638 "Source register of the copy should be either an SGPR or an AGPR.");
639
640 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
641 "Destination register of the copy should be an AGPR.");
642
643 const SIRegisterInfo &RI = TII.getRegisterInfo();
644
645 // First try to find defining accvgpr_write to avoid temporary registers.
646 // In the case of copies of overlapping AGPRs, we conservatively do not
647 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
648 // an accvgpr_write used for this same copy due to implicit-defs
649 if (!RegsOverlap) {
650 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
651 --Def;
652
653 if (!Def->modifiesRegister(SrcReg, &RI))
654 continue;
655
656 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
657 Def->getOperand(0).getReg() != SrcReg)
658 break;
659
660 MachineOperand &DefOp = Def->getOperand(1);
661 assert(DefOp.isReg() || DefOp.isImm());
662
663 if (DefOp.isReg()) {
664 bool SafeToPropagate = true;
665 // Check that register source operand is not clobbered before MI.
666 // Immediate operands are always safe to propagate.
667 for (auto I = Def; I != MI && SafeToPropagate; ++I)
668 if (I->modifiesRegister(DefOp.getReg(), &RI))
669 SafeToPropagate = false;
670
671 if (!SafeToPropagate)
672 break;
673
674 DefOp.setIsKill(false);
675 }
676
677 MachineInstrBuilder Builder =
678 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
679 .add(DefOp);
680 if (ImpDefSuperReg)
681 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
682
683 if (ImpUseSuperReg) {
684 Builder.addReg(ImpUseSuperReg,
685 getKillRegState(KillSrc) | RegState::Implicit);
686 }
687
688 return;
689 }
690 }
691
692 RS.enterBasicBlockEnd(MBB);
693 RS.backward(std::next(MI));
694
695 // Ideally we want to have three registers for a long reg_sequence copy
696 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
697 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
698 *MBB.getParent());
699
700 // Registers in the sequence are allocated contiguously so we can just
701 // use register number to pick one of three round-robin temps.
702 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
703 Register Tmp =
704 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
706 "VGPR used for an intermediate copy should have been reserved.");
707
708 // Only loop through if there are any free registers left. We don't want to
709 // spill.
710 while (RegNo--) {
711 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
712 /* RestoreAfter */ false, 0,
713 /* AllowSpill */ false);
714 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
715 break;
716 Tmp = Tmp2;
717 RS.setRegUsed(Tmp);
718 }
719
720 // Insert copy to temporary VGPR.
721 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
722 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
723 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
724 } else {
725 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
726 }
727
728 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
729 .addReg(SrcReg, getKillRegState(KillSrc));
730 if (ImpUseSuperReg) {
731 UseBuilder.addReg(ImpUseSuperReg,
732 getKillRegState(KillSrc) | RegState::Implicit);
733 }
734
735 MachineInstrBuilder DefBuilder
736 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
737 .addReg(Tmp, RegState::Kill);
738
739 if (ImpDefSuperReg)
740 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
741}
742
743static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
744 MachineBasicBlock::iterator MI, const DebugLoc &DL,
745 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
746 const TargetRegisterClass *RC, bool Forward) {
747 const SIRegisterInfo &RI = TII.getRegisterInfo();
748 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
749 MachineBasicBlock::iterator I = MI;
750 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
751
752 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
753 int16_t SubIdx = BaseIndices[Idx];
754 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
755 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
756 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
757 unsigned Opcode = AMDGPU::S_MOV_B32;
758
759 // Is SGPR aligned? If so try to combine with next.
760 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
761 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
762 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
763 // Can use SGPR64 copy
764 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
765 SubIdx = RI.getSubRegFromChannel(Channel, 2);
766 DestSubReg = RI.getSubReg(DestReg, SubIdx);
767 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
768 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
769 Opcode = AMDGPU::S_MOV_B64;
770 Idx++;
771 }
772
773 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
774 .addReg(SrcSubReg)
775 .addReg(SrcReg, RegState::Implicit);
776
777 if (!FirstMI)
778 FirstMI = LastMI;
779
780 if (!Forward)
781 I--;
782 }
783
784 assert(FirstMI && LastMI);
785 if (!Forward)
786 std::swap(FirstMI, LastMI);
787
788 FirstMI->addOperand(
789 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
790
791 if (KillSrc)
792 LastMI->addRegisterKilled(SrcReg, &RI);
793}
794
795void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
796 MachineBasicBlock::iterator MI,
797 const DebugLoc &DL, MCRegister DestReg,
798 MCRegister SrcReg, bool KillSrc) const {
799 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
800 unsigned Size = RI.getRegSizeInBits(*RC);
801 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
802 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
803
804 // The rest of copyPhysReg assumes Src and Dst size are the same size.
805 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
806 // we remove Fix16BitCopies and this code block?
807 if (Fix16BitCopies) {
808 if (((Size == 16) != (SrcSize == 16))) {
809 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
810 assert(ST.useRealTrue16Insts());
811 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
812 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
813 RegToFix = SubReg;
814
815 if (DestReg == SrcReg) {
816 // Identity copy. Insert empty bundle since ExpandPostRA expects an
817 // instruction here.
818 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
819 return;
820 }
821 RC = RI.getPhysRegBaseClass(DestReg);
822 Size = RI.getRegSizeInBits(*RC);
823 SrcRC = RI.getPhysRegBaseClass(SrcReg);
824 SrcSize = RI.getRegSizeInBits(*SrcRC);
825 }
826 }
827
828 if (RC == &AMDGPU::VGPR_32RegClass) {
829 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
830 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
831 AMDGPU::AGPR_32RegClass.contains(SrcReg));
832 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
833 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
834 BuildMI(MBB, MI, DL, get(Opc), DestReg)
835 .addReg(SrcReg, getKillRegState(KillSrc));
836 return;
837 }
838
839 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
840 RC == &AMDGPU::SReg_32RegClass) {
841 if (SrcReg == AMDGPU::SCC) {
842 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
843 .addImm(1)
844 .addImm(0);
845 return;
846 }
847
848 if (DestReg == AMDGPU::VCC_LO) {
849 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
850 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
851 .addReg(SrcReg, getKillRegState(KillSrc));
852 } else {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
858 }
859
860 return;
861 }
862
863 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
864 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
865 return;
866 }
867
868 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
869 .addReg(SrcReg, getKillRegState(KillSrc));
870 return;
871 }
872
873 if (RC == &AMDGPU::SReg_64RegClass) {
874 if (SrcReg == AMDGPU::SCC) {
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
876 .addImm(1)
877 .addImm(0);
878 return;
879 }
880
881 if (DestReg == AMDGPU::VCC) {
882 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 } else {
886 // FIXME: Hack until VReg_1 removed.
887 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
888 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
889 .addImm(0)
890 .addReg(SrcReg, getKillRegState(KillSrc));
891 }
892
893 return;
894 }
895
896 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
897 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
898 return;
899 }
900
901 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 if (DestReg == AMDGPU::SCC) {
907 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
908 // but SelectionDAG emits such copies for i1 sources.
909 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
910 // This copy can only be produced by patterns
911 // with explicit SCC, which are known to be enabled
912 // only for subtargets with S_CMP_LG_U64 present.
913 assert(ST.hasScalarCompareEq64());
914 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
915 .addReg(SrcReg, getKillRegState(KillSrc))
916 .addImm(0);
917 } else {
918 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
919 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
920 .addReg(SrcReg, getKillRegState(KillSrc))
921 .addImm(0);
922 }
923
924 return;
925 }
926
927 if (RC == &AMDGPU::AGPR_32RegClass) {
928 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
929 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
930 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
931 .addReg(SrcReg, getKillRegState(KillSrc));
932 return;
933 }
934
935 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
936 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
937 .addReg(SrcReg, getKillRegState(KillSrc));
938 return;
939 }
940
941 // FIXME: Pass should maintain scavenger to avoid scan through the block on
942 // every AGPR spill.
943 RegScavenger RS;
944 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
945 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
946 return;
947 }
948
949 if (Size == 16) {
950 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
951 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
952 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
953
954 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
955 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
956 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
957 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
958 bool DstLow = !AMDGPU::isHi(DestReg, RI);
959 bool SrcLow = !AMDGPU::isHi(SrcReg, RI);
960 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
961 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
962
963 if (IsSGPRDst) {
964 if (!IsSGPRSrc) {
965 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
966 return;
967 }
968
969 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
970 .addReg(NewSrcReg, getKillRegState(KillSrc));
971 return;
972 }
973
974 if (IsAGPRDst || IsAGPRSrc) {
975 if (!DstLow || !SrcLow) {
976 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
977 "Cannot use hi16 subreg with an AGPR!");
978 }
979
980 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
981 return;
982 }
983
984 if (ST.hasTrue16BitInsts()) {
985 if (IsSGPRSrc) {
986 assert(SrcLow);
987 SrcReg = NewSrcReg;
988 }
989 // Use the smaller instruction encoding if possible.
990 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
991 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
992 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
993 .addReg(SrcReg);
994 } else {
995 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
996 .addImm(0) // src0_modifiers
997 .addReg(SrcReg)
998 .addImm(0); // op_sel
999 }
1000 return;
1001 }
1002
1003 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1004 if (!DstLow || !SrcLow) {
1005 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1006 "Cannot use hi16 subreg on VI!");
1007 }
1008
1009 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1010 .addReg(NewSrcReg, getKillRegState(KillSrc));
1011 return;
1012 }
1013
1014 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1015 .addImm(0) // src0_modifiers
1016 .addReg(NewSrcReg)
1017 .addImm(0) // clamp
1018 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1019 : AMDGPU::SDWA::SdwaSel::WORD_1)
1020 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1021 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1022 : AMDGPU::SDWA::SdwaSel::WORD_1)
1023 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1024 // First implicit operand is $exec.
1025 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1026 return;
1027 }
1028
1029 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1030 if (ST.hasMovB64()) {
1031 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1032 .addReg(SrcReg, getKillRegState(KillSrc));
1033 return;
1034 }
1035 if (ST.hasPkMovB32()) {
1036 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1037 .addImm(SISrcMods::OP_SEL_1)
1038 .addReg(SrcReg)
1039 .addImm(SISrcMods::OP_SEL_1)
1040 .addReg(SrcReg)
1041 .addImm(0) // op_sel_lo
1042 .addImm(0) // op_sel_hi
1043 .addImm(0) // neg_lo
1044 .addImm(0) // neg_hi
1045 .addImm(0) // clamp
1046 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1047 return;
1048 }
1049 }
1050
1051 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1052 if (RI.isSGPRClass(RC)) {
1053 if (!RI.isSGPRClass(SrcRC)) {
1054 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1055 return;
1056 }
1057 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1058 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1059 Forward);
1060 return;
1061 }
1062
1063 unsigned EltSize = 4;
1064 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1065 if (RI.isAGPRClass(RC)) {
1066 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1067 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1068 else if (RI.hasVGPRs(SrcRC) ||
1069 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1070 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1071 else
1072 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1073 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1074 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1075 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1076 (RI.isProperlyAlignedRC(*RC) &&
1077 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1078 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1079 if (ST.hasMovB64()) {
1080 Opcode = AMDGPU::V_MOV_B64_e32;
1081 EltSize = 8;
1082 } else if (ST.hasPkMovB32()) {
1083 Opcode = AMDGPU::V_PK_MOV_B32;
1084 EltSize = 8;
1085 }
1086 }
1087
1088 // For the cases where we need an intermediate instruction/temporary register
1089 // (destination is an AGPR), we need a scavenger.
1090 //
1091 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1092 // whole block for every handled copy.
1093 std::unique_ptr<RegScavenger> RS;
1094 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1095 RS.reset(new RegScavenger());
1096
1097 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1098
1099 // If there is an overlap, we can't kill the super-register on the last
1100 // instruction, since it will also kill the components made live by this def.
1101 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1102 const bool CanKillSuperReg = KillSrc && !Overlap;
1103
1104 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1105 unsigned SubIdx;
1106 if (Forward)
1107 SubIdx = SubIndices[Idx];
1108 else
1109 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1110 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1111 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1112 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1113
1114 bool IsFirstSubreg = Idx == 0;
1115 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1116
1117 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1118 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1119 Register ImpUseSuper = SrcReg;
1120 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1121 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1122 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1123 MachineInstrBuilder MIB =
1124 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1125 .addImm(SISrcMods::OP_SEL_1)
1126 .addReg(SrcSubReg)
1127 .addImm(SISrcMods::OP_SEL_1)
1128 .addReg(SrcSubReg)
1129 .addImm(0) // op_sel_lo
1130 .addImm(0) // op_sel_hi
1131 .addImm(0) // neg_lo
1132 .addImm(0) // neg_hi
1133 .addImm(0) // clamp
1134 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 if (IsFirstSubreg)
1136 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1137 } else {
1138 MachineInstrBuilder Builder =
1139 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1140 if (IsFirstSubreg)
1141 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1142
1143 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 }
1145 }
1146}
1147
1148int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1149 int NewOpc;
1150
1151 // Try to map original to commuted opcode
1152 NewOpc = AMDGPU::getCommuteRev(Opcode);
1153 if (NewOpc != -1)
1154 // Check if the commuted (REV) opcode exists on the target.
1155 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1156
1157 // Try to map commuted to original opcode
1158 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1159 if (NewOpc != -1)
1160 // Check if the original (non-REV) opcode exists on the target.
1161 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1162
1163 return Opcode;
1164}
1165
1166void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1167 MachineBasicBlock::iterator MI,
1168 const DebugLoc &DL, Register DestReg,
1169 int64_t Value) const {
1170 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1171 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1172 if (RegClass == &AMDGPU::SReg_32RegClass ||
1173 RegClass == &AMDGPU::SGPR_32RegClass ||
1174 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1175 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1176 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1177 .addImm(Value);
1178 return;
1179 }
1180
1181 if (RegClass == &AMDGPU::SReg_64RegClass ||
1182 RegClass == &AMDGPU::SGPR_64RegClass ||
1183 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1184 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1185 .addImm(Value);
1186 return;
1187 }
1188
1189 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1190 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1191 .addImm(Value);
1192 return;
1193 }
1194 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1195 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1196 .addImm(Value);
1197 return;
1198 }
1199
1200 unsigned EltSize = 4;
1201 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1202 if (RI.isSGPRClass(RegClass)) {
1203 if (RI.getRegSizeInBits(*RegClass) > 32) {
1204 Opcode = AMDGPU::S_MOV_B64;
1205 EltSize = 8;
1206 } else {
1207 Opcode = AMDGPU::S_MOV_B32;
1208 EltSize = 4;
1209 }
1210 }
1211
1212 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
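// Only the lowest sub-register receives the immediate; every remaining
// sub-register is written with zero.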
1213 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1214 int64_t IdxValue = Idx == 0 ? Value : 0;
1215
1216 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1217 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1218 Builder.addImm(IdxValue);
1219 }
1220}
1221
1222const TargetRegisterClass *
1223SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1224 return &AMDGPU::VGPR_32RegClass;
1225}
1226
1227void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1228 MachineBasicBlock::iterator I,
1229 const DebugLoc &DL, Register DstReg,
1230 ArrayRef<MachineOperand> Cond,
1231 Register TrueReg,
1232 Register FalseReg) const {
1233 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1234 const TargetRegisterClass *BoolXExecRC =
1235 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1236 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1237 "Not a VGPR32 reg");
1238
1239 if (Cond.size() == 1) {
1240 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1241 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1242 .add(Cond[0]);
1243 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1244 .addImm(0)
1245 .addReg(FalseReg)
1246 .addImm(0)
1247 .addReg(TrueReg)
1248 .addReg(SReg);
1249 } else if (Cond.size() == 2) {
1250 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1251 switch (Cond[0].getImm()) {
1252 case SIInstrInfo::SCC_TRUE: {
1253 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1254 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1255 : AMDGPU::S_CSELECT_B64), SReg)
1256 .addImm(1)
1257 .addImm(0);
1258 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1259 .addImm(0)
1260 .addReg(FalseReg)
1261 .addImm(0)
1262 .addReg(TrueReg)
1263 .addReg(SReg);
1264 break;
1265 }
1266 case SIInstrInfo::SCC_FALSE: {
1267 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1268 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1269 : AMDGPU::S_CSELECT_B64), SReg)
1270 .addImm(0)
1271 .addImm(1);
1272 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1273 .addImm(0)
1274 .addReg(FalseReg)
1275 .addImm(0)
1276 .addReg(TrueReg)
1277 .addReg(SReg);
1278 break;
1279 }
1280 case SIInstrInfo::VCCNZ: {
1281 MachineOperand RegOp = Cond[1];
1282 RegOp.setImplicit(false);
1283 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1284 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1285 .add(RegOp);
1286 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1287 .addImm(0)
1288 .addReg(FalseReg)
1289 .addImm(0)
1290 .addReg(TrueReg)
1291 .addReg(SReg);
1292 break;
1293 }
1294 case SIInstrInfo::VCCZ: {
1295 MachineOperand RegOp = Cond[1];
1296 RegOp.setImplicit(false);
1297 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1298 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1299 .add(RegOp);
1300 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1301 .addImm(0)
1302 .addReg(TrueReg)
1303 .addImm(0)
1304 .addReg(FalseReg)
1305 .addReg(SReg);
1306 break;
1307 }
1308 case SIInstrInfo::EXECNZ: {
1309 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1310 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1311 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1312 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1313 .addImm(0);
1314 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1315 : AMDGPU::S_CSELECT_B64), SReg)
1316 .addImm(1)
1317 .addImm(0);
1318 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1319 .addImm(0)
1320 .addReg(FalseReg)
1321 .addImm(0)
1322 .addReg(TrueReg)
1323 .addReg(SReg);
1324 break;
1325 }
1326 case SIInstrInfo::EXECZ: {
1327 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1328 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1329 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1330 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1331 .addImm(0);
1332 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1333 : AMDGPU::S_CSELECT_B64), SReg)
1334 .addImm(0)
1335 .addImm(1);
1336 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1337 .addImm(0)
1338 .addReg(FalseReg)
1339 .addImm(0)
1340 .addReg(TrueReg)
1341 .addReg(SReg);
1342 llvm_unreachable("Unhandled branch predicate EXECZ");
1343 break;
1344 }
1345 default:
1346 llvm_unreachable("invalid branch predicate");
1347 }
1348 } else {
1349 llvm_unreachable("Can only handle Cond size 1 or 2");
1350 }
1351}
1352
1353Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1354 MachineBasicBlock::iterator I,
1355 const DebugLoc &DL,
1356 Register SrcReg, int Value) const {
1357 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1358 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1359 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1360 .addImm(Value)
1361 .addReg(SrcReg);
1362
1363 return Reg;
1364}
1365
1366Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1367 MachineBasicBlock::iterator I,
1368 const DebugLoc &DL,
1369 Register SrcReg, int Value) const {
1370 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1371 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1372 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1373 .addImm(Value)
1374 .addReg(SrcReg);
1375
1376 return Reg;
1377}
1378
1379unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1380
1381 if (RI.isAGPRClass(DstRC))
1382 return AMDGPU::COPY;
1383 if (RI.getRegSizeInBits(*DstRC) == 16) {
1384 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1385 // before RA.
1386 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1387 } else if (RI.getRegSizeInBits(*DstRC) == 32) {
1388 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1389 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1390 return AMDGPU::S_MOV_B64;
1391 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1392 return AMDGPU::V_MOV_B64_PSEUDO;
1393 }
1394 return AMDGPU::COPY;
1395}
1396
1397const MCInstrDesc &
1398SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1399 bool IsIndirectSrc) const {
1400 if (IsIndirectSrc) {
1401 if (VecSize <= 32) // 4 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1403 if (VecSize <= 64) // 8 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1405 if (VecSize <= 96) // 12 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1407 if (VecSize <= 128) // 16 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1409 if (VecSize <= 160) // 20 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 256) // 32 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1441 if (VecSize <= 288) // 36 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1443 if (VecSize <= 320) // 40 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1445 if (VecSize <= 352) // 44 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1447 if (VecSize <= 384) // 48 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1449 if (VecSize <= 512) // 64 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1451 if (VecSize <= 1024) // 128 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1453
1454 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1455}
1456
1457static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1458 if (VecSize <= 32) // 4 bytes
1459 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1460 if (VecSize <= 64) // 8 bytes
1461 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1462 if (VecSize <= 96) // 12 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1464 if (VecSize <= 128) // 16 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1466 if (VecSize <= 160) // 20 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1468 if (VecSize <= 256) // 32 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1470 if (VecSize <= 288) // 36 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1472 if (VecSize <= 320) // 40 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1474 if (VecSize <= 352) // 44 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1476 if (VecSize <= 384) // 48 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1478 if (VecSize <= 512) // 64 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1480 if (VecSize <= 1024) // 128 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1482
1483 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1484}
1485
1486static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1487 if (VecSize <= 32) // 4 bytes
1488 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1489 if (VecSize <= 64) // 8 bytes
1490 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1491 if (VecSize <= 96) // 12 bytes
1492 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1493 if (VecSize <= 128) // 16 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1495 if (VecSize <= 160) // 20 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1497 if (VecSize <= 256) // 32 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1499 if (VecSize <= 288) // 36 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1501 if (VecSize <= 320) // 40 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1503 if (VecSize <= 352) // 44 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1505 if (VecSize <= 384) // 48 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1507 if (VecSize <= 512) // 64 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1509 if (VecSize <= 1024) // 128 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1511
1512 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1513}
1514
1515static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1516 if (VecSize <= 64) // 8 bytes
1517 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1518 if (VecSize <= 128) // 16 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1520 if (VecSize <= 256) // 32 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1522 if (VecSize <= 512) // 64 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1524 if (VecSize <= 1024) // 128 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1526
1527 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1528}
1529
1530const MCInstrDesc &
1531SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1532 bool IsSGPR) const {
1533 if (IsSGPR) {
1534 switch (EltSize) {
1535 case 32:
1536 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1537 case 64:
1538 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1539 default:
1540 llvm_unreachable("invalid reg indexing elt size");
1541 }
1542 }
1543
1544 assert(EltSize == 32 && "invalid reg indexing elt size");
1545 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1546}
1547
1548static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1549 switch (Size) {
1550 case 4:
1551 return AMDGPU::SI_SPILL_S32_SAVE;
1552 case 8:
1553 return AMDGPU::SI_SPILL_S64_SAVE;
1554 case 12:
1555 return AMDGPU::SI_SPILL_S96_SAVE;
1556 case 16:
1557 return AMDGPU::SI_SPILL_S128_SAVE;
1558 case 20:
1559 return AMDGPU::SI_SPILL_S160_SAVE;
1560 case 24:
1561 return AMDGPU::SI_SPILL_S192_SAVE;
1562 case 28:
1563 return AMDGPU::SI_SPILL_S224_SAVE;
1564 case 32:
1565 return AMDGPU::SI_SPILL_S256_SAVE;
1566 case 36:
1567 return AMDGPU::SI_SPILL_S288_SAVE;
1568 case 40:
1569 return AMDGPU::SI_SPILL_S320_SAVE;
1570 case 44:
1571 return AMDGPU::SI_SPILL_S352_SAVE;
1572 case 48:
1573 return AMDGPU::SI_SPILL_S384_SAVE;
1574 case 64:
1575 return AMDGPU::SI_SPILL_S512_SAVE;
1576 case 128:
1577 return AMDGPU::SI_SPILL_S1024_SAVE;
1578 default:
1579 llvm_unreachable("unknown register size");
1580 }
1581}
1582
1583static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1584 switch (Size) {
1585 case 4:
1586 return AMDGPU::SI_SPILL_V32_SAVE;
1587 case 8:
1588 return AMDGPU::SI_SPILL_V64_SAVE;
1589 case 12:
1590 return AMDGPU::SI_SPILL_V96_SAVE;
1591 case 16:
1592 return AMDGPU::SI_SPILL_V128_SAVE;
1593 case 20:
1594 return AMDGPU::SI_SPILL_V160_SAVE;
1595 case 24:
1596 return AMDGPU::SI_SPILL_V192_SAVE;
1597 case 28:
1598 return AMDGPU::SI_SPILL_V224_SAVE;
1599 case 32:
1600 return AMDGPU::SI_SPILL_V256_SAVE;
1601 case 36:
1602 return AMDGPU::SI_SPILL_V288_SAVE;
1603 case 40:
1604 return AMDGPU::SI_SPILL_V320_SAVE;
1605 case 44:
1606 return AMDGPU::SI_SPILL_V352_SAVE;
1607 case 48:
1608 return AMDGPU::SI_SPILL_V384_SAVE;
1609 case 64:
1610 return AMDGPU::SI_SPILL_V512_SAVE;
1611 case 128:
1612 return AMDGPU::SI_SPILL_V1024_SAVE;
1613 default:
1614 llvm_unreachable("unknown register size");
1615 }
1616}
1617
1618static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1619 switch (Size) {
1620 case 4:
1621 return AMDGPU::SI_SPILL_A32_SAVE;
1622 case 8:
1623 return AMDGPU::SI_SPILL_A64_SAVE;
1624 case 12:
1625 return AMDGPU::SI_SPILL_A96_SAVE;
1626 case 16:
1627 return AMDGPU::SI_SPILL_A128_SAVE;
1628 case 20:
1629 return AMDGPU::SI_SPILL_A160_SAVE;
1630 case 24:
1631 return AMDGPU::SI_SPILL_A192_SAVE;
1632 case 28:
1633 return AMDGPU::SI_SPILL_A224_SAVE;
1634 case 32:
1635 return AMDGPU::SI_SPILL_A256_SAVE;
1636 case 36:
1637 return AMDGPU::SI_SPILL_A288_SAVE;
1638 case 40:
1639 return AMDGPU::SI_SPILL_A320_SAVE;
1640 case 44:
1641 return AMDGPU::SI_SPILL_A352_SAVE;
1642 case 48:
1643 return AMDGPU::SI_SPILL_A384_SAVE;
1644 case 64:
1645 return AMDGPU::SI_SPILL_A512_SAVE;
1646 case 128:
1647 return AMDGPU::SI_SPILL_A1024_SAVE;
1648 default:
1649 llvm_unreachable("unknown register size");
1650 }
1651}
1652
1653static unsigned getAVSpillSaveOpcode(unsigned Size) {
1654 switch (Size) {
1655 case 4:
1656 return AMDGPU::SI_SPILL_AV32_SAVE;
1657 case 8:
1658 return AMDGPU::SI_SPILL_AV64_SAVE;
1659 case 12:
1660 return AMDGPU::SI_SPILL_AV96_SAVE;
1661 case 16:
1662 return AMDGPU::SI_SPILL_AV128_SAVE;
1663 case 20:
1664 return AMDGPU::SI_SPILL_AV160_SAVE;
1665 case 24:
1666 return AMDGPU::SI_SPILL_AV192_SAVE;
1667 case 28:
1668 return AMDGPU::SI_SPILL_AV224_SAVE;
1669 case 32:
1670 return AMDGPU::SI_SPILL_AV256_SAVE;
1671 case 36:
1672 return AMDGPU::SI_SPILL_AV288_SAVE;
1673 case 40:
1674 return AMDGPU::SI_SPILL_AV320_SAVE;
1675 case 44:
1676 return AMDGPU::SI_SPILL_AV352_SAVE;
1677 case 48:
1678 return AMDGPU::SI_SPILL_AV384_SAVE;
1679 case 64:
1680 return AMDGPU::SI_SPILL_AV512_SAVE;
1681 case 128:
1682 return AMDGPU::SI_SPILL_AV1024_SAVE;
1683 default:
1684 llvm_unreachable("unknown register size");
1685 }
1686}
1687
1688static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1689 bool IsVectorSuperClass) {
1690 // Currently, there is only 32-bit WWM register spills needed.
1691 if (Size != 4)
1692 llvm_unreachable("unknown wwm register spill size");
1693
1694 if (IsVectorSuperClass)
1695 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1696
1697 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1698}
1699
1700static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1701 const TargetRegisterClass *RC,
1702 unsigned Size,
1703 const SIRegisterInfo &TRI,
1704 const SIMachineFunctionInfo &MFI) {
1705 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1706
1707 // Choose the right opcode if spilling a WWM register.
1708 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1709 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1710
1711 if (IsVectorSuperClass)
1712 return getAVSpillSaveOpcode(Size);
1713
1714 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1715 : getVGPRSpillSaveOpcode(Size);
1716}
1717
1718void SIInstrInfo::storeRegToStackSlot(
1719 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1720 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1721 const TargetRegisterInfo *TRI, Register VReg) const {
1722 MachineFunction *MF = MBB.getParent();
1723 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1724 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1725 const DebugLoc &DL = MBB.findDebugLoc(MI);
1726
1727 MachinePointerInfo PtrInfo
1728 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1729 MachineMemOperand *MMO = MF->getMachineMemOperand(
1730 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1731 FrameInfo.getObjectAlign(FrameIndex));
1732 unsigned SpillSize = TRI->getSpillSize(*RC);
1733
1734 MachineRegisterInfo &MRI = MF->getRegInfo();
1735 if (RI.isSGPRClass(RC)) {
1736 MFI->setHasSpilledSGPRs();
1737 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1738 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1739 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1740
1741 // We are only allowed to create one new instruction when spilling
1742 // registers, so we need to use pseudo instruction for spilling SGPRs.
1743 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1744
1745 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1746 // need to make sure we are using the correct register class.
1747 if (SrcReg.isVirtual() && SpillSize == 4) {
1748 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1749 }
1750
1751 BuildMI(MBB, MI, DL, OpDesc)
1752 .addReg(SrcReg, getKillRegState(isKill)) // data
1753 .addFrameIndex(FrameIndex) // addr
1754 .addMemOperand(MMO)
1755 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1756
1757 if (RI.spillSGPRToVGPR())
1758 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1759 return;
1760 }
1761
1762 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1763 SpillSize, RI, *MFI);
1764 MFI->setHasSpilledVGPRs();
1765
1766 BuildMI(MBB, MI, DL, get(Opcode))
1767 .addReg(SrcReg, getKillRegState(isKill)) // data
1768 .addFrameIndex(FrameIndex) // addr
1769 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1770 .addImm(0) // offset
1771 .addMemOperand(MMO);
1772}
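// In short: SGPR spills become SI_SPILL_Sxxx_SAVE pseudos on an SGPRSpill
// stack slot (normally lowered to VGPR-lane writes when spillSGPRToVGPR() is
// enabled), while vector spills become scratch stores that take the stack
// pointer offset register and an immediate offset of 0.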
1773
1774static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1775 switch (Size) {
1776 case 4:
1777 return AMDGPU::SI_SPILL_S32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_S64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_S96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_S128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_S160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_S192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_S224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_S256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_S288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_S320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_S352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_S384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_S512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_S1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 4:
1812 return AMDGPU::SI_SPILL_V32_RESTORE;
1813 case 8:
1814 return AMDGPU::SI_SPILL_V64_RESTORE;
1815 case 12:
1816 return AMDGPU::SI_SPILL_V96_RESTORE;
1817 case 16:
1818 return AMDGPU::SI_SPILL_V128_RESTORE;
1819 case 20:
1820 return AMDGPU::SI_SPILL_V160_RESTORE;
1821 case 24:
1822 return AMDGPU::SI_SPILL_V192_RESTORE;
1823 case 28:
1824 return AMDGPU::SI_SPILL_V224_RESTORE;
1825 case 32:
1826 return AMDGPU::SI_SPILL_V256_RESTORE;
1827 case 36:
1828 return AMDGPU::SI_SPILL_V288_RESTORE;
1829 case 40:
1830 return AMDGPU::SI_SPILL_V320_RESTORE;
1831 case 44:
1832 return AMDGPU::SI_SPILL_V352_RESTORE;
1833 case 48:
1834 return AMDGPU::SI_SPILL_V384_RESTORE;
1835 case 64:
1836 return AMDGPU::SI_SPILL_V512_RESTORE;
1837 case 128:
1838 return AMDGPU::SI_SPILL_V1024_RESTORE;
1839 default:
1840 llvm_unreachable("unknown register size");
1841 }
1842}
1843
1844static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1845 switch (Size) {
1846 case 4:
1847 return AMDGPU::SI_SPILL_A32_RESTORE;
1848 case 8:
1849 return AMDGPU::SI_SPILL_A64_RESTORE;
1850 case 12:
1851 return AMDGPU::SI_SPILL_A96_RESTORE;
1852 case 16:
1853 return AMDGPU::SI_SPILL_A128_RESTORE;
1854 case 20:
1855 return AMDGPU::SI_SPILL_A160_RESTORE;
1856 case 24:
1857 return AMDGPU::SI_SPILL_A192_RESTORE;
1858 case 28:
1859 return AMDGPU::SI_SPILL_A224_RESTORE;
1860 case 32:
1861 return AMDGPU::SI_SPILL_A256_RESTORE;
1862 case 36:
1863 return AMDGPU::SI_SPILL_A288_RESTORE;
1864 case 40:
1865 return AMDGPU::SI_SPILL_A320_RESTORE;
1866 case 44:
1867 return AMDGPU::SI_SPILL_A352_RESTORE;
1868 case 48:
1869 return AMDGPU::SI_SPILL_A384_RESTORE;
1870 case 64:
1871 return AMDGPU::SI_SPILL_A512_RESTORE;
1872 case 128:
1873 return AMDGPU::SI_SPILL_A1024_RESTORE;
1874 default:
1875 llvm_unreachable("unknown register size");
1876 }
1877}
1878
1879static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1880 switch (Size) {
1881 case 4:
1882 return AMDGPU::SI_SPILL_AV32_RESTORE;
1883 case 8:
1884 return AMDGPU::SI_SPILL_AV64_RESTORE;
1885 case 12:
1886 return AMDGPU::SI_SPILL_AV96_RESTORE;
1887 case 16:
1888 return AMDGPU::SI_SPILL_AV128_RESTORE;
1889 case 20:
1890 return AMDGPU::SI_SPILL_AV160_RESTORE;
1891 case 24:
1892 return AMDGPU::SI_SPILL_AV192_RESTORE;
1893 case 28:
1894 return AMDGPU::SI_SPILL_AV224_RESTORE;
1895 case 32:
1896 return AMDGPU::SI_SPILL_AV256_RESTORE;
1897 case 36:
1898 return AMDGPU::SI_SPILL_AV288_RESTORE;
1899 case 40:
1900 return AMDGPU::SI_SPILL_AV320_RESTORE;
1901 case 44:
1902 return AMDGPU::SI_SPILL_AV352_RESTORE;
1903 case 48:
1904 return AMDGPU::SI_SPILL_AV384_RESTORE;
1905 case 64:
1906 return AMDGPU::SI_SPILL_AV512_RESTORE;
1907 case 128:
1908 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1909 default:
1910 llvm_unreachable("unknown register size");
1911 }
1912}
1913
1914static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1915 bool IsVectorSuperClass) {
1916 // Currently, only 32-bit WWM register spills are needed.
1917 if (Size != 4)
1918 llvm_unreachable("unknown wwm register spill size");
1919
1920 if (IsVectorSuperClass)
1921 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1922
1923 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1924}
1925
1926static unsigned
1927getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1928 unsigned Size, const SIRegisterInfo &TRI,
1929 const SIMachineFunctionInfo &MFI) {
1930 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1931
1932 // Choose the right opcode if restoring a WWM register.
1933 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1934 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1935
1936 if (IsVectorSuperClass)
1937 return getAVSpillRestoreOpcode(Size);
1938
1939 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1940 : getVGPRSpillRestoreOpcode(Size);
1941}
1942
1943void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1944 MachineBasicBlock::iterator MI,
1945 Register DestReg, int FrameIndex,
1946 const TargetRegisterClass *RC,
1947 const TargetRegisterInfo *TRI,
1948 Register VReg) const {
1949 MachineFunction *MF = MBB.getParent();
1950 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1951 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1952 const DebugLoc &DL = MBB.findDebugLoc(MI);
1953 unsigned SpillSize = TRI->getSpillSize(*RC);
1954
1955 MachinePointerInfo PtrInfo
1956 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1957
1958 MachineMemOperand *MMO = MF->getMachineMemOperand(
1959 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1960 FrameInfo.getObjectAlign(FrameIndex));
1961
1962 if (RI.isSGPRClass(RC)) {
1963 MFI->setHasSpilledSGPRs();
1964 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1965 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1966 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1967
1968 // FIXME: Maybe this should not include a memoperand because it will be
1969 // lowered to non-memory instructions.
1970 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1971 if (DestReg.isVirtual() && SpillSize == 4) {
1972 MachineRegisterInfo &MRI = MF->getRegInfo();
1973 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1974 }
1975
1976 if (RI.spillSGPRToVGPR())
1977 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1978 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1979 .addFrameIndex(FrameIndex) // addr
1980 .addMemOperand(MMO)
1981 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1982
1983 return;
1984 }
1985
1986 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1987 SpillSize, RI, *MFI);
1988 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1989 .addFrameIndex(FrameIndex) // vaddr
1990 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1991 .addImm(0) // offset
1992 .addMemOperand(MMO);
1993}
1994
1995void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1996 MachineBasicBlock::iterator MI) const {
1997 insertNoops(MBB, MI, 1);
1998}
1999
2000void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2001 MachineBasicBlock::iterator MI,
2002 unsigned Quantity) const {
2003 DebugLoc DL;
2004 while (Quantity > 0) {
2005 unsigned Arg = std::min(Quantity, 8u);
2006 Quantity -= Arg;
2007 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2008 }
2009}
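// s_nop with immediate N covers N + 1 wait states (N is at most 7), so larger
// requests are emitted in batches of up to 8; e.g. insertNoops(MBB, MI, 10)
// produces "s_nop 7" followed by "s_nop 1".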
2010
2011void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2012 auto MF = MBB.getParent();
2013 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2014
2015 assert(Info->isEntryFunction());
2016
2017 if (MBB.succ_empty()) {
2018 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2019 if (HasNoTerminator) {
2020 if (Info->returnsVoid()) {
2021 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2022 } else {
2023 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2024 }
2025 }
2026 }
2027}
2028
2029unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2030 switch (MI.getOpcode()) {
2031 default:
2032 if (MI.isMetaInstruction())
2033 return 0;
2034 return 1; // FIXME: Do wait states equal cycles?
2035
2036 case AMDGPU::S_NOP:
2037 return MI.getOperand(0).getImm() + 1;
2038 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2039 // hazard, even if one exists, won't really be visible. Should we handle it?
2040 }
2041}
2042
2043bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2044 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2045 MachineBasicBlock &MBB = *MI.getParent();
2046 DebugLoc DL = MBB.findDebugLoc(MI);
2047 switch (MI.getOpcode()) {
2048 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2049 case AMDGPU::S_MOV_B64_term:
2050 // This is only a terminator to get the correct spill code placement during
2051 // register allocation.
2052 MI.setDesc(get(AMDGPU::S_MOV_B64));
2053 break;
2054
2055 case AMDGPU::S_MOV_B32_term:
2056 // This is only a terminator to get the correct spill code placement during
2057 // register allocation.
2058 MI.setDesc(get(AMDGPU::S_MOV_B32));
2059 break;
2060
2061 case AMDGPU::S_XOR_B64_term:
2062 // This is only a terminator to get the correct spill code placement during
2063 // register allocation.
2064 MI.setDesc(get(AMDGPU::S_XOR_B64));
2065 break;
2066
2067 case AMDGPU::S_XOR_B32_term:
2068 // This is only a terminator to get the correct spill code placement during
2069 // register allocation.
2070 MI.setDesc(get(AMDGPU::S_XOR_B32));
2071 break;
2072 case AMDGPU::S_OR_B64_term:
2073 // This is only a terminator to get the correct spill code placement during
2074 // register allocation.
2075 MI.setDesc(get(AMDGPU::S_OR_B64));
2076 break;
2077 case AMDGPU::S_OR_B32_term:
2078 // This is only a terminator to get the correct spill code placement during
2079 // register allocation.
2080 MI.setDesc(get(AMDGPU::S_OR_B32));
2081 break;
2082
2083 case AMDGPU::S_ANDN2_B64_term:
2084 // This is only a terminator to get the correct spill code placement during
2085 // register allocation.
2086 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2087 break;
2088
2089 case AMDGPU::S_ANDN2_B32_term:
2090 // This is only a terminator to get the correct spill code placement during
2091 // register allocation.
2092 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2093 break;
2094
2095 case AMDGPU::S_AND_B64_term:
2096 // This is only a terminator to get the correct spill code placement during
2097 // register allocation.
2098 MI.setDesc(get(AMDGPU::S_AND_B64));
2099 break;
2100
2101 case AMDGPU::S_AND_B32_term:
2102 // This is only a terminator to get the correct spill code placement during
2103 // register allocation.
2104 MI.setDesc(get(AMDGPU::S_AND_B32));
2105 break;
2106
2107 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2108 // This is only a terminator to get the correct spill code placement during
2109 // register allocation.
2110 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2111 break;
2112
2113 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2114 // This is only a terminator to get the correct spill code placement during
2115 // register allocation.
2116 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2117 break;
2118
2119 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2120 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2121 break;
2122
2123 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2124 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2125 break;
2126
2127 case AMDGPU::V_MOV_B64_PSEUDO: {
2128 Register Dst = MI.getOperand(0).getReg();
2129 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2130 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2131
2132 const MachineOperand &SrcOp = MI.getOperand(1);
2133 // FIXME: Will this work for 64-bit floating point immediates?
2134 assert(!SrcOp.isFPImm());
2135 if (ST.hasMovB64()) {
2136 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2137 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2138 isUInt<32>(SrcOp.getImm()))
2139 break;
2140 }
2141 if (SrcOp.isImm()) {
2142 APInt Imm(64, SrcOp.getImm());
2143 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2144 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2145 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2146 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2148 .addImm(Lo.getSExtValue())
2150 .addImm(Lo.getSExtValue())
2151 .addImm(0) // op_sel_lo
2152 .addImm(0) // op_sel_hi
2153 .addImm(0) // neg_lo
2154 .addImm(0) // neg_hi
2155 .addImm(0); // clamp
2156 } else {
2157 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2158 .addImm(Lo.getSExtValue())
2160 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2161 .addImm(Hi.getSExtValue())
2163 }
2164 } else {
2165 assert(SrcOp.isReg());
2166 if (ST.hasPkMovB32() &&
2167 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2168 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2169 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2170 .addReg(SrcOp.getReg())
2172 .addReg(SrcOp.getReg())
2173 .addImm(0) // op_sel_lo
2174 .addImm(0) // op_sel_hi
2175 .addImm(0) // neg_lo
2176 .addImm(0) // neg_hi
2177 .addImm(0); // clamp
2178 } else {
2179 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2180 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2182 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2183 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2185 }
2186 }
2187 MI.eraseFromParent();
2188 break;
2189 }
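// Summary of the V_MOV_B64_PSEUDO expansion above: use v_mov_b64_e32 when the
// subtarget has it and the source is representable, otherwise try a single
// v_pk_mov_b32 (identical inline-constant halves, or a non-AGPR register
// source), and finally fall back to two v_mov_b32_e32 writes of sub0 and sub1.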
2190 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2191 expandMovDPP64(MI);
2192 break;
2193 }
2194 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2195 const MachineOperand &SrcOp = MI.getOperand(1);
2196 assert(!SrcOp.isFPImm());
2197 APInt Imm(64, SrcOp.getImm());
2198 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2199 MI.setDesc(get(AMDGPU::S_MOV_B64));
2200 break;
2201 }
2202
2203 Register Dst = MI.getOperand(0).getReg();
2204 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2205 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2206
2207 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2208 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2209 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2210 .addImm(Lo.getSExtValue())
2212 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2213 .addImm(Hi.getSExtValue())
2215 MI.eraseFromParent();
2216 break;
2217 }
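// For example, a 64-bit literal that is neither a signed 32-bit value nor an
// inline constant is split into two s_mov_b32 of the low and high halves;
// otherwise the pseudo simply becomes s_mov_b64.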
2218 case AMDGPU::V_SET_INACTIVE_B32: {
2219 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2220 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2221 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
2222 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
2223 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2224 .add(MI.getOperand(1));
2225 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2226 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2227 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2228 .add(MI.getOperand(2));
2229 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2230 .addReg(Exec);
2231 MI.eraseFromParent();
2232 break;
2233 }
2234 case AMDGPU::V_SET_INACTIVE_B64: {
2235 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2236 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2237 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2238 MI.getOperand(0).getReg())
2239 .add(MI.getOperand(1));
2240 expandPostRAPseudo(*Copy);
2241 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2242 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2243 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2244 MI.getOperand(0).getReg())
2245 .add(MI.getOperand(2));
2246 expandPostRAPseudo(*Copy);
2247 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2248 .addReg(Exec);
2249 MI.eraseFromParent();
2250 break;
2251 }
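// Both V_SET_INACTIVE expansions follow the same pattern: write the active
// value under the current exec mask, invert exec (clobbering SCC), write the
// inactive value into the newly enabled lanes, then invert exec back.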
2252 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2253 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2254 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2255 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2256 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2257 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2258 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2259 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2260 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2261 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2263 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2264 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2265 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2266 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2267 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2268 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2269 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2270 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2271 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2272 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2273 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2274 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2275 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2281 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2282
2283 unsigned Opc;
2284 if (RI.hasVGPRs(EltRC)) {
2285 Opc = AMDGPU::V_MOVRELD_B32_e32;
2286 } else {
2287 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2288 : AMDGPU::S_MOVRELD_B32;
2289 }
2290
2291 const MCInstrDesc &OpDesc = get(Opc);
2292 Register VecReg = MI.getOperand(0).getReg();
2293 bool IsUndef = MI.getOperand(1).isUndef();
2294 unsigned SubReg = MI.getOperand(3).getImm();
2295 assert(VecReg == MI.getOperand(1).getReg());
2296
2298 BuildMI(MBB, MI, DL, OpDesc)
2299 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2300 .add(MI.getOperand(2))
2302 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2303
2304 const int ImpDefIdx =
2305 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2306 const int ImpUseIdx = ImpDefIdx + 1;
2307 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2308 MI.eraseFromParent();
2309 break;
2310 }
2311 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2321 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2322 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2324 Register VecReg = MI.getOperand(0).getReg();
2325 bool IsUndef = MI.getOperand(1).isUndef();
2326 Register Idx = MI.getOperand(3).getReg();
2327 Register SubReg = MI.getOperand(4).getImm();
2328
2329 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2330 .addReg(Idx)
2332 SetOn->getOperand(3).setIsUndef();
2333
2334 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2336 BuildMI(MBB, MI, DL, OpDesc)
2337 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2338 .add(MI.getOperand(2))
2340 .addReg(VecReg,
2341 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2342
2343 const int ImpDefIdx =
2344 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2345 const int ImpUseIdx = ImpDefIdx + 1;
2346 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2347
2348 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2349
2350 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2351
2352 MI.eraseFromParent();
2353 break;
2354 }
2355 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2356 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2357 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2358 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2359 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2360 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2361 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2362 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2363 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2364 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2365 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2366 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2368 Register Dst = MI.getOperand(0).getReg();
2369 Register VecReg = MI.getOperand(1).getReg();
2370 bool IsUndef = MI.getOperand(1).isUndef();
2371 Register Idx = MI.getOperand(2).getReg();
2372 Register SubReg = MI.getOperand(3).getImm();
2373
2374 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2375 .addReg(Idx)
2377 SetOn->getOperand(3).setIsUndef();
2378
2379 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2380 .addDef(Dst)
2381 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2382 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2383
2384 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2385
2386 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2387
2388 MI.eraseFromParent();
2389 break;
2390 }
2391 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2392 MachineFunction &MF = *MBB.getParent();
2393 Register Reg = MI.getOperand(0).getReg();
2394 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2395 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2396 MachineOperand OpLo = MI.getOperand(1);
2397 MachineOperand OpHi = MI.getOperand(2);
2398
2399 // Create a bundle so these instructions won't be re-ordered by the
2400 // post-RA scheduler.
2401 MIBundleBuilder Bundler(MBB, MI);
2402 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2403
2404 // What we want here is an offset from the value returned by s_getpc (which
2405 // is the address of the s_add_u32 instruction) to the global variable, but
2406 // since the encoding of $symbol starts 4 bytes after the start of the
2407 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2408 // small. This requires us to add 4 to the global variable offset in order
2409 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2410 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2411 // instruction.
2412
2413 int64_t Adjust = 0;
2414 if (ST.hasGetPCZeroExtension()) {
2415 // Fix up hardware that does not sign-extend the 48-bit PC value by
2416 // inserting: s_sext_i32_i16 reghi, reghi
2417 Bundler.append(
2418 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2419 Adjust += 4;
2420 }
2421
2422 if (OpLo.isGlobal())
2423 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2424 Bundler.append(
2425 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2426
2427 if (OpHi.isGlobal())
2428 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2429 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2430 .addReg(RegHi)
2431 .add(OpHi));
2432
2433 finalizeBundle(MBB, Bundler.begin());
2434
2435 MI.eraseFromParent();
2436 break;
2437 }
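// The resulting bundle looks roughly like this (offsets as explained above,
// register numbers illustrative):
//   s_getpc_b64  s[0:1]
//   s_add_u32    s0, s0, sym@rel32@lo+4
//   s_addc_u32   s1, s1, sym@rel32@hi+12
// plus an s_sext_i32_i16 of the high half on subtargets where s_getpc_b64
// zero-extends the 48-bit PC.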
2438 case AMDGPU::ENTER_STRICT_WWM: {
2439 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2440 // Whole Wave Mode is entered.
2441 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2442 : AMDGPU::S_OR_SAVEEXEC_B64));
2443 break;
2444 }
2445 case AMDGPU::ENTER_STRICT_WQM: {
2446 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2447 // STRICT_WQM is entered.
2448 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2449 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2450 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2451 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2452 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2453
2454 MI.eraseFromParent();
2455 break;
2456 }
2457 case AMDGPU::EXIT_STRICT_WWM:
2458 case AMDGPU::EXIT_STRICT_WQM: {
2459 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2460 // WWM/STRICT_WQM is exited.
2461 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2462 break;
2463 }
2464 case AMDGPU::ENTER_PSEUDO_WM:
2465 case AMDGPU::EXIT_PSEUDO_WM: {
2466 // These do nothing.
2467 MI.eraseFromParent();
2468 break;
2469 }
2470 case AMDGPU::SI_RETURN: {
2471 const MachineFunction *MF = MBB.getParent();
2472 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2473 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2474 // Hiding the return address use with SI_RETURN may lead to extra kills in
2475 // the function and missing live-ins. We are fine in practice because callee
2476 // saved register handling ensures the register value is restored before
2477 // RET, but we need the undef flag here to appease the MachineVerifier
2478 // liveness checks.
2479 MachineInstrBuilder MIB =
2480 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2481 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2482
2483 MIB.copyImplicitOps(MI);
2484 MI.eraseFromParent();
2485 break;
2486 }
2487
2488 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2489 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2490 MI.setDesc(get(AMDGPU::S_MUL_U64));
2491 break;
2492
2493 case AMDGPU::S_GETPC_B64_pseudo:
2494 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2495 if (ST.hasGetPCZeroExtension()) {
2496 Register Dst = MI.getOperand(0).getReg();
2497 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2498 // Fix up hardware that does not sign-extend the 48-bit PC value by
2499 // inserting: s_sext_i32_i16 dsthi, dsthi
2500 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2501 DstHi)
2502 .addReg(DstHi);
2503 }
2504 break;
2505 }
2506 return true;
2507}
2508
2511 unsigned SubIdx, const MachineInstr &Orig,
2512 const TargetRegisterInfo &RI) const {
2513
2514 // Try shrinking the instruction to remat only the part needed for the
2515 // current context.
2516 // TODO: Handle more cases.
2517 unsigned Opcode = Orig.getOpcode();
2518 switch (Opcode) {
2519 case AMDGPU::S_LOAD_DWORDX16_IMM:
2520 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2521 if (SubIdx != 0)
2522 break;
2523
2524 if (I == MBB.end())
2525 break;
2526
2527 if (I->isBundled())
2528 break;
2529
2530 // Look for a single use of the register that is also a subreg.
2531 Register RegToFind = Orig.getOperand(0).getReg();
2532 MachineOperand *UseMO = nullptr;
2533 for (auto &CandMO : I->operands()) {
2534 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2535 continue;
2536 if (UseMO) {
2537 UseMO = nullptr;
2538 break;
2539 }
2540 UseMO = &CandMO;
2541 }
2542 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2543 break;
2544
2545 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2546 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2547
2548 MachineFunction *MF = MBB.getParent();
2549 MachineRegisterInfo &MRI = MF->getRegInfo();
2550 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2551
2552 unsigned NewOpcode = -1;
2553 if (SubregSize == 256)
2554 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2555 else if (SubregSize == 128)
2556 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2557 else
2558 break;
2559
2560 const MCInstrDesc &TID = get(NewOpcode);
2561 const TargetRegisterClass *NewRC =
2562 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2563 MRI.setRegClass(DestReg, NewRC);
2564
2565 UseMO->setReg(DestReg);
2566 UseMO->setSubReg(AMDGPU::NoSubRegister);
2567
2568 // Use a smaller load with the desired size, possibly with updated offset.
2569 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2570 MI->setDesc(TID);
2571 MI->getOperand(0).setReg(DestReg);
2572 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2573 if (Offset) {
2574 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2575 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2576 OffsetMO->setImm(FinalOffset);
2577 }
2578 SmallVector<MachineMemOperand *> NewMMOs;
2579 for (const MachineMemOperand *MemOp : Orig.memoperands())
2580 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2581 SubregSize / 8));
2582 MI->setMemRefs(*MF, NewMMOs);
2583
2584 MBB.insert(I, MI);
2585 return;
2586 }
2587
2588 default:
2589 break;
2590 }
2591
2592 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2593}
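// For example, if only a 256-bit subregister of an S_LOAD_DWORDX16_IMM result
// is used at the insertion point, the rematerialized clone is shrunk to an
// S_LOAD_DWORDX8_IMM with the destination register class, immediate offset,
// and memory-operand size adjusted to the 256-bit subrange.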
2594
2595std::pair<MachineInstr*, MachineInstr*>
2597 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2598
2599 if (ST.hasMovB64() &&
2601 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2602 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2603 return std::pair(&MI, nullptr);
2604 }
2605
2606 MachineBasicBlock &MBB = *MI.getParent();
2610 Register Dst = MI.getOperand(0).getReg();
2611 unsigned Part = 0;
2612 MachineInstr *Split[2];
2613
2614 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2615 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2616 if (Dst.isPhysical()) {
2617 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2618 } else {
2619 assert(MRI.isSSA());
2620 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2621 MovDPP.addDef(Tmp);
2622 }
2623
2624 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2625 const MachineOperand &SrcOp = MI.getOperand(I);
2626 assert(!SrcOp.isFPImm());
2627 if (SrcOp.isImm()) {
2628 APInt Imm(64, SrcOp.getImm());
2629 Imm.ashrInPlace(Part * 32);
2630 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2631 } else {
2632 assert(SrcOp.isReg());
2633 Register Src = SrcOp.getReg();
2634 if (Src.isPhysical())
2635 MovDPP.addReg(RI.getSubReg(Src, Sub));
2636 else
2637 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2638 }
2639 }
2640
2641 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2642 MovDPP.addImm(MO.getImm());
2643
2644 Split[Part] = MovDPP;
2645 ++Part;
2646 }
2647
2648 if (Dst.isVirtual())
2649 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2650 .addReg(Split[0]->getOperand(0).getReg())
2651 .addImm(AMDGPU::sub0)
2652 .addReg(Split[1]->getOperand(0).getReg())
2653 .addImm(AMDGPU::sub1);
2654
2655 MI.eraseFromParent();
2656 return std::pair(Split[0], Split[1]);
2657}
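// When a single 64-bit DPP mov cannot be used, the pseudo is split into two
// v_mov_b32_dpp instructions operating on sub0 and sub1 (immediates are split
// per 32-bit half) and, for virtual destinations, recombined with a
// REG_SEQUENCE.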
2658
2659std::optional<DestSourcePair>
2661 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2662 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2663
2664 return std::nullopt;
2665}
2666
2668 MachineOperand &Src0,
2669 unsigned Src0OpName,
2670 MachineOperand &Src1,
2671 unsigned Src1OpName) const {
2672 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2673 if (!Src0Mods)
2674 return false;
2675
2676 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2677 assert(Src1Mods &&
2678 "All commutable instructions have both src0 and src1 modifiers");
2679
2680 int Src0ModsVal = Src0Mods->getImm();
2681 int Src1ModsVal = Src1Mods->getImm();
2682
2683 Src1Mods->setImm(Src0ModsVal);
2684 Src0Mods->setImm(Src1ModsVal);
2685 return true;
2686}
2687
2689 MachineOperand &RegOp,
2690 MachineOperand &NonRegOp) {
2691 Register Reg = RegOp.getReg();
2692 unsigned SubReg = RegOp.getSubReg();
2693 bool IsKill = RegOp.isKill();
2694 bool IsDead = RegOp.isDead();
2695 bool IsUndef = RegOp.isUndef();
2696 bool IsDebug = RegOp.isDebug();
2697
2698 if (NonRegOp.isImm())
2699 RegOp.ChangeToImmediate(NonRegOp.getImm());
2700 else if (NonRegOp.isFI())
2701 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2702 else if (NonRegOp.isGlobal()) {
2703 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2704 NonRegOp.getTargetFlags());
2705 } else
2706 return nullptr;
2707
2708 // Make sure we don't reinterpret a subreg index in the target flags.
2709 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2710
2711 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2712 NonRegOp.setSubReg(SubReg);
2713
2714 return &MI;
2715}
2716
2718 unsigned Src0Idx,
2719 unsigned Src1Idx) const {
2720 assert(!NewMI && "this should never be used");
2721
2722 unsigned Opc = MI.getOpcode();
2723 int CommutedOpcode = commuteOpcode(Opc);
2724 if (CommutedOpcode == -1)
2725 return nullptr;
2726
2727 if (Src0Idx > Src1Idx)
2728 std::swap(Src0Idx, Src1Idx);
2729
2730 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2731 static_cast<int>(Src0Idx) &&
2732 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2733 static_cast<int>(Src1Idx) &&
2734 "inconsistency with findCommutedOpIndices");
2735
2736 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2737 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2738
2739 MachineInstr *CommutedMI = nullptr;
2740 if (Src0.isReg() && Src1.isReg()) {
2741 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2742 // Be sure to copy the source modifiers to the right place.
2743 CommutedMI
2744 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2745 }
2746
2747 } else if (Src0.isReg() && !Src1.isReg()) {
2748 // src0 should always be able to support any operand type, so no need to
2749 // check operand legality.
2750 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2751 } else if (!Src0.isReg() && Src1.isReg()) {
2752 if (isOperandLegal(MI, Src1Idx, &Src0))
2753 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2754 } else {
2755 // FIXME: Found two non-register operands to commute. This does happen.
2756 return nullptr;
2757 }
2758
2759 if (CommutedMI) {
2760 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2761 Src1, AMDGPU::OpName::src1_modifiers);
2762
2763 CommutedMI->setDesc(get(CommutedOpcode));
2764 }
2765
2766 return CommutedMI;
2767}
2768
2769// This needs to be implemented because the source modifiers may be inserted
2770// between the true commutable operands, and the base
2771// TargetInstrInfo::commuteInstruction uses it.
2773 unsigned &SrcOpIdx0,
2774 unsigned &SrcOpIdx1) const {
2775 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2776}
2777
2779 unsigned &SrcOpIdx0,
2780 unsigned &SrcOpIdx1) const {
2781 if (!Desc.isCommutable())
2782 return false;
2783
2784 unsigned Opc = Desc.getOpcode();
2785 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2786 if (Src0Idx == -1)
2787 return false;
2788
2789 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2790 if (Src1Idx == -1)
2791 return false;
2792
2793 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2794}
2795
2797 int64_t BrOffset) const {
2798 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2799 // block is unanalyzable.
2800 assert(BranchOp != AMDGPU::S_SETPC_B64);
2801
2802 // Convert to dwords.
2803 BrOffset /= 4;
2804
2805 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2806 // from the next instruction.
2807 BrOffset -= 1;
2808
2809 return isIntN(BranchOffsetBits, BrOffset);
2810}
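// With the hardware's 16-bit SIMM16 branch field this permits displacements of
// roughly +/-128 KiB (32K dwords), measured from the instruction that follows
// the branch.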
2811
2814 return MI.getOperand(0).getMBB();
2815}
2816
2818 for (const MachineInstr &MI : MBB->terminators()) {
2819 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2820 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2821 MI.getOpcode() == AMDGPU::SI_LOOP)
2822 return true;
2823 }
2824 return false;
2825}
2826
2828 MachineBasicBlock &DestBB,
2829 MachineBasicBlock &RestoreBB,
2830 const DebugLoc &DL, int64_t BrOffset,
2831 RegScavenger *RS) const {
2832 assert(RS && "RegScavenger required for long branching");
2833 assert(MBB.empty() &&
2834 "new block should be inserted for expanding unconditional branch");
2835 assert(MBB.pred_size() == 1);
2836 assert(RestoreBB.empty() &&
2837 "restore block should be inserted for restoring clobbered registers");
2838
2842
2843 // FIXME: Virtual register workaround for RegScavenger not working with empty
2844 // blocks.
2845 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2846
2847 auto I = MBB.end();
2848
2849 // We need to compute the offset relative to the instruction immediately after
2850 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2851 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2852
2853 auto &MCCtx = MF->getContext();
2854 MCSymbol *PostGetPCLabel =
2855 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2856 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2857
2858 MCSymbol *OffsetLo =
2859 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2860 MCSymbol *OffsetHi =
2861 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2862 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2863 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2864 .addReg(PCReg, 0, AMDGPU::sub0)
2865 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2866 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2867 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2868 .addReg(PCReg, 0, AMDGPU::sub1)
2869 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2870
2871 // Insert the indirect branch after the other terminator.
2872 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2873 .addReg(PCReg);
2874
2875 // If a spill is needed for the pc register pair, we need to insert a spill
2876 // restore block right before the destination block, and insert a short branch
2877 // into the old destination block's fallthrough predecessor.
2878 // e.g.:
2879 //
2880 // s_cbranch_scc0 skip_long_branch:
2881 //
2882 // long_branch_bb:
2883 // spill s[8:9]
2884 // s_getpc_b64 s[8:9]
2885 // s_add_u32 s8, s8, restore_bb
2886 // s_addc_u32 s9, s9, 0
2887 // s_setpc_b64 s[8:9]
2888 //
2889 // skip_long_branch:
2890 // foo;
2891 //
2892 // .....
2893 //
2894 // dest_bb_fallthrough_predecessor:
2895 // bar;
2896 // s_branch dest_bb
2897 //
2898 // restore_bb:
2899 // restore s[8:9]
2900 // fallthrough dest_bb
2901 //
2902 // dest_bb:
2903 // buzz;
2904
2905 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2906 Register Scav;
2907
2908 // If we've previously reserved a register for long branches,
2909 // avoid running the scavenger and just use that register.
2910 if (LongBranchReservedReg) {
2911 RS->enterBasicBlock(MBB);
2912 Scav = LongBranchReservedReg;
2913 } else {
2915 Scav = RS->scavengeRegisterBackwards(
2916 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2917 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2918 }
2919 if (Scav) {
2920 RS->setRegUsed(Scav);
2921 MRI.replaceRegWith(PCReg, Scav);
2922 MRI.clearVirtRegs();
2923 } else {
2924 // Since spilling an SGPR requires a VGPR, we reuse the slot of the
2925 // temporary VGPR for the SGPR spill.
2926 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2927 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2928 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2929 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2930 MRI.clearVirtRegs();
2931 }
2932
2933 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2934 // Now the distance between DestLabel and PostGetPCLabel can be defined.
2936 MCSymbolRefExpr::create(DestLabel, MCCtx),
2937 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2938 // Add offset assignments.
2939 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2940 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2941 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2942 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2943}
2944
2945unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2946 switch (Cond) {
2947 case SIInstrInfo::SCC_TRUE:
2948 return AMDGPU::S_CBRANCH_SCC1;
2949 case SIInstrInfo::SCC_FALSE:
2950 return AMDGPU::S_CBRANCH_SCC0;
2951 case SIInstrInfo::VCCNZ:
2952 return AMDGPU::S_CBRANCH_VCCNZ;
2953 case SIInstrInfo::VCCZ:
2954 return AMDGPU::S_CBRANCH_VCCZ;
2955 case SIInstrInfo::EXECNZ:
2956 return AMDGPU::S_CBRANCH_EXECNZ;
2957 case SIInstrInfo::EXECZ:
2958 return AMDGPU::S_CBRANCH_EXECZ;
2959 default:
2960 llvm_unreachable("invalid branch predicate");
2961 }
2962}
2963
2964SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
2965 switch (Opcode) {
2966 case AMDGPU::S_CBRANCH_SCC0:
2967 return SCC_FALSE;
2968 case AMDGPU::S_CBRANCH_SCC1:
2969 return SCC_TRUE;
2970 case AMDGPU::S_CBRANCH_VCCNZ:
2971 return VCCNZ;
2972 case AMDGPU::S_CBRANCH_VCCZ:
2973 return VCCZ;
2974 case AMDGPU::S_CBRANCH_EXECNZ:
2975 return EXECNZ;
2976 case AMDGPU::S_CBRANCH_EXECZ:
2977 return EXECZ;
2978 default:
2979 return INVALID_BR;
2980 }
2981}
2982
2986 MachineBasicBlock *&FBB,
2988 bool AllowModify) const {
2989 if (I->getOpcode() == AMDGPU::S_BRANCH) {
2990 // Unconditional Branch
2991 TBB = I->getOperand(0).getMBB();
2992 return false;
2993 }
2994
2995 MachineBasicBlock *CondBB = nullptr;
2996
2997 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
2998 CondBB = I->getOperand(1).getMBB();
2999 Cond.push_back(I->getOperand(0));
3000 } else {
3001 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3002 if (Pred == INVALID_BR)
3003 return true;
3004
3005 CondBB = I->getOperand(0).getMBB();
3006 Cond.push_back(MachineOperand::CreateImm(Pred));
3007 Cond.push_back(I->getOperand(1)); // Save the branch register.
3008 }
3009 ++I;
3010
3011 if (I == MBB.end()) {
3012 // Conditional branch followed by fall-through.
3013 TBB = CondBB;
3014 return false;
3015 }
3016
3017 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3018 TBB = CondBB;
3019 FBB = I->getOperand(0).getMBB();
3020 return false;
3021 }
3022
3023 return true;
3024}
3025
3027 MachineBasicBlock *&FBB,
3029 bool AllowModify) const {
3031 auto E = MBB.end();
3032 if (I == E)
3033 return false;
3034
3035 // Skip over the instructions that are artificial terminators for special
3036 // exec management.
3037 while (I != E && !I->isBranch() && !I->isReturn()) {
3038 switch (I->getOpcode()) {
3039 case AMDGPU::S_MOV_B64_term:
3040 case AMDGPU::S_XOR_B64_term:
3041 case AMDGPU::S_OR_B64_term:
3042 case AMDGPU::S_ANDN2_B64_term:
3043 case AMDGPU::S_AND_B64_term:
3044 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3045 case AMDGPU::S_MOV_B32_term:
3046 case AMDGPU::S_XOR_B32_term:
3047 case AMDGPU::S_OR_B32_term:
3048 case AMDGPU::S_ANDN2_B32_term:
3049 case AMDGPU::S_AND_B32_term:
3050 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3051 break;
3052 case AMDGPU::SI_IF:
3053 case AMDGPU::SI_ELSE:
3054 case AMDGPU::SI_KILL_I1_TERMINATOR:
3055 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3056 // FIXME: It's messy that these need to be considered here at all.
3057 return true;
3058 default:
3059 llvm_unreachable("unexpected non-branch terminator inst");
3060 }
3061
3062 ++I;
3063 }
3064
3065 if (I == E)
3066 return false;
3067
3068 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3069}
3070
3072 int *BytesRemoved) const {
3073 unsigned Count = 0;
3074 unsigned RemovedSize = 0;
3076 // Skip over artificial terminators when removing instructions.
3077 if (MI.isBranch() || MI.isReturn()) {
3078 RemovedSize += getInstSizeInBytes(MI);
3079 MI.eraseFromParent();
3080 ++Count;
3081 }
3082 }
3083
3084 if (BytesRemoved)
3085 *BytesRemoved = RemovedSize;
3086
3087 return Count;
3088}
3089
3090// Copy the flags onto the implicit condition register operand.
3092 const MachineOperand &OrigCond) {
3093 CondReg.setIsUndef(OrigCond.isUndef());
3094 CondReg.setIsKill(OrigCond.isKill());
3095}
3096
3099 MachineBasicBlock *FBB,
3101 const DebugLoc &DL,
3102 int *BytesAdded) const {
3103 if (!FBB && Cond.empty()) {
3104 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3105 .addMBB(TBB);
3106 if (BytesAdded)
3107 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3108 return 1;
3109 }
3110
3111 if (Cond.size() == 1 && Cond[0].isReg()) {
3112 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
3113 .add(Cond[0])
3114 .addMBB(TBB);
3115 return 1;
3116 }
3117
3118 assert(TBB && Cond[0].isImm());
3119
3120 unsigned Opcode
3121 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3122
3123 if (!FBB) {
3124 MachineInstr *CondBr =
3125 BuildMI(&MBB, DL, get(Opcode))
3126 .addMBB(TBB);
3127
3128 // Copy the flags onto the implicit condition register operand.
3129 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3130 fixImplicitOperands(*CondBr);
3131
3132 if (BytesAdded)
3133 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3134 return 1;
3135 }
3136
3137 assert(TBB && FBB);
3138
3139 MachineInstr *CondBr =
3140 BuildMI(&MBB, DL, get(Opcode))
3141 .addMBB(TBB);
3142 fixImplicitOperands(*CondBr);
3143 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3144 .addMBB(FBB);
3145
3146 MachineOperand &CondReg = CondBr->getOperand(1);
3147 CondReg.setIsUndef(Cond[1].isUndef());
3148 CondReg.setIsKill(Cond[1].isKill());
3149
3150 if (BytesAdded)
3151 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3152
3153 return 2;
3154}
3155
3158 if (Cond.size() != 2) {
3159 return true;
3160 }
3161
3162 if (Cond[0].isImm()) {
3163 Cond[0].setImm(-Cond[0].getImm());
3164 return false;
3165 }
3166
3167 return true;
3168}
3169
3172 Register DstReg, Register TrueReg,
3173 Register FalseReg, int &CondCycles,
3174 int &TrueCycles, int &FalseCycles) const {
3175 switch (Cond[0].getImm()) {
3176 case VCCNZ:
3177 case VCCZ: {
3179 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3180 if (MRI.getRegClass(FalseReg) != RC)
3181 return false;
3182
3183 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3184 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3185
3186 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3187 return RI.hasVGPRs(RC) && NumInsts <= 6;
3188 }
3189 case SCC_TRUE:
3190 case SCC_FALSE: {
3191 // FIXME: We could insert for VGPRs if we could replace the original compare
3192 // with a vector one.
3194 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3195 if (MRI.getRegClass(FalseReg) != RC)
3196 return false;
3197
3198 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3199
3200 // Multiples of 8 can do s_cselect_b64
3201 if (NumInsts % 2 == 0)
3202 NumInsts /= 2;
3203
3204 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3205 return RI.isSGPRClass(RC);
3206 }
3207 default:
3208 return false;
3209 }
3210}
3211
3215 Register TrueReg, Register FalseReg) const {
3216 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3217 if (Pred == VCCZ || Pred == SCC_FALSE) {
3218 Pred = static_cast<BranchPredicate>(-Pred);
3219 std::swap(TrueReg, FalseReg);
3220 }
3221
3223 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3224 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3225
3226 if (DstSize == 32) {
3228 if (Pred == SCC_TRUE) {
3229 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3230 .addReg(TrueReg)
3231 .addReg(FalseReg);
3232 } else {
3233 // Instruction's operands are backwards from what is expected.
3234 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3235 .addReg(FalseReg)
3236 .addReg(TrueReg);
3237 }
3238
3239 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3240 return;
3241 }
3242
3243 if (DstSize == 64 && Pred == SCC_TRUE) {
3245 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3246 .addReg(TrueReg)
3247 .addReg(FalseReg);
3248
3249 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3250 return;
3251 }
3252
3253 static const int16_t Sub0_15[] = {
3254 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3255 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3256 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3257 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3258 };
3259
3260 static const int16_t Sub0_15_64[] = {
3261 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3262 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3263 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3264 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3265 };
3266
3267 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3268 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3269 const int16_t *SubIndices = Sub0_15;
3270 int NElts = DstSize / 32;
3271
3272 // 64-bit select is only available for SALU.
3273 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3274 if (Pred == SCC_TRUE) {
3275 if (NElts % 2) {
3276 SelOp = AMDGPU::S_CSELECT_B32;
3277 EltRC = &AMDGPU::SGPR_32RegClass;
3278 } else {
3279 SelOp = AMDGPU::S_CSELECT_B64;
3280 EltRC = &AMDGPU::SGPR_64RegClass;
3281 SubIndices = Sub0_15_64;
3282 NElts /= 2;
3283 }
3284 }
3285
3287 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3288
3289 I = MIB->getIterator();
3290
3292 for (int Idx = 0; Idx != NElts; ++Idx) {
3293 Register DstElt = MRI.createVirtualRegister(EltRC);
3294 Regs.push_back(DstElt);
3295
3296 unsigned SubIdx = SubIndices[Idx];
3297
3299 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3300 Select =
3301 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3302 .addReg(FalseReg, 0, SubIdx)
3303 .addReg(TrueReg, 0, SubIdx);
3304 } else {
3305 Select =
3306 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3307 .addReg(TrueReg, 0, SubIdx)
3308 .addReg(FalseReg, 0, SubIdx);
3309 }
3310
3311 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3313
3314 MIB.addReg(DstElt)
3315 .addImm(SubIdx);
3316 }
3317}
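// Wide selects are emitted element by element: SCC-based selects use
// s_cselect_b64 per 64-bit pair when the element count is even (s_cselect_b32
// otherwise), VCC-based selects use one v_cndmask_b32 per 32-bit element, and
// the pieces are combined with a REG_SEQUENCE.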
3318
3320 switch (MI.getOpcode()) {
3321 case AMDGPU::V_MOV_B32_e32:
3322 case AMDGPU::V_MOV_B32_e64:
3323 case AMDGPU::V_MOV_B64_PSEUDO:
3324 case AMDGPU::V_MOV_B64_e32:
3325 case AMDGPU::V_MOV_B64_e64:
3326 case AMDGPU::S_MOV_B32:
3327 case AMDGPU::S_MOV_B64:
3328 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3329 case AMDGPU::COPY:
3330 case AMDGPU::WWM_COPY:
3331 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3332 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3333 case AMDGPU::V_ACCVGPR_MOV_B32:
3334 return true;
3335 default:
3336 return false;
3337 }
3338}
3339
3340static constexpr unsigned ModifierOpNames[] = {
3341 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3342 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3343 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3344
3346 unsigned Opc = MI.getOpcode();
3347 for (unsigned Name : reverse(ModifierOpNames)) {
3349 if (Idx >= 0)
3350 MI.removeOperand(Idx);
3351 }
3352}
3353
3355 Register Reg, MachineRegisterInfo *MRI) const {
3356 if (!MRI->hasOneNonDBGUse(Reg))
3357 return false;
3358
3359 switch (DefMI.getOpcode()) {
3360 default:
3361 return false;
3362 case AMDGPU::V_MOV_B64_e32:
3363 case AMDGPU::S_MOV_B64:
3364 case AMDGPU::V_MOV_B64_PSEUDO:
3365 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3366 case AMDGPU::V_MOV_B32_e32:
3367 case AMDGPU::S_MOV_B32:
3368 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3369 break;
3370 }
3371
3372 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3373 assert(ImmOp);
3374 // FIXME: We could handle FrameIndex values here.
3375 if (!ImmOp->isImm())
3376 return false;
3377
3378 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3379 int64_t Imm = ImmOp->getImm();
3380 switch (UseOp.getSubReg()) {
3381 default:
3382 return Imm;
3383 case AMDGPU::sub0:
3384 return Lo_32(Imm);
3385 case AMDGPU::sub1:
3386 return Hi_32(Imm);
3387 case AMDGPU::lo16:
3388 return APInt(16, Imm).getSExtValue();
3389 case AMDGPU::hi16:
3390 return APInt(32, Imm).ashr(16).getSExtValue();
3391 case AMDGPU::sub1_lo16:
3392 return APInt(16, Hi_32(Imm)).getSExtValue();
3393 case AMDGPU::sub1_hi16:
3394 return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
3395 }
3396 };
3397
3398 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3399
3400 unsigned Opc = UseMI.getOpcode();
3401 if (Opc == AMDGPU::COPY) {
3402 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3403
3404 Register DstReg = UseMI.getOperand(0).getReg();
3405 unsigned OpSize = getOpSize(UseMI, 0);
3406 bool Is16Bit = OpSize == 2;
3407 bool Is64Bit = OpSize == 8;
3408 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3409 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3410 : AMDGPU::V_MOV_B32_e32
3411 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3412 : AMDGPU::S_MOV_B32;
3413 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
3414
3415 if (RI.isAGPR(*MRI, DstReg)) {
3416 if (Is64Bit || !isInlineConstant(Imm))
3417 return false;
3418 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3419 }
3420
3421 if (Is16Bit) {
3422 if (isVGPRCopy)
3423 return false; // Do not clobber vgpr_hi16
3424
3425 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3426 return false;
3427
3428 UseMI.getOperand(0).setSubReg(0);
3429 if (DstReg.isPhysical()) {
3430 DstReg = RI.get32BitRegister(DstReg);
3431 UseMI.getOperand(0).setReg(DstReg);
3432 }
3433 assert(UseMI.getOperand(1).getReg().isVirtual());
3434 }
3435
3436 const MCInstrDesc &NewMCID = get(NewOpc);
3437 if (DstReg.isPhysical() &&
3438 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3439 return false;
3440
3441 UseMI.setDesc(NewMCID);
3442 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3443 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3444 return true;
3445 }
3446
3447 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3448 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3449 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3450 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3451 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3452 // Don't fold if we are using source or output modifiers. The new VOP2
3453 // instructions don't have them.
3455 return false;
3456
3457 // If this is a free constant, there's no reason to do this.
3458 // TODO: We could fold this here instead of letting SIFoldOperands do it
3459 // later.
3460 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3461
3462 // Any src operand can be used for the legality check.
3463 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3464 return false;
3465
3466 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3467 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3468 bool IsFMA =
3469 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3470 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3471 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3472 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3473 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3474
3475 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3476 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3477 (Src1->isReg() && Src1->getReg() == Reg)) {
3478 MachineOperand *RegSrc =
3479 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3480 if (!RegSrc->isReg())
3481 return false;
3482 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3483 ST.getConstantBusLimit(Opc) < 2)
3484 return false;
3485
3486 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3487 return false;
3488
3489 // If src2 is also a literal constant then we have to choose which one to
3490 // fold. In general it is better to choose madak so that the other literal
3491 // can be materialized in an sgpr instead of a vgpr:
3492 // s_mov_b32 s0, literal
3493 // v_madak_f32 v0, s0, v0, literal
3494 // Instead of:
3495 // v_mov_b32 v1, literal
3496 // v_madmk_f32 v0, v0, literal, v1
3497 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3498 if (Def && Def->isMoveImmediate() &&
3499 !isInlineConstant(Def->getOperand(1)))
3500 return false;
3501
3502 unsigned NewOpc =
3503 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3504 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3505 : AMDGPU::V_FMAMK_F16)
3506 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3507 if (pseudoToMCOpcode(NewOpc) == -1)
3508 return false;
3509
3510 // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3511 // would also require restricting their register classes. For now
3512 // just bail out.
3513 if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3514 return false;
3515
3516 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3517
3518 // FIXME: This would be a lot easier if we could return a new instruction
3519 // instead of having to modify in place.
3520
3521 Register SrcReg = RegSrc->getReg();
3522 unsigned SrcSubReg = RegSrc->getSubReg();
3523 Src0->setReg(SrcReg);
3524 Src0->setSubReg(SrcSubReg);
3525 Src0->setIsKill(RegSrc->isKill());
3526
3527 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3528 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3529 Opc == AMDGPU::V_FMAC_F16_e64)
3530 UseMI.untieRegOperand(
3531 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3532
3533 Src1->ChangeToImmediate(Imm);
3534
3536 UseMI.setDesc(get(NewOpc));
3537
3538 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3539 if (DeleteDef)
3540 DefMI.eraseFromParent();
3541
3542 return true;
3543 }
3544
3545 // Added part is the constant: Use v_madak_{f16, f32}.
3546 if (Src2->isReg() && Src2->getReg() == Reg) {
3547 if (ST.getConstantBusLimit(Opc) < 2) {
3548 // Not allowed to use constant bus for another operand.
3549 // We can however allow an inline immediate as src0.
3550 bool Src0Inlined = false;
3551 if (Src0->isReg()) {
3552 // Try to inline the constant if possible.
3553 // If the def is an immediate move and this is its only use,
3554 // we save a VGPR here.
3555 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3556 if (Def && Def->isMoveImmediate() &&
3557 isInlineConstant(Def->getOperand(1)) &&
3558 MRI->hasOneUse(Src0->getReg())) {
3559 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3560 Src0Inlined = true;
3561 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3562 RI.isSGPRReg(*MRI, Src0->getReg())) {
3563 return false;
3564 }
3565 // VGPR is okay as Src0 - fallthrough
3566 }
3567
3568 if (Src1->isReg() && !Src0Inlined) {
3569 // We have one slot for inlinable constant so far - try to fill it
3570 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3571 if (Def && Def->isMoveImmediate() &&
3572 isInlineConstant(Def->getOperand(1)) &&
3573 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3574 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3575 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3576 return false;
3577 // VGPR is okay as Src1 - fallthrough
3578 }
3579 }
3580
3581 unsigned NewOpc =
3582 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3583 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3584 : AMDGPU::V_FMAAK_F16)
3585 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3586 if (pseudoToMCOpcode(NewOpc) == -1)
3587 return false;
3588
3589 // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3590 // would also require restricting their register classes. For now
3591 // just bail out.
3592 if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3593 return false;
3594
3595 // FIXME: This would be a lot easier if we could return a new instruction
3596 // instead of having to modify in place.
3597
3598 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3599 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3600 Opc == AMDGPU::V_FMAC_F16_e64)
3601 UseMI.untieRegOperand(
3602 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3603
3604 // ChangingToImmediate adds Src2 back to the instruction.
3605 Src2->ChangeToImmediate(getImmFor(*Src2));
3606
3607 // These come before src2.
3608 removeModOperands(UseMI);
3609 UseMI.setDesc(get(NewOpc));
3610 // It might happen that UseMI was commuted
3611 // and we now have an SGPR as SRC1. If so, two inlined
3612 // constants and an SGPR are illegal.
3613 legalizeOperands(UseMI);
3614
3615 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3616 if (DeleteDef)
3617 DefMI.eraseFromParent();
3618
3619 return true;
3620 }
3621 }
3622
3623 return false;
3624}
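// Illustrative summary (assuming f32 and standard madmk semantics,
// d = s0 * K + s1): the folding above turns a separately materialized literal
// that feeds a MAD/FMA into the madmk/madak (or fmamk/fmaak) forms, e.g.
//   v_mov_b32 v1, 0x42280000
//   v_mad_f32 v0, v2, v1, v3
// can become
//   v_madmk_f32 v0, v2, 0x42280000, v3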
3625
3626 static bool
3627 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3628                            ArrayRef<const MachineOperand *> BaseOps2) {
3629 if (BaseOps1.size() != BaseOps2.size())
3630 return false;
3631 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3632 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3633 return false;
3634 }
3635 return true;
3636}
3637
3638static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
3639 int WidthB, int OffsetB) {
3640 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3641 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3642 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3643 return LowOffset + LowWidth <= HighOffset;
3644}
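// Worked example for the check above: WidthA = 4, OffsetA = 0, WidthB = 8,
// OffsetB = 4 gives LowOffset = 0, LowWidth = 4, and 0 + 4 <= 4, so the two
// accesses are reported as non-overlapping.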
3645
3646bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3647 const MachineInstr &MIb) const {
3648 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3649 int64_t Offset0, Offset1;
3650 LocationSize Dummy0 = 0, Dummy1 = 0;
3651 bool Offset0IsScalable, Offset1IsScalable;
3652 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3653 Dummy0, &RI) ||
3654 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3655 Dummy1, &RI))
3656 return false;
3657
3658 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3659 return false;
3660
3661 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3662 // FIXME: Handle ds_read2 / ds_write2.
3663 return false;
3664 }
3665 unsigned Width0 = MIa.memoperands().front()->getSize();
3666 unsigned Width1 = MIb.memoperands().front()->getSize();
3667 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3668}
3669
3670 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3671                                                   const MachineInstr &MIb) const {
3672 assert(MIa.mayLoadOrStore() &&
3673 "MIa must load from or modify a memory location");
3674 assert(MIb.mayLoadOrStore() &&
3675 "MIb must load from or modify a memory location");
3676
3677 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3678 return false;
3679
3680 // XXX - Can we relax this between address spaces?
3681 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3682 return false;
3683
3684 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3685 return false;
3686
3687 // TODO: Should we check the address space from the MachineMemOperand? That
3688 // would allow us to distinguish objects we know don't alias based on the
3689 // underlying address space, even if it was lowered to a different one,
3690 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3691 // buffer.
3692 if (isDS(MIa)) {
3693 if (isDS(MIb))
3694 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3695
3696 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3697 }
3698
3699 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3700 if (isMUBUF(MIb) || isMTBUF(MIb))
3701 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3702
3703 if (isFLAT(MIb))
3704 return isFLATScratch(MIb);
3705
3706 return !isSMRD(MIb);
3707 }
3708
3709 if (isSMRD(MIa)) {
3710 if (isSMRD(MIb))
3711 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3712
3713 if (isFLAT(MIb))
3714 return isFLATScratch(MIb);
3715
3716 return !isMUBUF(MIb) && !isMTBUF(MIb);
3717 }
3718
3719 if (isFLAT(MIa)) {
3720 if (isFLAT(MIb)) {
3721 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3722 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3723 return true;
3724
3725 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3726 }
3727
3728 return false;
3729 }
3730
3731 return false;
3732}
3733
3734 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3735                            int64_t &Imm, MachineInstr **DefMI = nullptr) {
3736 if (Reg.isPhysical())
3737 return false;
3738 auto *Def = MRI.getUniqueVRegDef(Reg);
3739 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3740 Imm = Def->getOperand(1).getImm();
3741 if (DefMI)
3742 *DefMI = Def;
3743 return true;
3744 }
3745 return false;
3746}
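// Typical use (illustrative): if Reg is defined by a foldable move such as
// "S_MOV_B32 %r, 42", Imm is set to 42 and, when requested, the defining
// instruction is returned through DefMI.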
3747
3748static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3749 MachineInstr **DefMI = nullptr) {
3750 if (!MO->isReg())
3751 return false;
3752 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3753 const MachineRegisterInfo &MRI = MF->getRegInfo();
3754 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3755}
3756
3757 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3758                                 MachineInstr &NewMI) {
3759 if (LV) {
3760 unsigned NumOps = MI.getNumOperands();
3761 for (unsigned I = 1; I < NumOps; ++I) {
3762 MachineOperand &Op = MI.getOperand(I);
3763 if (Op.isReg() && Op.isKill())
3764 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3765 }
3766 }
3767}
3768
3769 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3770                                                  LiveVariables *LV,
3771                                                  LiveIntervals *LIS) const {
3772 MachineBasicBlock &MBB = *MI.getParent();
3773 unsigned Opc = MI.getOpcode();
3774
3775 // Handle MFMA.
3776 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3777 if (NewMFMAOpc != -1) {
3778 MachineInstrBuilder MIB =
3779     BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3780 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3781 MIB.add(MI.getOperand(I));
3782 updateLiveVariables(LV, MI, *MIB);
3783 if (LIS) {
3784 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3785 // SlotIndex of defs needs to be updated when converting to early-clobber
3786 MachineOperand &Def = MIB->getOperand(0);
3787 if (Def.isEarlyClobber() && Def.isReg() &&
3788 LIS->hasInterval(Def.getReg())) {
3789 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3790 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3791 auto &LI = LIS->getInterval(Def.getReg());
3792 auto UpdateDefIndex = [&](LiveRange &LR) {
3793 auto S = LR.find(OldIndex);
3794 if (S != LR.end() && S->start == OldIndex) {
3795 assert(S->valno && S->valno->def == OldIndex);
3796 S->start = NewIndex;
3797 S->valno->def = NewIndex;
3798 }
3799 };
3800 UpdateDefIndex(LI);
3801 for (auto &SR : LI.subranges())
3802 UpdateDefIndex(SR);
3803 }
3804 }
3805 return MIB;
3806 }
3807
3808 if (SIInstrInfo::isWMMA(MI)) {
3809 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3810 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3811 .setMIFlags(MI.getFlags());
3812 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3813 MIB->addOperand(MI.getOperand(I));
3814
3815 updateLiveVariables(LV, MI, *MIB);
3816 if (LIS)
3817 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3818
3819 return MIB;
3820 }
3821
3822 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3823 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3824 "pre-RA");
3825
3826 // Handle MAC/FMAC.
3827 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3828 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3829 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3830 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3831 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3832 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3833 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3834 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3835 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3836 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3837 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3838 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3839 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3840 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3841 bool Src0Literal = false;
3842
3843 switch (Opc) {
3844 default:
3845 return nullptr;
3846 case AMDGPU::V_MAC_F16_e64:
3847 case AMDGPU::V_FMAC_F16_e64:
3848 case AMDGPU::V_FMAC_F16_t16_e64:
3849 case AMDGPU::V_MAC_F32_e64:
3850 case AMDGPU::V_MAC_LEGACY_F32_e64:
3851 case AMDGPU::V_FMAC_F32_e64:
3852 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3853 case AMDGPU::V_FMAC_F64_e64:
3854 break;
3855 case AMDGPU::V_MAC_F16_e32:
3856 case AMDGPU::V_FMAC_F16_e32:
3857 case AMDGPU::V_MAC_F32_e32:
3858 case AMDGPU::V_MAC_LEGACY_F32_e32:
3859 case AMDGPU::V_FMAC_F32_e32:
3860 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3861 case AMDGPU::V_FMAC_F64_e32: {
3862 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3863 AMDGPU::OpName::src0);
3864 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3865 if (!Src0->isReg() && !Src0->isImm())
3866 return nullptr;
3867
3868 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3869 Src0Literal = true;
3870
3871 break;
3872 }
3873 }
3874
3875 MachineInstrBuilder MIB;
3876 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3877 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3878 const MachineOperand *Src0Mods =
3879 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3880 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3881 const MachineOperand *Src1Mods =
3882 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3883 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3884 const MachineOperand *Src2Mods =
3885 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3886 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3887 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3888 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3889
3890 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3891 !IsLegacy &&
3892 // If we have an SGPR input, we will violate the constant bus restriction.
3893 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3894 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3895 MachineInstr *DefMI;
3896 const auto killDef = [&]() -> void {
3897 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3898 // The only user is the instruction which will be killed.
3899 Register DefReg = DefMI->getOperand(0).getReg();
3900 if (!MRI.hasOneNonDBGUse(DefReg))
3901 return;
3902 // We cannot just remove the DefMI here, calling pass will crash.
3903 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3904 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3905 DefMI->removeOperand(I);
3906 if (LV)
3907 LV->getVarInfo(DefReg).AliveBlocks.clear();
3908 };
3909
3910 int64_t Imm;
3911 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3912 unsigned NewOpc =
3913 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3914 : AMDGPU::V_FMAAK_F16)
3915 : AMDGPU::V_FMAAK_F32)
3916 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3917 if (pseudoToMCOpcode(NewOpc) != -1) {
3918 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3919 .add(*Dst)
3920 .add(*Src0)
3921 .add(*Src1)
3922 .addImm(Imm);
3923 updateLiveVariables(LV, MI, *MIB);
3924 if (LIS)
3925 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3926 killDef();
3927 return MIB;
3928 }
3929 }
3930 unsigned NewOpc =
3931 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3932 : AMDGPU::V_FMAMK_F16)
3933 : AMDGPU::V_FMAMK_F32)
3934 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3935 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3936 if (pseudoToMCOpcode(NewOpc) != -1) {
3937 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3938 .add(*Dst)
3939 .add(*Src0)
3940 .addImm(Imm)
3941 .add(*Src2);
3942 updateLiveVariables(LV, MI, *MIB);
3943 if (LIS)
3944 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3945 killDef();
3946 return MIB;
3947 }
3948 }
3949 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
3950 if (Src0Literal) {
3951 Imm = Src0->getImm();
3952 DefMI = nullptr;
3953 }
3954 if (pseudoToMCOpcode(NewOpc) != -1 &&
3955     isOperandLegal(
3956         MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
3957 Src1)) {
3958 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3959 .add(*Dst)
3960 .add(*Src1)
3961 .addImm(Imm)
3962 .add(*Src2);
3963 updateLiveVariables(LV, MI, *MIB);
3964 if (LIS)
3965 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3966 if (DefMI)
3967 killDef();
3968 return MIB;
3969 }
3970 }
3971 }
3972
3973 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
3974 // if VOP3 does not allow a literal operand.
3975 if (Src0Literal && !ST.hasVOP3Literal())
3976 return nullptr;
3977
3978 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
3979 : IsF64 ? AMDGPU::V_FMA_F64_e64
3980 : IsLegacy
3981 ? AMDGPU::V_FMA_LEGACY_F32_e64
3982 : AMDGPU::V_FMA_F32_e64
3983 : IsF16 ? AMDGPU::V_MAD_F16_e64
3984 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
3985 : AMDGPU::V_MAD_F32_e64;
3986 if (pseudoToMCOpcode(NewOpc) == -1)
3987 return nullptr;
3988
3989 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3990 .add(*Dst)
3991 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
3992 .add(*Src0)
3993 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
3994 .add(*Src1)
3995 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
3996 .add(*Src2)
3997 .addImm(Clamp ? Clamp->getImm() : 0)
3998 .addImm(Omod ? Omod->getImm() : 0);
3999 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4000 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4001 updateLiveVariables(LV, MI, *MIB);
4002 if (LIS)
4003 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4004 return MIB;
4005}
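// Illustrative effect of the conversion above (assuming f32 and no modifiers):
// the two-address "V_MAC_F32_e32 $dst, $src0, $src1", which also reads $dst as
// the accumulator, becomes the three-address
// "V_MAD_F32_e64 $dst, $src0, $src1, $src2" with the old accumulator as $src2.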
4006
4007// It's not generally safe to move VALU instructions across these since it will
4008// start using the register as a base index rather than directly.
4009// XXX - Why isn't hasSideEffects sufficient for these?
4010 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4011 switch (MI.getOpcode()) {
4012 case AMDGPU::S_SET_GPR_IDX_ON:
4013 case AMDGPU::S_SET_GPR_IDX_MODE:
4014 case AMDGPU::S_SET_GPR_IDX_OFF:
4015 return true;
4016 default:
4017 return false;
4018 }
4019}
4020
4021 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4022                                        const MachineBasicBlock *MBB,
4023                                        const MachineFunction &MF) const {
4024 // Skipping the check for SP writes in the base implementation. The reason it
4025 // was added was apparently due to compile time concerns.
4026 //
4027 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4028 // but is probably avoidable.
4029
4030 // Copied from base implementation.
4031 // Terminators and labels can't be scheduled around.
4032 if (MI.isTerminator() || MI.isPosition())
4033 return true;
4034
4035 // INLINEASM_BR can jump to another block
4036 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4037 return true;
4038
4039 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4040 return true;
4041
4042 // Target-independent instructions do not have an implicit-use of EXEC, even
4043 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4044 // boundaries prevents incorrect movements of such instructions.
4045 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4046 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4047 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4048 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4049          changesVGPRIndexingMode(MI);
4050}
4051
4052 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4053 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4054}
4055
4056 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4057 // Skip the full operand and register alias search modifiesRegister
4058 // does. There's only a handful of instructions that touch this, it's only an
4059 // implicit def, and doesn't alias any other registers.
4060 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4061}
4062
4063 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4064 unsigned Opcode = MI.getOpcode();
4065
4066 if (MI.mayStore() && isSMRD(MI))
4067 return true; // scalar store or atomic
4068
4069 // This will terminate the function when other lanes may need to continue.
4070 if (MI.isReturn())
4071 return true;
4072
4073 // These instructions cause shader I/O that may cause hardware lockups
4074 // when executed with an empty EXEC mask.
4075 //
4076 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4077 // EXEC = 0, but checking for that case here seems not worth it
4078 // given the typical code patterns.
4079 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4080 isEXP(Opcode) ||
4081 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
4082 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
4083 return true;
4084
4085 if (MI.isCall() || MI.isInlineAsm())
4086 return true; // conservative assumption
4087
4088 // A mode change is a scalar operation that influences vector instructions.
4089 if (modifiesModeRegister(MI))
4090 return true;
4091
4092 // These are like SALU instructions in terms of effects, so it's questionable
4093 // whether we should return true for those.
4094 //
4095 // However, executing them with EXEC = 0 causes them to operate on undefined
4096 // data, which we avoid by returning true here.
4097 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4098 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4099 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4100 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4101 return true;
4102
4103 return false;
4104}
4105
4106 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4107                               const MachineInstr &MI) const {
4108 if (MI.isMetaInstruction())
4109 return false;
4110
4111 // This won't read exec if this is an SGPR->SGPR copy.
4112 if (MI.isCopyLike()) {
4113 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4114 return true;
4115
4116 // Make sure this isn't copying exec as a normal operand
4117 return MI.readsRegister(AMDGPU::EXEC, &RI);
4118 }
4119
4120 // Make a conservative assumption about the callee.
4121 if (MI.isCall())
4122 return true;
4123
4124 // Be conservative with any unhandled generic opcodes.
4125 if (!isTargetSpecificOpcode(MI.getOpcode()))
4126 return true;
4127
4128 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4129}
4130
4131bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4132 switch (Imm.getBitWidth()) {
4133 case 1: // This likely will be a condition code mask.
4134 return true;
4135
4136 case 32:
4137 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4138 ST.hasInv2PiInlineImm());
4139 case 64:
4140 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4141 ST.hasInv2PiInlineImm());
4142 case 16:
4143 return ST.has16BitInsts() &&
4144 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4145 ST.hasInv2PiInlineImm());
4146 default:
4147 llvm_unreachable("invalid bitwidth");
4148 }
4149}
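// For reference (illustrative): the inlinable values are the small integers
// -16..64 plus a handful of FP constants (0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0
// and, when supported, 1/(2*pi)), so e.g. an immediate of 63 is inline while
// 65 needs a literal.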
4150
4151 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4152 APInt IntImm = Imm.bitcastToAPInt();
4153 int64_t IntImmVal = IntImm.getSExtValue();
4154 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4155 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4156 default:
4157 llvm_unreachable("invalid fltSemantics");
4158 case APFloatBase::S_IEEEsingle:
4159 case APFloatBase::S_IEEEdouble:
4160 return isInlineConstant(IntImm);
4161 case APFloatBase::S_BFloat:
4162 return ST.has16BitInsts() &&
4163 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4164 case APFloatBase::S_IEEEhalf:
4165 return ST.has16BitInsts() &&
4166 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4167 }
4168}
4169
4170 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4171                                    uint8_t OperandType) const {
4172 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4173 if (!MO.isImm())
4174 return false;
4175
4176 // MachineOperand provides no way to tell the true operand size, since it only
4177 // records a 64-bit value. We need to know the size to determine if a 32-bit
4178 // floating point immediate bit pattern is legal for an integer immediate. It
4179 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4180
4181 int64_t Imm = MO.getImm();
4182 switch (OperandType) {
4195 int32_t Trunc = static_cast<int32_t>(Imm);
4196 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4197 }
4203 return AMDGPU::isInlinableLiteral64(MO.getImm(),
4204                                     ST.hasInv2PiInlineImm());
4208 // We would expect inline immediates to not be concerned with an integer/fp
4209 // distinction. However, in the case of 16-bit integer operations, the
4210 // "floating point" values appear to not work. It seems to read only the low
4211 // of 32-bit immediates, which happens to always work for the integer
4212 // values.
4213 //
4214 // See llvm bugzilla 46302.
4215 //
4216 // TODO: Theoretically we could use op-sel to use the high bits of the
4217 // 32-bit FP values.
4235 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4236 // A few special case instructions have 16-bit operands on subtargets
4237 // where 16-bit instructions are not legal.
4238 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4239 // constants in these cases
4240 int16_t Trunc = static_cast<int16_t>(Imm);
4241 return ST.has16BitInsts() &&
4243 }
4244
4245 return false;
4246 }
4251 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4252 int16_t Trunc = static_cast<int16_t>(Imm);
4253 return ST.has16BitInsts() &&
4255 }
4256 return false;
4257 }
4260 return false;
4263 // Always embedded in the instruction for free.
4264 return true;
4274 // Just ignore anything else.
4275 return true;
4276 default:
4277 llvm_unreachable("invalid operand type");
4278 }
4279}
4280
4281static bool compareMachineOp(const MachineOperand &Op0,
4282 const MachineOperand &Op1) {
4283 if (Op0.getType() != Op1.getType())
4284 return false;
4285
4286 switch (Op0.getType()) {
4287 case MachineOperand::MO_Register:
4288 return Op0.getReg() == Op1.getReg();
4289 case MachineOperand::MO_Immediate:
4290 return Op0.getImm() == Op1.getImm();
4291 default:
4292 llvm_unreachable("Didn't expect to be comparing these operand types");
4293 }
4294}
4295
4296 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4297                                     const MachineOperand &MO) const {
4298 const MCInstrDesc &InstDesc = MI.getDesc();
4299 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4300
4301 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4302
4303 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4304 return true;
4305
4306 if (OpInfo.RegClass < 0)
4307 return false;
4308
4309 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4310 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4311 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4312 AMDGPU::OpName::src2))
4313 return false;
4314 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4315 }
4316
4317 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4318 return false;
4319
4320 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4321 return true;
4322
4323 return ST.hasVOP3Literal();
4324}
4325
4326bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4327 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4328 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4329 return false;
4330
4331 int Op32 = AMDGPU::getVOPe32(Opcode);
4332 if (Op32 == -1)
4333 return false;
4334
4335 return pseudoToMCOpcode(Op32) != -1;
4336}
4337
4338bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4339 // The src0_modifier operand is present on all instructions
4340 // that have modifiers.
4341
4342 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4343}
4344
4345 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4346                                   unsigned OpName) const {
4347 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4348 return Mods && Mods->getImm();
4349}
4350
4351 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4352 return any_of(ModifierOpNames,
4353 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4354}
4355
4356 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4357                             const MachineRegisterInfo &MRI) const {
4358 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4359 // Can't shrink instruction with three operands.
4360 if (Src2) {
4361 switch (MI.getOpcode()) {
4362 default: return false;
4363
4364 case AMDGPU::V_ADDC_U32_e64:
4365 case AMDGPU::V_SUBB_U32_e64:
4366 case AMDGPU::V_SUBBREV_U32_e64: {
4367 const MachineOperand *Src1
4368 = getNamedOperand(MI, AMDGPU::OpName::src1);
4369 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4370 return false;
4371 // Additional verification is needed for sdst/src2.
4372 return true;
4373 }
4374 case AMDGPU::V_MAC_F16_e64:
4375 case AMDGPU::V_MAC_F32_e64:
4376 case AMDGPU::V_MAC_LEGACY_F32_e64:
4377 case AMDGPU::V_FMAC_F16_e64:
4378 case AMDGPU::V_FMAC_F16_t16_e64:
4379 case AMDGPU::V_FMAC_F32_e64:
4380 case AMDGPU::V_FMAC_F64_e64:
4381 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4382 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4383 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4384 return false;
4385 break;
4386
4387 case AMDGPU::V_CNDMASK_B32_e64:
4388 break;
4389 }
4390 }
4391
4392 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4393 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4394 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4395 return false;
4396
4397 // We don't need to check src0, all input types are legal, so just make sure
4398 // src0 isn't using any modifiers.
4399 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4400 return false;
4401
4402 // Can it be shrunk to a valid 32 bit opcode?
4403 if (!hasVALU32BitEncoding(MI.getOpcode()))
4404 return false;
4405
4406 // Check output modifiers
4407 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4408 !hasModifiersSet(MI, AMDGPU::OpName::clamp);
4409}
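// Illustrative case (assuming no modifiers are set): "V_ADD_F32_e64 $dst,
// $src0, $src1" with a VGPR $src1 and no omod/clamp satisfies these checks and
// can be shrunk to the 32-bit V_ADD_F32_e32 encoding.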
4410
4411// Set VCC operand with all flags from \p Orig, except for setting it as
4412// implicit.
4413 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4414                                    const MachineOperand &Orig) {
4415
4416 for (MachineOperand &Use : MI.implicit_operands()) {
4417 if (Use.isUse() &&
4418 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4419 Use.setIsUndef(Orig.isUndef());
4420 Use.setIsKill(Orig.isKill());
4421 return;
4422 }
4423 }
4424}
4425
4425 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4426                                            unsigned Op32) const {
4428 MachineBasicBlock *MBB = MI.getParent();
4429 MachineInstrBuilder Inst32 =
4430 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
4431 .setMIFlags(MI.getFlags());
4432
4433 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4434 // For VOPC instructions, this is replaced by an implicit def of vcc.
4435 if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::vdst)) {
4436 // dst
4437 Inst32.add(MI.getOperand(0));
4438 } else if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::sdst)) {
4439 // VOPCX instructions won't be writing to an explicit dst, so this should
4440 // not fail for these instructions.
4441 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
4442 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
4443 "Unexpected case");
4444 }
4445
4446 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
4447
4448 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4449 if (Src1)
4450 Inst32.add(*Src1);
4451
4452 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4453
4454 if (Src2) {
4455 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
4456 if (Op32Src2Idx != -1) {
4457 Inst32.add(*Src2);
4458 } else {
4459 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4460 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4461 // of vcc was already added during the initial BuildMI, but we
4462 // 1) may need to change vcc to vcc_lo to preserve the original register
4463 // 2) have to preserve the original flags.
4464 fixImplicitOperands(*Inst32);
4465 copyFlagsToImplicitVCC(*Inst32, *Src2);
4466 }
4467 }
4468
4469 return Inst32;
4470}
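// Illustrative effect (assuming a VOPC compare): shrinking
// "V_CMP_EQ_U32_e64 $sdst, $src0, $src1" drops the explicit $sdst, and the
// e32 form writes its result to the implicit VCC instead.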
4471
4472 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4473                                   const MachineOperand &MO,
4474                                   const MCOperandInfo &OpInfo) const {
4475 // Literal constants use the constant bus.
4476 if (!MO.isReg())
4477 return !isInlineConstant(MO, OpInfo);
4478
4479 if (!MO.isUse())
4480 return false;
4481
4482 if (MO.getReg().isVirtual())
4483 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4484
4485 // Null is free
4486 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4487 return false;
4488
4489 // SGPRs use the constant bus
4490 if (MO.isImplicit()) {
4491 return MO.getReg() == AMDGPU::M0 ||
4492 MO.getReg() == AMDGPU::VCC ||
4493 MO.getReg() == AMDGPU::VCC_LO;
4494 } else {
4495 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4496 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4497 }
4498}
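// In short (illustrative): SGPRs, M0, VCC and non-inline literals occupy the
// constant bus, while VGPRs, inline constants and the null register do not.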
4499
4500 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4501 for (const MachineOperand &MO : MI.implicit_operands()) {
4502 // We only care about reads.
4503 if (MO.isDef())
4504 continue;
4505
4506 switch (MO.getReg()) {
4507 case AMDGPU::VCC:
4508 case AMDGPU::VCC_LO:
4509 case AMDGPU::VCC_HI:
4510 case AMDGPU::M0:
4511 case AMDGPU::FLAT_SCR:
4512 return MO.getReg();
4513
4514 default:
4515 break;
4516 }
4517 }
4518
4519 return Register();
4520}
4521
4522static bool shouldReadExec(const MachineInstr &MI) {
4523 if (SIInstrInfo::isVALU(MI)) {
4524 switch (MI.getOpcode()) {
4525 case AMDGPU::V_READLANE_B32:
4526 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4527 case AMDGPU::V_WRITELANE_B32:
4528 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4529 return false;
4530 }
4531
4532 return true;
4533 }
4534
4535 if (MI.isPreISelOpcode() ||
4536 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4537      SIInstrInfo::isSALU(MI) ||
4538      SIInstrInfo::isSMRD(MI))
4539 return false;
4540
4541 return true;
4542}
4543
4544static bool isSubRegOf(const SIRegisterInfo &TRI,
4545 const MachineOperand &SuperVec,
4546 const MachineOperand &SubReg) {
4547 if (SubReg.getReg().isPhysical())
4548 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4549
4550 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4551 SubReg.getReg() == SuperVec.getReg();
4552}
4553
4554 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4555                                     StringRef &ErrInfo) const {
4556 uint16_t Opcode = MI.getOpcode();
4557 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4558 return true;
4559
4560 const MachineFunction *MF = MI.getParent()->getParent();
4561 const MachineRegisterInfo &MRI = MF->getRegInfo();
4562
4563 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4564 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4565 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4566 int Src3Idx = -1;
4567 if (Src0Idx == -1) {
4568 // VOPD V_DUAL_* instructions use different operand names.
4569 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4570 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4571 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4572 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4573 }
4574
4575 // Make sure the number of operands is correct.
4576 const MCInstrDesc &Desc = get(Opcode);
4577 if (!Desc.isVariadic() &&
4578 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4579 ErrInfo = "Instruction has wrong number of operands.";
4580 return false;
4581 }
4582
4583 if (MI.isInlineAsm()) {
4584 // Verify register classes for inlineasm constraints.
4585 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4586 I != E; ++I) {
4587 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4588 if (!RC)
4589 continue;
4590
4591 const MachineOperand &Op = MI.getOperand(I);
4592 if (!Op.isReg())
4593 continue;
4594
4595 Register Reg = Op.getReg();
4596 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4597 ErrInfo = "inlineasm operand has incorrect register class.";
4598 return false;
4599 }
4600 }
4601
4602 return true;
4603 }
4604
4605 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4606 ErrInfo = "missing memory operand from image instruction.";
4607 return false;
4608 }
4609
4610 // Make sure the register classes are correct.
4611 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4612 const MachineOperand &MO = MI.getOperand(i);
4613 if (MO.isFPImm()) {
4614 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4615 "all fp values to integers.";
4616 return false;
4617 }
4618
4619 int RegClass = Desc.operands()[i].RegClass;
4620
4621 switch (Desc.operands()[i].OperandType) {
4623 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4624 ErrInfo = "Illegal immediate value for operand.";
4625 return false;
4626 }
4627 break;
4632 break;
4644 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4645 ErrInfo = "Illegal immediate value for operand.";
4646 return false;
4647 }
4648 break;
4649 }
4651 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4652 ErrInfo = "Expected inline constant for operand.";
4653 return false;
4654 }
4655 break;
4658 // Check if this operand is an immediate.
4659 // FrameIndex operands will be replaced by immediates, so they are
4660 // allowed.
4661 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4662 ErrInfo = "Expected immediate, but got non-immediate";
4663 return false;
4664 }
4665 [[fallthrough]];
4666 default:
4667 continue;
4668 }
4669
4670 if (!MO.isReg())
4671 continue;
4672 Register Reg = MO.getReg();
4673 if (!Reg)
4674 continue;
4675
4676 // FIXME: Ideally we would have separate instruction definitions with the
4677 // aligned register constraint.
4678 // FIXME: We do not verify inline asm operands, but custom inline asm
4679 // verification is broken anyway
4680 if (ST.needsAlignedVGPRs()) {
4681 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4682 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4683 const TargetRegisterClass *SubRC =
4684 RI.getSubRegisterClass(RC, MO.getSubReg());
4685 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4686 if (RC)
4687 RC = SubRC;
4688 }
4689
4690 // Check that this is the aligned version of the class.
4691 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4692 ErrInfo = "Subtarget requires even aligned vector registers";
4693 return false;
4694 }
4695 }
4696
4697 if (RegClass != -1) {
4698 if (Reg.isVirtual())
4699 continue;
4700
4701 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4702 if (!RC->contains(Reg)) {
4703 ErrInfo = "Operand has incorrect register class.";
4704 return false;
4705 }
4706 }
4707 }
4708
4709 // Verify SDWA
4710 if (isSDWA(MI)) {
4711 if (!ST.hasSDWA()) {
4712 ErrInfo = "SDWA is not supported on this target";
4713 return false;
4714 }
4715
4716 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4717
4718 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4719 if (OpIdx == -1)
4720 continue;
4721 const MachineOperand &MO = MI.getOperand(OpIdx);
4722
4723 if (!ST.hasSDWAScalar()) {
4724 // Only VGPRS on VI
4725 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4726 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4727 return false;
4728 }
4729 } else {
4730 // No immediates on GFX9
4731 if (!MO.isReg()) {
4732 ErrInfo =
4733 "Only reg allowed as operands in SDWA instructions on GFX9+";
4734 return false;
4735 }
4736 }
4737 }
4738
4739 if (!ST.hasSDWAOmod()) {
4740 // No omod allowed on VI
4741 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4742 if (OMod != nullptr &&
4743 (!OMod->isImm() || OMod->getImm() != 0)) {
4744 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4745 return false;
4746 }
4747 }
4748
4749 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4750 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4751 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4752 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4753 const MachineOperand *Src0ModsMO =
4754 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4755 unsigned Mods = Src0ModsMO->getImm();
4756 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4757 Mods & SISrcMods::SEXT) {
4758 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4759 return false;
4760 }
4761 }
4762
4763 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4764 if (isVOPC(BasicOpcode)) {
4765 if (!ST.hasSDWASdst() && DstIdx != -1) {
4766 // Only vcc allowed as dst on VI for VOPC
4767 const MachineOperand &Dst = MI.getOperand(DstIdx);
4768 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4769 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4770 return false;
4771 }
4772 } else if (!ST.hasSDWAOutModsVOPC()) {
4773 // No clamp allowed on GFX9 for VOPC
4774 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4775 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4776 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4777 return false;
4778 }
4779
4780 // No omod allowed on GFX9 for VOPC
4781 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4782 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4783 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4784 return false;
4785 }
4786 }
4787 }
4788
4789 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4790 if (DstUnused && DstUnused->isImm() &&
4791 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4792 const MachineOperand &Dst = MI.getOperand(DstIdx);
4793 if (!Dst.isReg() || !Dst.isTied()) {
4794 ErrInfo = "Dst register should have tied register";
4795 return false;
4796 }
4797
4798 const MachineOperand &TiedMO =
4799 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4800 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4801 ErrInfo =
4802 "Dst register should be tied to implicit use of preserved register";
4803 return false;
4804 } else if (TiedMO.getReg().isPhysical() &&
4805 Dst.getReg() != TiedMO.getReg()) {
4806 ErrInfo = "Dst register should use same physical register as preserved";
4807 return false;
4808 }
4809 }
4810 }
4811
4812 // Verify MIMG / VIMAGE / VSAMPLE
4813 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4814 // Ensure that the return type used is large enough for all the options
4815 // being used TFE/LWE require an extra result register.
4816 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4817 if (DMask) {
4818 uint64_t DMaskImm = DMask->getImm();
4819 uint32_t RegCount =
4820 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4821 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4822 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4823 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4824
4825 // Adjust for packed 16 bit values
4826 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4827 RegCount = divideCeil(RegCount, 2);
4828
4829 // Adjust if using LWE or TFE
4830 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4831 RegCount += 1;
4832
4833 const uint32_t DstIdx =
4834 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4835 const MachineOperand &Dst = MI.getOperand(DstIdx);
4836 if (Dst.isReg()) {
4837 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4838 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4839 if (RegCount > DstSize) {
4840 ErrInfo = "Image instruction returns too many registers for dst "
4841 "register class";
4842 return false;
4843 }
4844 }
4845 }
4846 }
4847
4848 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4849 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4850 unsigned ConstantBusCount = 0;
4851 bool UsesLiteral = false;
4852 const MachineOperand *LiteralVal = nullptr;
4853
4854 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4855 if (ImmIdx != -1) {
4856 ++ConstantBusCount;
4857 UsesLiteral = true;
4858 LiteralVal = &MI.getOperand(ImmIdx);
4859 }
4860
4861 SmallVector<Register, 2> SGPRsUsed;
4862 Register SGPRUsed;
4863
4864 // Only look at the true operands. Only a real operand can use the constant
4865 // bus, and we don't want to check pseudo-operands like the source modifier
4866 // flags.
4867 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4868 if (OpIdx == -1)
4869 continue;
4870 const MachineOperand &MO = MI.getOperand(OpIdx);
4871 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4872 if (MO.isReg()) {
4873 SGPRUsed = MO.getReg();
4874 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4875 ++ConstantBusCount;
4876 SGPRsUsed.push_back(SGPRUsed);
4877 }
4878 } else {
4879 if (!UsesLiteral) {
4880 ++ConstantBusCount;
4881 UsesLiteral = true;
4882 LiteralVal = &MO;
4883 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4884 assert(isVOP2(MI) || isVOP3(MI));
4885 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4886 return false;
4887 }
4888 }
4889 }
4890 }
4891
4892 SGPRUsed = findImplicitSGPRRead(MI);
4893 if (SGPRUsed) {
4894 // Implicit uses may safely overlap true operands
4895 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4896 return !RI.regsOverlap(SGPRUsed, SGPR);
4897 })) {
4898 ++ConstantBusCount;
4899 SGPRsUsed.push_back(SGPRUsed);
4900 }
4901 }
4902
4903 // v_writelane_b32 is an exception from constant bus restriction:
4904 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
4905 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4906 Opcode != AMDGPU::V_WRITELANE_B32) {
4907 ErrInfo = "VOP* instruction violates constant bus restriction";
4908 return false;
4909 }
4910
4911 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4912 ErrInfo = "VOP3 instruction uses literal";
4913 return false;
4914 }
4915 }
4916
4917 // Special case for writelane - this can break the multiple constant bus rule,
4918 // but still can't use more than one SGPR register
4919 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4920 unsigned SGPRCount = 0;
4921 Register SGPRUsed;
4922
4923 for (int OpIdx : {Src0Idx, Src1Idx}) {
4924 if (OpIdx == -1)
4925 break;
4926
4927 const MachineOperand &MO = MI.getOperand(OpIdx);
4928
4929 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4930 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4931 if (MO.getReg() != SGPRUsed)
4932 ++SGPRCount;
4933 SGPRUsed = MO.getReg();
4934 }
4935 }
4936 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4937 ErrInfo = "WRITELANE instruction violates constant bus restriction";
4938 return false;
4939 }
4940 }
4941 }
4942
4943 // Verify misc. restrictions on specific instructions.
4944 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
4945 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
4946 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4947 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4948 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
4949 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
4950 if (!compareMachineOp(Src0, Src1) &&
4951 !compareMachineOp(Src0, Src2)) {
4952 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
4953 return false;
4954 }
4955 }
4956 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
4957 SISrcMods::ABS) ||
4958 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
4959 SISrcMods::ABS) ||
4960 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
4961 SISrcMods::ABS)) {
4962 ErrInfo = "ABS not allowed in VOP3B instructions";
4963 return false;
4964 }
4965 }
4966
4967 if (isSOP2(MI) || isSOPC(MI)) {
4968 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4969 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4970
4971 if (!Src0.isReg() && !Src1.isReg() &&
4972 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
4973 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
4974 !Src0.isIdenticalTo(Src1)) {
4975 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
4976 return false;
4977 }
4978 }
4979
4980 if (isSOPK(MI)) {
4981 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
4982 if (Desc.isBranch()) {
4983 if (!Op->isMBB()) {
4984 ErrInfo = "invalid branch target for SOPK instruction";
4985 return false;
4986 }
4987 } else {
4988 uint64_t Imm = Op->getImm();
4989 if (sopkIsZext(Opcode)) {
4990 if (!isUInt<16>(Imm)) {
4991 ErrInfo = "invalid immediate for SOPK instruction";
4992 return false;
4993 }
4994 } else {
4995 if (!isInt<16>(Imm)) {
4996 ErrInfo = "invalid immediate for SOPK instruction";
4997 return false;
4998 }
4999 }
5000 }
5001 }
5002
5003 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5004 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5005 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5006 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5007 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5008 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5009
5010 const unsigned StaticNumOps =
5011 Desc.getNumOperands() + Desc.implicit_uses().size();
5012 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5013
5014 // Allow additional implicit operands. This allows a fixup done by the post
5015 // RA scheduler where the main implicit operand is killed and implicit-defs
5016 // are added for sub-registers that remain live after this instruction.
5017 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5018 ErrInfo = "missing implicit register operands";
5019 return false;
5020 }
5021
5022 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5023 if (IsDst) {
5024 if (!Dst->isUse()) {
5025 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5026 return false;
5027 }
5028
5029 unsigned UseOpIdx;
5030 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5031 UseOpIdx != StaticNumOps + 1) {
5032 ErrInfo = "movrel implicit operands should be tied";
5033 return false;
5034 }
5035 }
5036
5037 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5038 const MachineOperand &ImpUse
5039 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5040 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5041 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5042 ErrInfo = "src0 should be subreg of implicit vector use";
5043 return false;
5044 }
5045 }
5046
5047 // Make sure we aren't losing exec uses in the td files. This mostly requires
5048 // being careful when using let Uses to try to add other use registers.
5049 if (shouldReadExec(MI)) {
5050 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5051 ErrInfo = "VALU instruction does not implicitly read exec mask";
5052 return false;
5053 }
5054 }
5055
5056 if (isSMRD(MI)) {
5057 if (MI.mayStore() &&
5058     ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5059 // The register offset form of scalar stores may only use m0 as the
5060 // soffset register.
5061 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5062 if (Soff && Soff->getReg() != AMDGPU::M0) {
5063 ErrInfo = "scalar stores must use m0 as offset register";
5064 return false;
5065 }
5066 }
5067 }
5068
5069 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5070 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5071 if (Offset->getImm() != 0) {
5072 ErrInfo = "subtarget does not support offsets in flat instructions";
5073 return false;
5074 }
5075 }
5076
5077 if (isDS(MI) && !ST.hasGDS()) {
5078 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5079 if (GDSOp && GDSOp->getImm() != 0) {
5080 ErrInfo = "GDS is not supported on this subtarget";
5081 return false;
5082 }
5083 }
5084
5085 if (isImage(MI)) {
5086 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5087 if (DimOp) {
5088 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5089 AMDGPU::OpName::vaddr0);
5090 int RSrcOpName =
5091 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5092 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5093 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5094 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5095     AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5096 const AMDGPU::MIMGDimInfo *Dim =
5097     AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5098
5099 if (!Dim) {
5100 ErrInfo = "dim is out of range";
5101 return false;
5102 }
5103
5104 bool IsA16 = false;
5105 if (ST.hasR128A16()) {
5106 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5107 IsA16 = R128A16->getImm() != 0;
5108 } else if (ST.hasA16()) {
5109 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5110 IsA16 = A16->getImm() != 0;
5111 }
5112
5113 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5114
5115 unsigned AddrWords =
5116 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5117
5118 unsigned VAddrWords;
5119 if (IsNSA) {
5120 VAddrWords = RsrcIdx - VAddr0Idx;
5121 if (ST.hasPartialNSAEncoding() &&
5122 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5123 unsigned LastVAddrIdx = RsrcIdx - 1;
5124 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5125 }
5126 } else {
5127 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5128 if (AddrWords > 12)
5129 AddrWords = 16;
5130 }
5131
5132 if (VAddrWords != AddrWords) {
5133 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5134 << " but got " << VAddrWords << "\n");
5135 ErrInfo = "bad vaddr size";
5136 return false;
5137 }
5138 }
5139 }
5140
5141 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5142 if (DppCt) {
5143 using namespace AMDGPU::DPP;
5144
5145 unsigned DC = DppCt->getImm();
5146 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5147 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5148 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5149 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5150 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5151 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5152 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5153 ErrInfo = "Invalid dpp_ctrl value";
5154 return false;
5155 }
5156 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5157     ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5158 ErrInfo = "Invalid dpp_ctrl value: "
5159 "wavefront shifts are not supported on GFX10+";
5160 return false;
5161 }
5162 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5163     ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5164 ErrInfo = "Invalid dpp_ctrl value: "
5165 "broadcasts are not supported on GFX10+";
5166 return false;
5167 }
5168 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5169     ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5170 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5171 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5172 !ST.hasGFX90AInsts()) {
5173 ErrInfo = "Invalid dpp_ctrl value: "
5174 "row_newbroadcast/row_share is not supported before "
5175 "GFX90A/GFX10";
5176 return false;
5177 } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5178 ErrInfo = "Invalid dpp_ctrl value: "
5179 "row_share and row_xmask are not supported before GFX10";
5180 return false;
5181 }
5182 }
5183
5184 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5185     !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
5186 ErrInfo = "Invalid dpp_ctrl value: "
5187 "DP ALU dpp only support row_newbcast";
5188 return false;
5189 }
5190 }
5191
5192 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5193 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5194 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5195 : AMDGPU::OpName::vdata;
5196 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5197 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5198 if (Data && !Data->isReg())
5199 Data = nullptr;
5200
5201 if (ST.hasGFX90AInsts()) {
5202 if (Dst && Data &&
5203 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5204 ErrInfo = "Invalid register class: "
5205 "vdata and vdst should be both VGPR or AGPR";
5206 return false;
5207 }
5208 if (Data && Data2 &&
5209 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5210 ErrInfo = "Invalid register class: "
5211 "both data operands should be VGPR or AGPR";
5212 return false;
5213 }
5214 } else {
5215 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5216 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5217 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5218 ErrInfo = "Invalid register class: "
5219 "agpr loads and stores not supported on this GPU";
5220 return false;
5221 }
5222 }
5223 }
5224
5225 if (ST.needsAlignedVGPRs()) {
5226 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5227 const MachineOperand *Op = getNamedOperand(MI, OpName);
5228 if (!Op)
5229 return true;
5230 Register Reg = Op->getReg();
5231 if (Reg.isPhysical())
5232 return !(RI.getHWRegIndex(Reg) & 1);
5233 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5234 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5235 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5236 };
5237
5238 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5239 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5240 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5241
5242 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5243 ErrInfo = "Subtarget requires even aligned vector registers "
5244 "for DS_GWS instructions";
5245 return false;
5246 }
5247 }
5248
5249 if (isMIMG(MI)) {
5250 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5251 ErrInfo = "Subtarget requires even aligned vector registers "
5252 "for vaddr operand of image instructions";
5253 return false;
5254 }
5255 }
5256 }
5257
5258 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5259 !ST.hasGFX90AInsts()) {
5260 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5261 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5262 ErrInfo = "Invalid register class: "
5263 "v_accvgpr_write with an SGPR is not supported on this GPU";
5264 return false;
5265 }
5266 }
5267
5268 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5269 const MachineOperand &SrcOp = MI.getOperand(1);
5270 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5271 ErrInfo = "pseudo expects only physical SGPRs";
5272 return false;
5273 }
5274 }
5275
5276 return true;
5277}
5278
5279// It is more readable to list mapped opcodes on the same line.
5280// clang-format off
5281
5282 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5283 switch (MI.getOpcode()) {
5284 default: return AMDGPU::INSTRUCTION_LIST_END;
5285 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5286 case AMDGPU::COPY: return AMDGPU::COPY;
5287 case AMDGPU::PHI: return AMDGPU::PHI;
5288 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5289 case AMDGPU::WQM: return AMDGPU::WQM;
5290 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5291 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5292 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5293 case AMDGPU::S_MOV_B32: {
5294 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5295 return MI.getOperand(1).isReg() ||
5296 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5297 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5298 }
5299 case AMDGPU::S_ADD_I32:
5300 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5301 case AMDGPU::S_ADDC_U32:
5302 return AMDGPU::V_ADDC_U32_e32;
5303 case AMDGPU::S_SUB_I32:
5304 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5305 // FIXME: These are not consistently handled, and selected when the carry is
5306 // used.
5307 case AMDGPU::S_ADD_U32:
5308 return AMDGPU::V_ADD_CO_U32_e32;
5309 case AMDGPU::S_SUB_U32:
5310 return AMDGPU::V_SUB_CO_U32_e32;
5311 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5312 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5313 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5314 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5315 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5316 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5317 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5318 case AMDGPU::S_XNOR_B32:
5319 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5320 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5321 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5322 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5323 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5324 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5325 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5326 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5327 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5328 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5329 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5330 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5331 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5332 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5333 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5334 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5335 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5336 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5337 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5338 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5339 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5340 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5341 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5342 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5343 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5344 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5345 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5346 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5347 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5348 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5349 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5350 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5351 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5352 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5353 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5354 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5355 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5356 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5357 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5358 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5359 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5360 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5361 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5362 case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5363 case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5364 case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
5365 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5366 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5367 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5368 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5369 case AMDGPU::S_CEIL_F16:
5370 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5371 : AMDGPU::V_CEIL_F16_fake16_e64;
5372 case AMDGPU::S_FLOOR_F16:
5373 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5374 : AMDGPU::V_FLOOR_F16_fake16_e64;
5375 case AMDGPU::S_TRUNC_F16:
5376 return AMDGPU::V_TRUNC_F16_fake16_e64;
5377 case AMDGPU::S_RNDNE_F16:
5378 return AMDGPU::V_RNDNE_F16_fake16_e64;
5379 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5380 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5381 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5382 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5383 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5384 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5385 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5386 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5387 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5388 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5389 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5390 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5391 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5392 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5393 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5394 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5395 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
5396 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5397 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5398 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5399 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5400 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5401 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5402 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5403 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5404 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5405 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5406 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5407 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5408 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5409 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5410 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5411 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5412 case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
5413 case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
5414 case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
5415 case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
5416 case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
5417 case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
5418 case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
5419 case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
5420 case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
5421 case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
5422 case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
5423 case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
5424 case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
5425 case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
5426 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5427 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5428 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5429 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5430 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5431 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5432 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5433 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5434 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5435 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5436 }
5437 llvm_unreachable(
5438 "Unexpected scalar opcode without corresponding vector one!");
5439}
5440
5441// clang-format on
5442
5443void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5444 MachineBasicBlock &MBB,
5445 MachineBasicBlock::iterator MBBI,
5446 const DebugLoc &DL, Register Reg,
5447 bool IsSCCLive,
5448 SlotIndexes *Indexes) const {
5449 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5450 const SIInstrInfo *TII = ST.getInstrInfo();
5451 bool IsWave32 = ST.isWave32();
5452 if (IsSCCLive) {
5453 // Insert two move instructions, one to save the original value of EXEC and
5454 // the other to turn on all bits in EXEC. This is required as we can't use
5455 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
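    // On wave64 this expands to "s_mov_b64 <Reg>, exec" followed by
    // "s_mov_b64 exec, -1"; on wave32 the s_mov_b32 forms with exec_lo are
    // used instead.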
5456 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5457 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5458 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5459 .addReg(Exec, RegState::Kill);
5460 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5461 if (Indexes) {
5462 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5463 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5464 }
5465 } else {
5466 const unsigned OrSaveExec =
5467 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5468 auto SaveExec =
5469 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5470 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5471 if (Indexes)
5472 Indexes->insertMachineInstrInMaps(*SaveExec);
5473 }
5474}
5475
5476void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5477 MachineBasicBlock::iterator MBBI,
5478 const DebugLoc &DL, Register Reg,
5479 SlotIndexes *Indexes) const {
5480 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5481 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5482 auto ExecRestoreMI =
5483 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5484 if (Indexes)
5485 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5486}
5487
5488static const TargetRegisterClass *
5489adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5490 const MachineRegisterInfo &MRI,
5491 const MCInstrDesc &TID, unsigned RCID,
5492 bool IsAllocatable) {
5493 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5494 (((TID.mayLoad() || TID.mayStore()) &&
5495 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5496 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5497 switch (RCID) {
5498 case AMDGPU::AV_32RegClassID:
5499 RCID = AMDGPU::VGPR_32RegClassID;
5500 break;
5501 case AMDGPU::AV_64RegClassID:
5502 RCID = AMDGPU::VReg_64RegClassID;
5503 break;
5504 case AMDGPU::AV_96RegClassID:
5505 RCID = AMDGPU::VReg_96RegClassID;
5506 break;
5507 case AMDGPU::AV_128RegClassID:
5508 RCID = AMDGPU::VReg_128RegClassID;
5509 break;
5510 case AMDGPU::AV_160RegClassID:
5511 RCID = AMDGPU::VReg_160RegClassID;
5512 break;
5513 case AMDGPU::AV_512RegClassID:
5514 RCID = AMDGPU::VReg_512RegClassID;
5515 break;
5516 default:
5517 break;
5518 }
5519 }
5520
5521 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5522}
5523
5524const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5525 unsigned OpNum, const TargetRegisterInfo *TRI,
5526 const MachineFunction &MF)
5527 const {
5528 if (OpNum >= TID.getNumOperands())
5529 return nullptr;
5530 auto RegClass = TID.operands()[OpNum].RegClass;
5531 bool IsAllocatable = false;
5532 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5533 // vdst and vdata should both be VGPR or AGPR; the same holds for DS
5534 // instructions with two data operands. Request a register class constrained
5535 // to VGPR only if both operands are present, as Machine Copy Propagation
5536 // (and possibly other passes) cannot check this constraint.
5537 //
5538 // The check is limited to FLAT and DS because atomics in non-flat encoding
5539 // have their vdst and vdata tied to be the same register.
5540 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5541 AMDGPU::OpName::vdst);
5542 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5543 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5544 : AMDGPU::OpName::vdata);
5545 if (DataIdx != -1) {
5546 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5547 TID.Opcode, AMDGPU::OpName::data1);
5548 }
5549 }
5550 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5551 IsAllocatable);
5552}
5553
5554const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5555 unsigned OpNo) const {
5556 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5557 const MCInstrDesc &Desc = get(MI.getOpcode());
5558 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5559 Desc.operands()[OpNo].RegClass == -1) {
5560 Register Reg = MI.getOperand(OpNo).getReg();
5561
5562 if (Reg.isVirtual())
5563 return MRI.getRegClass(Reg);
5564 return RI.getPhysRegBaseClass(Reg);
5565 }
5566
5567 unsigned RCID = Desc.operands()[OpNo].RegClass;
5568 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5569}
5570
5571void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5572 MachineBasicBlock::iterator I = MI;
5573 MachineBasicBlock *MBB = MI.getParent();
5574 MachineOperand &MO = MI.getOperand(OpIdx);
5575 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5576 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5577 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5578 unsigned Size = RI.getRegSizeInBits(*RC);
5579 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
5580 if (MO.isReg())
5581 Opcode = AMDGPU::COPY;
5582 else if (RI.isSGPRClass(RC))
5583 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5584
5585 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5586 Register Reg = MRI.createVirtualRegister(VRC);
5587 DebugLoc DL = MBB->findDebugLoc(I);
5588 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5589 MO.ChangeToRegister(Reg, false);
5590}
5591
5592unsigned SIInstrInfo::buildExtractSubReg(
5593 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
5594 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5595 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5596 MachineBasicBlock *MBB = MI->getParent();
5597 DebugLoc DL = MI->getDebugLoc();
5598 Register SubReg = MRI.createVirtualRegister(SubRC);
5599
5600 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
5601 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5602 .addReg(SuperReg.getReg(), 0, SubIdx);
5603 return SubReg;
5604 }
5605
5606 // Just in case the super register is itself a sub-register, copy it to a new
5607 // value so we don't need to worry about merging its subreg index with the
5608 // SubIdx passed to this function. The register coalescer should be able to
5609 // eliminate this extra copy.
5610 Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
5611
5612 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
5613 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
5614
5615 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5616 .addReg(NewSuperReg, 0, SubIdx);
5617
5618 return SubReg;
5619}
5620
5621MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
5622 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
5623 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5624 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5625 if (Op.isImm()) {
5626 if (SubIdx == AMDGPU::sub0)
5627 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5628 if (SubIdx == AMDGPU::sub1)
5629 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5630
5631 llvm_unreachable("Unhandled register index for immediate");
5632 }
5633
5634 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5635 SubIdx, SubRC);
5636 return MachineOperand::CreateReg(SubReg, false);
5637}
5638
5639// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5640void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5641 assert(Inst.getNumExplicitOperands() == 3);
5642 MachineOperand Op1 = Inst.getOperand(1);
5643 Inst.removeOperand(1);
5644 Inst.addOperand(Op1);
5645}
5646
5647bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5648 const MCOperandInfo &OpInfo,
5649 const MachineOperand &MO) const {
5650 if (!MO.isReg())
5651 return false;
5652
5653 Register Reg = MO.getReg();
5654
5655 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5656 if (Reg.isPhysical())
5657 return DRC->contains(Reg);
5658
5659 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5660
5661 if (MO.getSubReg()) {
5662 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5663 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5664 if (!SuperRC)
5665 return false;
5666
5667 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5668 if (!DRC)
5669 return false;
5670 }
5671 return RC->hasSuperClassEq(DRC);
5672}
5673
5674bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
5675 const MCOperandInfo &OpInfo,
5676 const MachineOperand &MO) const {
5677 if (MO.isReg())
5678 return isLegalRegOperand(MRI, OpInfo, MO);
5679
5680 // Handle non-register types that are treated like immediates.
5681 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5682 return true;
5683}
5684
5685bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5686 const MachineOperand *MO) const {
5687 const MachineFunction &MF = *MI.getParent()->getParent();
5688 const MachineRegisterInfo &MRI = MF.getRegInfo();
5689 const MCInstrDesc &InstDesc = MI.getDesc();
5690 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5691 const TargetRegisterClass *DefinedRC =
5692 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5693 if (!MO)
5694 MO = &MI.getOperand(OpIdx);
5695
5696 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5697 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5698 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5699 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5700 return false;
5701
5702 SmallDenseSet<RegSubRegPair> SGPRsUsed;
5703 if (MO->isReg())
5704 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5705
5706 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5707 if (i == OpIdx)
5708 continue;
5709 const MachineOperand &Op = MI.getOperand(i);
5710 if (Op.isReg()) {
5711 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5712 if (!SGPRsUsed.count(SGPR) &&
5713 // FIXME: This can access off the end of the operands() array.
5714 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5715 if (--ConstantBusLimit <= 0)
5716 return false;
5717 SGPRsUsed.insert(SGPR);
5718 }
5719 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5720 !isInlineConstant(Op, InstDesc.operands()[i])) {
5721 if (!LiteralLimit--)
5722 return false;
5723 if (--ConstantBusLimit <= 0)
5724 return false;
5725 }
5726 }
5727 }
5728
5729 if (MO->isReg()) {
5730 if (!DefinedRC)
5731 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5732 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5733 return false;
5734 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5735 if (IsAGPR && !ST.hasMAIInsts())
5736 return false;
5737 unsigned Opc = MI.getOpcode();
5738 if (IsAGPR &&
5739 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5740 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5741 return false;
5742 // Atomics should have both vdst and vdata either vgpr or agpr.
5743 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5744 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5745 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5746 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5747 MI.getOperand(DataIdx).isReg() &&
5748 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5749 return false;
5750 if ((int)OpIdx == DataIdx) {
5751 if (VDstIdx != -1 &&
5752 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5753 return false;
5754 // DS instructions with 2 src operands also must have tied RC.
5755 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5756 AMDGPU::OpName::data1);
5757 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5758 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5759 return false;
5760 }
5761 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5762 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5763 RI.isSGPRReg(MRI, MO->getReg()))
5764 return false;
5765 return true;
5766 }
5767
5768 if (MO->isImm()) {
5769 uint64_t Imm = MO->getImm();
5770 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5771 bool Is64BitOp = Is64BitFPOp ||
5772 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
5773 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
5774 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
5775 if (Is64BitOp &&
5776 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
5777 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5778 return false;
5779
5780 // FIXME: We can use sign extended 64-bit literals, but only for signed
5781 // operands. At the moment we do not know if an operand is signed.
5782 // Such operand will be encoded as its low 32 bits and then either
5783 // correctly sign extended or incorrectly zero extended by HW.
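      // For example, a 64-bit source value of 0xFFFFFFFF80000000 is emitted as
      // the 32-bit literal 0x80000000; hardware that sign extends it recovers
      // the original value, while hardware that zero extends it yields
      // 0x0000000080000000 instead.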
5784 if (!Is64BitFPOp && (int32_t)Imm < 0)
5785 return false;
5786 }
5787 }
5788
5789 // Handle non-register types that are treated like immediates.
5790 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5791
5792 if (!DefinedRC) {
5793 // This operand expects an immediate.
5794 return true;
5795 }
5796
5797 return isImmOperandLegal(MI, OpIdx, *MO);
5798}
5799
5800void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
5801 MachineInstr &MI) const {
5802 unsigned Opc = MI.getOpcode();
5803 const MCInstrDesc &InstrDesc = get(Opc);
5804
5805 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5806 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5807
5808 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5809 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5810
5811 // If there is an implicit SGPR use such as the VCC use of v_addc_u32/v_subb_u32,
5812 // we may only have one constant bus use before GFX10.
5813 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5814 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5815 RI.isSGPRReg(MRI, Src0.getReg()))
5816 legalizeOpWithMove(MI, Src0Idx);
5817
5818 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5819 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5820 // src0/src1 with V_READFIRSTLANE.
5821 if (Opc == AMDGPU::V_WRITELANE_B32) {
5822 const DebugLoc &DL = MI.getDebugLoc();
5823 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5824 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5825 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5826 .add(Src0);
5827 Src0.ChangeToRegister(Reg, false);
5828 }
5829 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5830 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5831 const DebugLoc &DL = MI.getDebugLoc();
5832 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5833 .add(Src1);
5834 Src1.ChangeToRegister(Reg, false);
5835 }
5836 return;
5837 }
5838
5839 // No VOP2 instructions support AGPRs.
5840 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5841 legalizeOpWithMove(MI, Src0Idx);
5842
5843 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5844 legalizeOpWithMove(MI, Src1Idx);
5845
5846 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5847 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5848 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5849 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5850 legalizeOpWithMove(MI, Src2Idx);
5851 }
5852
5853 // VOP2 src0 instructions support all operand types, so we don't need to check
5854 // their legality. If src1 is already legal, we don't need to do anything.
5855 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5856 return;
5857
5858 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5859 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5860 // select is uniform.
5861 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5862 RI.isVGPR(MRI, Src1.getReg())) {
5863 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5864 const DebugLoc &DL = MI.getDebugLoc();
5865 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5866 .add(Src1);
5867 Src1.ChangeToRegister(Reg, false);
5868 return;
5869 }
5870
5871 // We do not use commuteInstruction here because it is too aggressive and will
5872 // commute if it is possible. We only want to commute here if it improves
5873 // legality. This can be called a fairly large number of times so don't waste
5874 // compile time pointlessly swapping and checking legality again.
5875 if (HasImplicitSGPR || !MI.isCommutable()) {
5876 legalizeOpWithMove(MI, Src1Idx);
5877 return;
5878 }
5879
5880 // If src0 can be used as src1, commuting will make the operands legal.
5881 // Otherwise we have to give up and insert a move.
5882 //
5883 // TODO: Other immediate-like operand kinds could be commuted if there was a
5884 // MachineOperand::ChangeTo* for them.
5885 if ((!Src1.isImm() && !Src1.isReg()) ||
5886 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
5887 legalizeOpWithMove(MI, Src1Idx);
5888 return;
5889 }
5890
5891 int CommutedOpc = commuteOpcode(MI);
5892 if (CommutedOpc == -1) {
5893 legalizeOpWithMove(MI, Src1Idx);
5894 return;
5895 }
5896
5897 MI.setDesc(get(CommutedOpc));
5898
5899 Register Src0Reg = Src0.getReg();
5900 unsigned Src0SubReg = Src0.getSubReg();
5901 bool Src0Kill = Src0.isKill();
5902
5903 if (Src1.isImm())
5904 Src0.ChangeToImmediate(Src1.getImm());
5905 else if (Src1.isReg()) {
5906 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5907 Src0.setSubReg(Src1.getSubReg());
5908 } else
5909 llvm_unreachable("Should only have register or immediate operands");
5910
5911 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5912 Src1.setSubReg(Src0SubReg);
5914}
5915
5916// Legalize VOP3 operands. All operand types are supported for any operand
5917// but only one literal constant and only starting from GFX10.
5918void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
5919 MachineInstr &MI) const {
5920 unsigned Opc = MI.getOpcode();
5921
5922 int VOP3Idx[3] = {
5923 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5924 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5925 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5926 };
5927
5928 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5929 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5930 // src1 and src2 must be scalar
5931 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5932 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5933 const DebugLoc &DL = MI.getDebugLoc();
5934 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5935 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5936 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5937 .add(Src1);
5938 Src1.ChangeToRegister(Reg, false);
5939 }
5940 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5941 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5942 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5943 .add(Src2);
5944 Src2.ChangeToRegister(Reg, false);
5945 }
5946 }
5947
5948 // Find the one SGPR operand we are allowed to use.
5949 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
5950 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
5951 SmallDenseSet<unsigned> SGPRsUsed;
5952 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
5953 if (SGPRReg) {
5954 SGPRsUsed.insert(SGPRReg);
5955 --ConstantBusLimit;
5956 }
5957
5958 for (int Idx : VOP3Idx) {
5959 if (Idx == -1)
5960 break;
5961 MachineOperand &MO = MI.getOperand(Idx);
5962
5963 if (!MO.isReg()) {
5964 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
5965 continue;
5966
5967 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
5968 --LiteralLimit;
5969 --ConstantBusLimit;
5970 continue;
5971 }
5972
5973 --LiteralLimit;
5974 --ConstantBusLimit;
5975 legalizeOpWithMove(MI, Idx);
5976 continue;
5977 }
5978
5979 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
5980 !isOperandLegal(MI, Idx, &MO)) {
5981 legalizeOpWithMove(MI, Idx);
5982 continue;
5983 }
5984
5985 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
5986 continue; // VGPRs are legal
5987
5988 // We can use one SGPR in each VOP3 instruction prior to GFX10
5989 // and two starting from GFX10.
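    // For example, with a constant bus limit of two, the first two distinct
    // SGPR sources are accepted here, and any further SGPR source falls
    // through to the legalizeOpWithMove call below.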
5990 if (SGPRsUsed.count(MO.getReg()))
5991 continue;
5992 if (ConstantBusLimit > 0) {
5993 SGPRsUsed.insert(MO.getReg());
5994 --ConstantBusLimit;
5995 continue;
5996 }
5997
5998 // If we make it this far, then the operand is not legal and we must
5999 // legalize it.
6000 legalizeOpWithMove(MI, Idx);
6001 }
6002
6003 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6004 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6005 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6006 legalizeOpWithMove(MI, VOP3Idx[2]);
6007}
6008
6009Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
6010 MachineRegisterInfo &MRI) const {
6011 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6012 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6013 Register DstReg = MRI.createVirtualRegister(SRC);
6014 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6015
6016 if (RI.hasAGPRs(VRC)) {
6017 VRC = RI.getEquivalentVGPRClass(VRC);
6018 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6019 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6020 get(TargetOpcode::COPY), NewSrcReg)
6021 .addReg(SrcReg);
6022 SrcReg = NewSrcReg;
6023 }
6024
6025 if (SubRegs == 1) {
6026 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6027 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6028 .addReg(SrcReg);
6029 return DstReg;
6030 }
6031
6032 SmallVector<Register, 8> SRegs;
6033 for (unsigned i = 0; i < SubRegs; ++i) {
6034 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6035 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6036 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6037 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6038 SRegs.push_back(SGPR);
6039 }
6040
6041 MachineInstrBuilder MIB =
6042 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6043 get(AMDGPU::REG_SEQUENCE), DstReg);
6044 for (unsigned i = 0; i < SubRegs; ++i) {
6045 MIB.addReg(SRegs[i]);
6046 MIB.addImm(RI.getSubRegFromChannel(i));
6047 }
6048 return DstReg;
6049}
6050
6051void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6052 MachineInstr &MI) const {
6053
6054 // If the pointer is stored in VGPRs, then we need to move it to
6055 // SGPRs using v_readfirstlane. This is safe because we only select
6056 // loads with uniform pointers to SMRD instructions, so we know the
6057 // pointer value is uniform.
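  // As a sketch, a 64-bit VGPR base is rebuilt as an SGPR pair:
  //   %lo = V_READFIRSTLANE_B32 %vbase.sub0
  //   %hi = V_READFIRSTLANE_B32 %vbase.sub1
  //   %sbase = REG_SEQUENCE %lo, sub0, %hi, sub1
  // (register names here are illustrative only).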
6058 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6059 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6060 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6061 SBase->setReg(SGPR);
6062 }
6063 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6064 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6065 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6066 SOff->setReg(SGPR);
6067 }
6068}
6069
6070bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6071 unsigned Opc = Inst.getOpcode();
6072 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6073 if (OldSAddrIdx < 0)
6074 return false;
6075
6077
6078 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6079 if (NewOpc < 0)
6080 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6081 if (NewOpc < 0)
6082 return false;
6083
6084 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6085 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6086 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6087 return false;
6088
6089 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6090 if (NewVAddrIdx < 0)
6091 return false;
6092
6093 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6094
6095 // Check vaddr, it shall be zero or absent.
6096 MachineInstr *VAddrDef = nullptr;
6097 if (OldVAddrIdx >= 0) {
6098 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6099 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6100 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6101 !VAddrDef->getOperand(1).isImm() ||
6102 VAddrDef->getOperand(1).getImm() != 0)
6103 return false;
6104 }
6105
6106 const MCInstrDesc &NewDesc = get(NewOpc);
6107 Inst.setDesc(NewDesc);
6108
6109 // Callers expect iterator to be valid after this call, so modify the
6110 // instruction in place.
6111 if (OldVAddrIdx == NewVAddrIdx) {
6112 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6113 // Clear use list from the old vaddr holding a zero register.
6114 MRI.removeRegOperandFromUseList(&NewVAddr);
6115 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6116 Inst.removeOperand(OldSAddrIdx);
6117 // Update the use list with the pointer we have just moved from vaddr to
6118 // saddr position. Otherwise new vaddr will be missing from the use list.
6119 MRI.removeRegOperandFromUseList(&NewVAddr);
6120 MRI.addRegOperandToUseList(&NewVAddr);
6121 } else {
6122 assert(OldSAddrIdx == NewVAddrIdx);
6123
6124 if (OldVAddrIdx >= 0) {
6125 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6126 AMDGPU::OpName::vdst_in);
6127
6128 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6129 // it asserts. Untie the operands for now and retie them afterwards.
6130 if (NewVDstIn != -1) {
6131 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6132 Inst.untieRegOperand(OldVDstIn);
6133 }
6134
6135 Inst.removeOperand(OldVAddrIdx);
6136
6137 if (NewVDstIn != -1) {
6138 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6139 Inst.tieOperands(NewVDst, NewVDstIn);
6140 }
6141 }
6142 }
6143
6144 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6145 VAddrDef->eraseFromParent();
6146
6147 return true;
6148}
6149
6150// FIXME: Remove this when SelectionDAG is obsoleted.
6151void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6152 MachineInstr &MI) const {
6153 if (!isSegmentSpecificFLAT(MI))
6154 return;
6155
6156 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6157 // thinks they are uniform, so a readfirstlane should be valid.
6158 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6159 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6160 return;
6161
6161
6162 if (moveFlatAddrToVGPR(MI))
6163 return;
6164
6165 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6166 SAddr->setReg(ToSGPR);
6167}
6168
6169void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6170 MachineBasicBlock::iterator I,
6171 const TargetRegisterClass *DstRC,
6172 MachineOperand &Op,
6173 MachineRegisterInfo &MRI,
6174 const DebugLoc &DL) const {
6175 Register OpReg = Op.getReg();
6176 unsigned OpSubReg = Op.getSubReg();
6177
6178 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6179 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6180
6181 // Check if operand is already the correct register class.
6182 if (DstRC == OpRC)
6183 return;
6184
6185 Register DstReg = MRI.createVirtualRegister(DstRC);
6186 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
6187
6188 Op.setReg(DstReg);
6189 Op.setSubReg(0);
6190
6191 MachineInstr *Def = MRI.getVRegDef(OpReg);
6192 if (!Def)
6193 return;
6194
6195 // Try to eliminate the copy if it is copying an immediate value.
6196 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6197 foldImmediate(*Copy, *Def, OpReg, &MRI);
6198
6199 bool ImpDef = Def->isImplicitDef();
6200 while (!ImpDef && Def && Def->isCopy()) {
6201 if (Def->getOperand(1).getReg().isPhysical())
6202 break;
6203 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6204 ImpDef = Def && Def->isImplicitDef();
6205 }
6206 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6207 !ImpDef)
6208 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6209}
6210
6211// Emit the actual waterfall loop, executing the wrapped instruction for each
6212// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6213// iteration, in the worst case we execute 64 (once per lane).
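// Schematically, for a single 32-bit ScalarOp each trip through the loop is:
//   %cur      = V_READFIRSTLANE_B32 %vgpr_op
//   %cond     = V_CMP_EQ_U32 %cur, %vgpr_op
//   %saveexec = S_AND_SAVEEXEC_B32/_B64 %cond
//   ... the wrapped instruction, now reading %cur ...
//   exec      = S_XOR_B32/_B64_term exec, %saveexec
//   SI_WATERFALL_LOOP %LoopBB
// (virtual register names are illustrative only).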
6214static void emitLoadScalarOpsFromVGPRLoop(
6215 const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB,
6216 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
6217 ArrayRef<MachineOperand *> ScalarOps) {
6218 MachineFunction &MF = *OrigBB.getParent();
6219 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6220 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6221 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6222 unsigned SaveExecOpc =
6223 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6224 unsigned XorTermOpc =
6225 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6226 unsigned AndOpc =
6227 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6228 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6229
6230 MachineBasicBlock::iterator I = LoopBB.begin();
6231
6232 SmallVector<Register, 8> ReadlanePieces;
6233 Register CondReg;
6234
6235 for (MachineOperand *ScalarOp : ScalarOps) {
6236 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6237 unsigned NumSubRegs = RegSize / 32;
6238 Register VScalarOp = ScalarOp->getReg();
6239
6240 if (NumSubRegs == 1) {
6241 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6242
6243 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6244 .addReg(VScalarOp);
6245
6246 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6247
6248 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6249 .addReg(CurReg)
6250 .addReg(VScalarOp);
6251
6252 // Combine the comparison results with AND.
6253 if (!CondReg) // First.
6254 CondReg = NewCondReg;
6255 else { // If not the first, we create an AND.
6256 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6257 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6258 .addReg(CondReg)
6259 .addReg(NewCondReg);
6260 CondReg = AndReg;
6261 }
6262
6263 // Update ScalarOp operand to use the SGPR ScalarOp.
6264 ScalarOp->setReg(CurReg);
6265 ScalarOp->setIsKill();
6266 } else {
6267 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6268 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6269 "Unhandled register size");
6270
6271 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6272 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6273 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6274
6275 // Read the next variant <- also loop target.
6276 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6277 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6278
6279 // Read the next variant <- also loop target.
6280 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6281 .addReg(VScalarOp, VScalarOpUndef,
6282 TRI->getSubRegFromChannel(Idx + 1));
6283
6284 ReadlanePieces.push_back(CurRegLo);
6285 ReadlanePieces.push_back(CurRegHi);
6286
6287 // Comparison is to be done as 64-bit.
6288 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6289 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6290 .addReg(CurRegLo)
6291 .addImm(AMDGPU::sub0)
6292 .addReg(CurRegHi)
6293 .addImm(AMDGPU::sub1);
6294
6295 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6296 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6297 NewCondReg)
6298 .addReg(CurReg);
6299 if (NumSubRegs <= 2)
6300 Cmp.addReg(VScalarOp);
6301 else
6302 Cmp.addReg(VScalarOp, VScalarOpUndef,
6303 TRI->getSubRegFromChannel(Idx, 2));
6304
6305 // Combine the comparison results with AND.
6306 if (!CondReg) // First.
6307 CondReg = NewCondReg;
6308 else { // If not the first, we create an AND.
6309 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6310 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6311 .addReg(CondReg)
6312 .addReg(NewCondReg);
6313 CondReg = AndReg;
6314 }
6315 } // End for loop.
6316
6317 auto SScalarOpRC =
6318 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6319 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6320
6321 // Build scalar ScalarOp.
6322 auto Merge =
6323 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6324 unsigned Channel = 0;
6325 for (Register Piece : ReadlanePieces) {
6326 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6327 }
6328
6329 // Update ScalarOp operand to use the SGPR ScalarOp.
6330 ScalarOp->setReg(SScalarOp);
6331 ScalarOp->setIsKill();
6332 }
6333 }
6334
6335 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6336 MRI.setSimpleHint(SaveExec, CondReg);
6337
6338 // Update EXEC to matching lanes, saving original to SaveExec.
6339 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6340 .addReg(CondReg, RegState::Kill);
6341
6342 // The original instruction is here; we insert the terminators after it.
6343 I = BodyBB.end();
6344
6345 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6346 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6347 .addReg(Exec)
6348 .addReg(SaveExec);
6349
6350 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6351}
6352
6353// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6354// with SGPRs by iterating over all unique values across all lanes.
6355// Returns the loop basic block that now contains \p MI.
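// The rewritten control flow is:
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//            ^---------/
// BodyBB branches back to LoopBB until every active lane's ScalarOp value has
// been processed; EXEC and (if needed) SCC are restored in RemainderBB.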
6356static MachineBasicBlock *
6357loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6358 ArrayRef<MachineOperand *> ScalarOps,
6359 MachineDominatorTree *MDT,
6360 MachineBasicBlock::iterator Begin = nullptr,
6361 MachineBasicBlock::iterator End = nullptr) {
6362 MachineBasicBlock &MBB = *MI.getParent();
6363 MachineFunction &MF = *MBB.getParent();
6364 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6365 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6366 MachineRegisterInfo &MRI = MF.getRegInfo();
6367 if (!Begin.isValid())
6368 Begin = &MI;
6369 if (!End.isValid()) {
6370 End = &MI;
6371 ++End;
6372 }
6373 const DebugLoc &DL = MI.getDebugLoc();
6374 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6375 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6376 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6377
6378 // Save SCC. Waterfall Loop may overwrite SCC.
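  // SCC is materialized into an SGPR with S_CSELECT_B32 (1 if SCC is set, 0
  // otherwise) and re-created after the loop with S_CMP_LG_U32 <reg>, 0.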
6379 Register SaveSCCReg;
6380 bool SCCNotDead = (MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI, 30) !=
6381 MachineBasicBlock::LQR_Dead);
6382 if (SCCNotDead) {
6383 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6384 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6385 .addImm(1)
6386 .addImm(0);
6387 }
6388
6389 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6390
6391 // Save the EXEC mask
6392 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6393
6394 // Killed uses in the instruction we are waterfalling around will be
6395 // incorrect due to the added control-flow.
6396 MachineBasicBlock::iterator AfterMI = MI;
6397 ++AfterMI;
6398 for (auto I = Begin; I != AfterMI; I++) {
6399 for (auto &MO : I->all_uses())
6400 MRI.clearKillFlags(MO.getReg());
6401 }
6402
6403 // To insert the loop we need to split the block. Move everything after this
6404 // point to a new block, and insert a new empty block between the two.
6405 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6406 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6407 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6408 MachineFunction::iterator MBBI(MBB);
6409 ++MBBI;
6410
6411 MF.insert(MBBI, LoopBB);
6412 MF.insert(MBBI, BodyBB);
6413 MF.insert(MBBI, RemainderBB);
6414
6415 LoopBB->addSuccessor(BodyBB);
6416 BodyBB->addSuccessor(LoopBB);
6417 BodyBB->addSuccessor(RemainderBB);
6418
6419 // Move the instructions from Begin up to MI into BodyBB, and the
6420 // remainder of the block into RemainderBB.
6421 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6422 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6423 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6424
6425 MBB.addSuccessor(LoopBB);
6426
6427 // Update dominators. We know that MBB immediately dominates LoopBB, that
6428 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6429 // RemainderBB. RemainderBB immediately dominates all of the successors
6430 // transferred to it from MBB that MBB used to properly dominate.
6431 if (MDT) {
6432 MDT->addNewBlock(LoopBB, &MBB);
6433 MDT->addNewBlock(BodyBB, LoopBB);
6434 MDT->addNewBlock(RemainderBB, BodyBB);
6435 for (auto &Succ : RemainderBB->successors()) {
6436 if (MDT->properlyDominates(&MBB, Succ)) {
6437 MDT->changeImmediateDominator(Succ, RemainderBB);
6438 }
6439 }
6440 }
6441
6442 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
6443
6444 MachineBasicBlock::iterator First = RemainderBB->begin();
6445 // Restore SCC
6446 if (SCCNotDead) {
6447 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6448 .addReg(SaveSCCReg, RegState::Kill)
6449 .addImm(0);
6450 }
6451
6452 // Restore the EXEC mask
6453 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6454 return BodyBB;
6455}
6456
6457// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
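// The extracted base pointer is returned in a 64-bit VGPR pair, and the
// replacement descriptor is an SGPR_128 whose base address is zero:
//   NewSRsrc = { 0 (sub0_sub1), RSRC_DATA_FORMAT[31:0], RSRC_DATA_FORMAT[63:32] }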
6458static std::tuple<unsigned, unsigned>
6459extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
6460 MachineBasicBlock &MBB = *MI.getParent();
6461 MachineFunction &MF = *MBB.getParent();
6463
6463 MachineRegisterInfo &MRI = MF.getRegInfo();
6464 // Extract the ptr from the resource descriptor.
6465 unsigned RsrcPtr =
6466 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6467 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6468
6469 // Create an empty resource descriptor
6470 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6471 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6472 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6473 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6474 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6475
6476 // Zero64 = 0
6477 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6478 .addImm(0);
6479
6480 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6481 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6482 .addImm(RsrcDataFormat & 0xFFFFFFFF);
6483
6484 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6485 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6486 .addImm(RsrcDataFormat >> 32);
6487
6488 // NewSRsrc = {Zero64, SRsrcFormat}
6489 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6490 .addReg(Zero64)
6491 .addImm(AMDGPU::sub0_sub1)
6492 .addReg(SRsrcFormatLo)
6493 .addImm(AMDGPU::sub2)
6494 .addReg(SRsrcFormatHi)
6495 .addImm(AMDGPU::sub3);
6496
6497 return std::tuple(RsrcPtr, NewSRsrc);
6498}
6499
6500MachineBasicBlock *
6501SIInstrInfo::legalizeOperands(MachineInstr &MI,
6502 MachineDominatorTree *MDT) const {
6503 MachineFunction &MF = *MI.getParent()->getParent();
6504 MachineRegisterInfo &MRI = MF.getRegInfo();
6505 MachineBasicBlock *CreatedBB = nullptr;
6506
6507 // Legalize VOP2
6508 if (isVOP2(MI) || isVOPC(MI)) {
6509 legalizeOperandsVOP2(MRI, MI);
6510 return CreatedBB;
6511 }
6512
6513 // Legalize VOP3
6514 if (isVOP3(MI)) {
6515 legalizeOperandsVOP3(MRI, MI);
6516 return CreatedBB;
6517 }
6518
6519 // Legalize SMRD
6520 if (isSMRD(MI)) {
6521 legalizeOperandsSMRD(MRI, MI);
6522 return CreatedBB;
6523 }
6524
6525 // Legalize FLAT
6526 if (isFLAT(MI)) {
6527 legalizeOperandsFLAT(MRI, MI);
6528 return CreatedBB;
6529 }
6530
6531 // Legalize REG_SEQUENCE and PHI
6532 // The register class of the operands must be the same type as the register
6533 // class of the output.
6534 if (MI.getOpcode() == AMDGPU::PHI) {
6535 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6536 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6537 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6538 continue;
6539 const TargetRegisterClass *OpRC =
6540 MRI.getRegClass(MI.getOperand(i).getReg());
6541 if (RI.hasVectorRegisters(OpRC)) {
6542 VRC = OpRC;
6543 } else {
6544 SRC = OpRC;
6545 }
6546 }
6547
6548 // If any of the operands are VGPR registers, then they all must be;
6549 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6550 // them.
6551 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6552 if (!VRC) {
6553 assert(SRC);
6554 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6555 VRC = &AMDGPU::VReg_1RegClass;
6556 } else
6557 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6558 ? RI.getEquivalentAGPRClass(SRC)
6559 : RI.getEquivalentVGPRClass(SRC);
6560 } else {
6561 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6562 ? RI.getEquivalentAGPRClass(VRC)
6563 : RI.getEquivalentVGPRClass(VRC);
6564 }
6565 RC = VRC;
6566 } else {
6567 RC = SRC;
6568 }
6569
6570 // Update all the operands so they have the same type.
6571 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6572 MachineOperand &Op = MI.getOperand(I);
6573 if (!Op.isReg() || !Op.getReg().isVirtual())
6574 continue;
6575
6576 // MI is a PHI instruction.
6577 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6578 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6579
6580 // Avoid creating no-op copies with the same src and dst reg class. These
6581 // confuse some of the machine passes.
6582 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6583 }
6584 }
6585
6586 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6587 // VGPR dest type and SGPR sources, insert copies so all operands are
6588 // VGPRs. This seems to help operand folding / the register coalescer.
6589 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6590 MachineBasicBlock *MBB = MI.getParent();
6591 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6592 if (RI.hasVGPRs(DstRC)) {
6593 // Update all the operands so they are VGPR register classes. These may
6594 // not be the same register class because REG_SEQUENCE supports mixing
6595 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6596 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6597 MachineOperand &Op = MI.getOperand(I);
6598 if (!Op.isReg() || !Op.getReg().isVirtual())
6599 continue;
6600
6601 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6602 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6603 if (VRC == OpRC)
6604 continue;
6605
6606 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6607 Op.setIsKill();
6608 }
6609 }
6610
6611 return CreatedBB;
6612 }
6613
6614 // Legalize INSERT_SUBREG
6615 // src0 must have the same register class as dst
6616 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6617 Register Dst = MI.getOperand(0).getReg();
6618 Register Src0 = MI.getOperand(1).getReg();
6619 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6620 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6621 if (DstRC != Src0RC) {
6622 MachineBasicBlock *MBB = MI.getParent();
6623 MachineOperand &Op = MI.getOperand(1);
6624 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6625 }
6626 return CreatedBB;
6627 }
6628
6629 // Legalize SI_INIT_M0
6630 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6631 MachineOperand &Src = MI.getOperand(0);
6632 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6633 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6634 return CreatedBB;
6635 }
6636
6637 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6638 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6639 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6640 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6641 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6642 MI.getOpcode() == AMDGPU::S_WQM_B64) {
6643 MachineOperand &Src = MI.getOperand(1);
6644 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6645 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6646 return CreatedBB;
6647 }
6648
6649 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6650 //
6651 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6652 // scratch memory access. In both cases, the legalization never involves
6653 // conversion to the addr64 form.
6654 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
6655 (isMUBUF(MI) || isMTBUF(MI)))) {
6656 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6657 : AMDGPU::OpName::srsrc;
6658 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6659 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6660 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6661
6662 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6663 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6664 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6665 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6666
6667 return CreatedBB;
6668 }
6669
6670 // Legalize SI_CALL
6671 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6672 MachineOperand *Dest = &MI.getOperand(0);
6673 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6674 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
6675 // following copies; we also need to move copies from and to physical
6676 // registers into the loop block.
6677 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6678 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6679
6680 // Also move the copies to physical registers into the loop block
6681 MachineBasicBlock &MBB = *MI.getParent();
6682 MachineBasicBlock::iterator Start(&MI);
6683 while (Start->getOpcode() != FrameSetupOpcode)
6684 --Start;
6685 MachineBasicBlock::iterator End(&MI);
6686 while (End->getOpcode() != FrameDestroyOpcode)
6687 ++End;
6688 // Also include following copies of the return value
6689 ++End;
6690 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6691 MI.definesRegister(End->getOperand(1).getReg()))
6692 ++End;
6693 CreatedBB =
6694 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6695 }
6696 }
6697
6698 // Legalize s_sleep_var.
6699 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6700 const DebugLoc &DL = MI.getDebugLoc();
6701 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6702 int Src0Idx =
6703 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6704 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6705 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6706 .add(Src0);
6707 Src0.ChangeToRegister(Reg, false);
6708 return nullptr;
6709 }
6710
6711 // Legalize MUBUF instructions.
6712 bool isSoffsetLegal = true;
6713 int SoffsetIdx =
6714 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6715 if (SoffsetIdx != -1) {
6716 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6717 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6718 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6719 isSoffsetLegal = false;
6720 }
6721 }
6722
6723 bool isRsrcLegal = true;
6724 int RsrcIdx =
6725 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6726 if (RsrcIdx != -1) {
6727 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6728 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6729 isRsrcLegal = false;
6730 }
6731 }
6732
6733 // The operands are legal.
6734 if (isRsrcLegal && isSoffsetLegal)
6735 return CreatedBB;
6736
6737 if (!isRsrcLegal) {
6738 // Legalize a VGPR Rsrc
6739 //
6740 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6741 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6742 // a zero-value SRsrc.
6743 //
6744 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6745 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6746 // above.
6747 //
6748 // Otherwise we are on non-ADDR64 hardware, and/or we have
6749 // idxen/offen/bothen and we fall back to a waterfall loop.
6750
6751 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6752 MachineBasicBlock &MBB = *MI.getParent();
6753
6754 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6755 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6756 // This is already an ADDR64 instruction so we need to add the pointer
6757 // extracted from the resource descriptor to the current value of VAddr.
6758 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6759 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6760 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6761
6762 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6763 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6764 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6765
6766 unsigned RsrcPtr, NewSRsrc;
6767 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6768
6769 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6770 const DebugLoc &DL = MI.getDebugLoc();
6771 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6772 .addDef(CondReg0)
6773 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6774 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6775 .addImm(0);
6776
6777 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6778 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6779 .addDef(CondReg1, RegState::Dead)
6780 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6781 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6782 .addReg(CondReg0, RegState::Kill)
6783 .addImm(0);
6784
6785 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6786 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6787 .addReg(NewVAddrLo)
6788 .addImm(AMDGPU::sub0)
6789 .addReg(NewVAddrHi)
6790 .addImm(AMDGPU::sub1);
6791
6792 VAddr->setReg(NewVAddr);
6793 Rsrc->setReg(NewSRsrc);
6794 } else if (!VAddr && ST.hasAddr64()) {
6795 // This instruction is the _OFFSET variant, so we need to convert it to
6796 // ADDR64.
6797 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
6798 "FIXME: Need to emit flat atomics here");
6799
6800 unsigned RsrcPtr, NewSRsrc;
6801 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6802
6803 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6804 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6805 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6806 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6807 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6808
6809 // Atomics with return have an additional tied operand and are
6810 // missing some of the special bits.
6811 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6812 MachineInstr *Addr64;
6813
6814 if (!VDataIn) {
6815 // Regular buffer load / store.
6816 MachineInstrBuilder MIB =
6817 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6818 .add(*VData)
6819 .addReg(NewVAddr)
6820 .addReg(NewSRsrc)
6821 .add(*SOffset)
6822 .add(*Offset);
6823
6824 if (const MachineOperand *CPol =
6825 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6826 MIB.addImm(CPol->getImm());
6827 }
6828
6829 if (const MachineOperand *TFE =
6830 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6831 MIB.addImm(TFE->getImm());
6832 }
6833
6834 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6835
6836 MIB.cloneMemRefs(MI);
6837 Addr64 = MIB;
6838 } else {
6839 // Atomics with return.
6840 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6841 .add(*VData)
6842 .add(*VDataIn)
6843 .addReg(NewVAddr)
6844 .addReg(NewSRsrc)
6845 .add(*SOffset)
6846 .add(*Offset)
6847 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6848 .cloneMemRefs(MI);
6849 }
6850
6851 MI.removeFromParent();
6852
6853 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6854 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6855 NewVAddr)
6856 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6857 .addImm(AMDGPU::sub0)
6858 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6859 .addImm(AMDGPU::sub1);
6860 } else {
6861 // Legalize a VGPR Rsrc and soffset together.
6862 if (!isSoffsetLegal) {
6863 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6864 CreatedBB =
6865 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6866 return CreatedBB;
6867 }
6868 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
6869 return CreatedBB;
6870 }
6871 }
6872
6873 // Legalize a VGPR soffset.
6874 if (!isSoffsetLegal) {
6875 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6876 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
6877 return CreatedBB;
6878 }
6879 return CreatedBB;
6880}
6881
6882 void SIInstrWorklist::insert(MachineInstr *MI) {
6883 InstrList.insert(MI);
6884 // Add MBUF instructions to deferred list.
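// Legalizing a VGPR srsrc may require a waterfall loop that splits the block,
// so these instructions are processed after the main worklist is drained.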
6885 int RsrcIdx =
6886 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6887 if (RsrcIdx != -1) {
6888 DeferredList.insert(MI);
6889 }
6890}
6891
6892 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
6893 return DeferredList.contains(MI);
6894}
6895
6896 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
6897 MachineDominatorTree *MDT) const {
6898
6899 while (!Worklist.empty()) {
6900 MachineInstr &Inst = *Worklist.top();
6901 Worklist.erase_top();
6902 // Skip MachineInstr in the deferred list.
6903 if (Worklist.isDeferred(&Inst))
6904 continue;
6905 moveToVALUImpl(Worklist, MDT, Inst);
6906 }
6907
6908 // The deferred list of instructions is processed once
6909 // all the MachineInstrs in the worklist are done.
6910 for (MachineInstr *Inst : Worklist.getDeferredList()) {
6911 moveToVALUImpl(Worklist, MDT, *Inst);
6912 assert(Worklist.empty() &&
6913 "Deferred MachineInstr are not supposed to re-populate worklist");
6914 }
6915}
6916
6917 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
6918 MachineDominatorTree *MDT,
6919 MachineInstr &Inst) const {
6920
6921 MachineBasicBlock *MBB = Inst.getParent();
6922 if (!MBB)
6923 return;
6924 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6925 unsigned Opcode = Inst.getOpcode();
6926 unsigned NewOpcode = getVALUOp(Inst);
6927 // Handle some special cases
6928 switch (Opcode) {
6929 default:
6930 break;
6931 case AMDGPU::S_ADD_U64_PSEUDO:
6932 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
6933 break;
6934 case AMDGPU::S_SUB_U64_PSEUDO:
6935 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
6936 break;
6937 case AMDGPU::S_ADD_I32:
6938 case AMDGPU::S_SUB_I32: {
6939 // FIXME: The u32 versions currently selected use the carry.
6940 bool Changed;
6941 MachineBasicBlock *CreatedBBTmp = nullptr;
6942 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
6943 if (Changed)
6944 return;
6945
6946 // Default handling
6947 break;
6948 }
6949
6950 case AMDGPU::S_MUL_U64:
6951 // Split s_mul_u64 in 32-bit vector multiplications.
6952 splitScalarSMulU64(Worklist, Inst, MDT);
6953 Inst.eraseFromParent();
6954 return;
6955
6956 case AMDGPU::S_MUL_U64_U32_PSEUDO:
6957 case AMDGPU::S_MUL_I64_I32_PSEUDO:
6958 // This is a special case of s_mul_u64 where all the operands are either
6959 // zero extended or sign extended.
6960 splitScalarSMulPseudo(Worklist, Inst, MDT);
6961 Inst.eraseFromParent();
6962 return;
6963
6964 case AMDGPU::S_AND_B64:
6965 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
6966 Inst.eraseFromParent();
6967 return;
6968
6969 case AMDGPU::S_OR_B64:
6970 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
6971 Inst.eraseFromParent();
6972 return;
6973
6974 case AMDGPU::S_XOR_B64:
6975 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
6976 Inst.eraseFromParent();
6977 return;
6978
6979 case AMDGPU::S_NAND_B64:
6980 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
6981 Inst.eraseFromParent();
6982 return;
6983
6984 case AMDGPU::S_NOR_B64:
6985 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
6986 Inst.eraseFromParent();
6987 return;
6988
6989 case AMDGPU::S_XNOR_B64:
6990 if (ST.hasDLInsts())
6991 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
6992 else
6993 splitScalar64BitXnor(Worklist, Inst, MDT);
6994 Inst.eraseFromParent();
6995 return;
6996
6997 case AMDGPU::S_ANDN2_B64:
6998 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
6999 Inst.eraseFromParent();
7000 return;
7001
7002 case AMDGPU::S_ORN2_B64:
7003 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7004 Inst.eraseFromParent();
7005 return;
7006
7007 case AMDGPU::S_BREV_B64:
7008 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7009 Inst.eraseFromParent();
7010 return;
7011
7012 case AMDGPU::S_NOT_B64:
7013 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7014 Inst.eraseFromParent();
7015 return;
7016
7017 case AMDGPU::S_BCNT1_I32_B64:
7018 splitScalar64BitBCNT(Worklist, Inst);
7019 Inst.eraseFromParent();
7020 return;
7021
7022 case AMDGPU::S_BFE_I64:
7023 splitScalar64BitBFE(Worklist, Inst);
7024 Inst.eraseFromParent();
7025 return;
7026
7027 case AMDGPU::S_FLBIT_I32_B64:
7028 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7029 Inst.eraseFromParent();
7030 return;
7031 case AMDGPU::S_FF1_I32_B64:
7032 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7033 Inst.eraseFromParent();
7034 return;
7035
7036 case AMDGPU::S_LSHL_B32:
7037 if (ST.hasOnlyRevVALUShifts()) {
7038 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7039 swapOperands(Inst);
7040 }
7041 break;
7042 case AMDGPU::S_ASHR_I32:
7043 if (ST.hasOnlyRevVALUShifts()) {
7044 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7045 swapOperands(Inst);
7046 }
7047 break;
7048 case AMDGPU::S_LSHR_B32:
7049 if (ST.hasOnlyRevVALUShifts()) {
7050 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7051 swapOperands(Inst);
7052 }
7053 break;
7054 case AMDGPU::S_LSHL_B64:
7055 if (ST.hasOnlyRevVALUShifts()) {
7056 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7057 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7058 : AMDGPU::V_LSHLREV_B64_e64;
7059 swapOperands(Inst);
7060 }
7061 break;
7062 case AMDGPU::S_ASHR_I64:
7063 if (ST.hasOnlyRevVALUShifts()) {
7064 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7065 swapOperands(Inst);
7066 }
7067 break;
7068 case AMDGPU::S_LSHR_B64:
7069 if (ST.hasOnlyRevVALUShifts()) {
7070 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7071 swapOperands(Inst);
7072 }
7073 break;
7074
7075 case AMDGPU::S_ABS_I32:
7076 lowerScalarAbs(Worklist, Inst);
7077 Inst.eraseFromParent();
7078 return;
7079
7080 case AMDGPU::S_CBRANCH_SCC0:
7081 case AMDGPU::S_CBRANCH_SCC1: {
7082 // Clear unused bits of vcc
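// The condition produced on the VALU may have bits set for inactive lanes, so
// AND it with EXEC before branching on VCC.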
7083 Register CondReg = Inst.getOperand(1).getReg();
7084 bool IsSCC = CondReg == AMDGPU::SCC;
7085 Register VCC = RI.getVCC();
7086 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7087 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7088 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7089 .addReg(EXEC)
7090 .addReg(IsSCC ? VCC : CondReg);
7091 Inst.removeOperand(1);
7092 } break;
7093
7094 case AMDGPU::S_BFE_U64:
7095 case AMDGPU::S_BFM_B64:
7096 llvm_unreachable("Moving this op to VALU not implemented");
7097
7098 case AMDGPU::S_PACK_LL_B32_B16:
7099 case AMDGPU::S_PACK_LH_B32_B16:
7100 case AMDGPU::S_PACK_HL_B32_B16:
7101 case AMDGPU::S_PACK_HH_B32_B16:
7102 movePackToVALU(Worklist, MRI, Inst);
7103 Inst.eraseFromParent();
7104 return;
7105
7106 case AMDGPU::S_XNOR_B32:
7107 lowerScalarXnor(Worklist, Inst);
7108 Inst.eraseFromParent();
7109 return;
7110
7111 case AMDGPU::S_NAND_B32:
7112 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7113 Inst.eraseFromParent();
7114 return;
7115
7116 case AMDGPU::S_NOR_B32:
7117 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7118 Inst.eraseFromParent();
7119 return;
7120
7121 case AMDGPU::S_ANDN2_B32:
7122 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7123 Inst.eraseFromParent();
7124 return;
7125
7126 case AMDGPU::S_ORN2_B32:
7127 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7128 Inst.eraseFromParent();
7129 return;
7130
7131 // TODO: remove as soon as everything is ready
7132 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7133 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7134 // can only be selected from the uniform SDNode.
7135 case AMDGPU::S_ADD_CO_PSEUDO:
7136 case AMDGPU::S_SUB_CO_PSEUDO: {
7137 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7138 ? AMDGPU::V_ADDC_U32_e64
7139 : AMDGPU::V_SUBB_U32_e64;
7140 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7141
7142 Register CarryInReg = Inst.getOperand(4).getReg();
7143 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7144 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7145 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7146 .addReg(CarryInReg);
7147 }
7148
7149 Register CarryOutReg = Inst.getOperand(1).getReg();
7150
7151 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7152 MRI.getRegClass(Inst.getOperand(0).getReg())));
7153 MachineInstr *CarryOp =
7154 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7155 .addReg(CarryOutReg, RegState::Define)
7156 .add(Inst.getOperand(2))
7157 .add(Inst.getOperand(3))
7158 .addReg(CarryInReg)
7159 .addImm(0);
7160 legalizeOperands(*CarryOp);
7161 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7162 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7163 Inst.eraseFromParent();
7164 }
7165 return;
7166 case AMDGPU::S_UADDO_PSEUDO:
7167 case AMDGPU::S_USUBO_PSEUDO: {
7168 const DebugLoc &DL = Inst.getDebugLoc();
7169 MachineOperand &Dest0 = Inst.getOperand(0);
7170 MachineOperand &Dest1 = Inst.getOperand(1);
7171 MachineOperand &Src0 = Inst.getOperand(2);
7172 MachineOperand &Src1 = Inst.getOperand(3);
7173
7174 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7175 ? AMDGPU::V_ADD_CO_U32_e64
7176 : AMDGPU::V_SUB_CO_U32_e64;
7177 const TargetRegisterClass *NewRC =
7178 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7179 Register DestReg = MRI.createVirtualRegister(NewRC);
7180 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7181 .addReg(Dest1.getReg(), RegState::Define)
7182 .add(Src0)
7183 .add(Src1)
7184 .addImm(0); // clamp bit
7185
7186 legalizeOperands(*NewInstr, MDT);
7187 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7188 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7189 Worklist);
7190 Inst.eraseFromParent();
7191 }
7192 return;
7193
7194 case AMDGPU::S_CSELECT_B32:
7195 case AMDGPU::S_CSELECT_B64:
7196 lowerSelect(Worklist, Inst, MDT);
7197 Inst.eraseFromParent();
7198 return;
7199 case AMDGPU::S_CMP_EQ_I32:
7200 case AMDGPU::S_CMP_LG_I32:
7201 case AMDGPU::S_CMP_GT_I32:
7202 case AMDGPU::S_CMP_GE_I32:
7203 case AMDGPU::S_CMP_LT_I32:
7204 case AMDGPU::S_CMP_LE_I32:
7205 case AMDGPU::S_CMP_EQ_U32:
7206 case AMDGPU::S_CMP_LG_U32:
7207 case AMDGPU::S_CMP_GT_U32:
7208 case AMDGPU::S_CMP_GE_U32:
7209 case AMDGPU::S_CMP_LT_U32:
7210 case AMDGPU::S_CMP_LE_U32:
7211 case AMDGPU::S_CMP_EQ_U64:
7212 case AMDGPU::S_CMP_LG_U64:
7213 case AMDGPU::S_CMP_LT_F32:
7214 case AMDGPU::S_CMP_EQ_F32:
7215 case AMDGPU::S_CMP_LE_F32:
7216 case AMDGPU::S_CMP_GT_F32:
7217 case AMDGPU::S_CMP_LG_F32:
7218 case AMDGPU::S_CMP_GE_F32:
7219 case AMDGPU::S_CMP_O_F32:
7220 case AMDGPU::S_CMP_U_F32:
7221 case AMDGPU::S_CMP_NGE_F32:
7222 case AMDGPU::S_CMP_NLG_F32:
7223 case AMDGPU::S_CMP_NGT_F32:
7224 case AMDGPU::S_CMP_NLE_F32:
7225 case AMDGPU::S_CMP_NEQ_F32:
7226 case AMDGPU::S_CMP_NLT_F32:
7227 case AMDGPU::S_CMP_LT_F16:
7228 case AMDGPU::S_CMP_EQ_F16:
7229 case AMDGPU::S_CMP_LE_F16:
7230 case AMDGPU::S_CMP_GT_F16:
7231 case AMDGPU::S_CMP_LG_F16:
7232 case AMDGPU::S_CMP_GE_F16:
7233 case AMDGPU::S_CMP_O_F16:
7234 case AMDGPU::S_CMP_U_F16:
7235 case AMDGPU::S_CMP_NGE_F16:
7236 case AMDGPU::S_CMP_NLG_F16:
7237 case AMDGPU::S_CMP_NGT_F16:
7238 case AMDGPU::S_CMP_NLE_F16:
7239 case AMDGPU::S_CMP_NEQ_F16:
7240 case AMDGPU::S_CMP_NLT_F16: {
7241 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7242 auto NewInstr =
7243 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7244 .setMIFlags(Inst.getFlags());
7245 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7246 AMDGPU::OpName::src0_modifiers) >= 0) {
7247 NewInstr
7248 .addImm(0) // src0_modifiers
7249 .add(Inst.getOperand(0)) // src0
7250 .addImm(0) // src1_modifiers
7251 .add(Inst.getOperand(1)) // src1
7252 .addImm(0); // clamp
7253 } else {
7254 NewInstr
7255 .add(Inst.getOperand(0))
7256 .add(Inst.getOperand(1));
7257 }
7258 legalizeOperands(*NewInstr, MDT);
7259 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
7260 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7261 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7262 Inst.eraseFromParent();
7263 return;
7264 }
7265 case AMDGPU::S_CVT_HI_F32_F16: {
7266 const DebugLoc &DL = Inst.getDebugLoc();
7267 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7268 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7269 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7270 .addImm(16)
7271 .add(Inst.getOperand(1));
7272 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7273 .addImm(0) // src0_modifiers
7274 .addReg(TmpReg)
7275 .addImm(0) // clamp
7276 .addImm(0); // omod
7277
7278 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7279 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7280 Inst.eraseFromParent();
7281 return;
7282 }
7283 case AMDGPU::S_MINIMUM_F32:
7284 case AMDGPU::S_MAXIMUM_F32:
7285 case AMDGPU::S_MINIMUM_F16:
7286 case AMDGPU::S_MAXIMUM_F16: {
7287 const DebugLoc &DL = Inst.getDebugLoc();
7288 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7289 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7290 .addImm(0) // src0_modifiers
7291 .add(Inst.getOperand(1))
7292 .addImm(0) // src1_modifiers
7293 .add(Inst.getOperand(2))
7294 .addImm(0) // clamp
7295 .addImm(0); // omod
7296 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7297
7298 legalizeOperands(*NewInstr, MDT);
7299 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7300 Inst.eraseFromParent();
7301 return;
7302 }
7303 }
7304
7305 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7306 // We cannot move this instruction to the VALU, so we should try to
7307 // legalize its operands instead.
7308 legalizeOperands(Inst, MDT);
7309 return;
7310 }
7311 // Handle converting generic instructions like COPY-to-SGPR into
7312 // COPY-to-VGPR.
7313 if (NewOpcode == Opcode) {
7314 Register DstReg = Inst.getOperand(0).getReg();
7315 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7316
7317 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7318 // hope for the best.
7319 if (Inst.isCopy() && DstReg.isPhysical() &&
7320 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7321 // TODO: Only works for 32 bit registers.
7322 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7323 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7324 .add(Inst.getOperand(1));
7325 Inst.eraseFromParent();
7326 return;
7327 }
7328
7329 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7330 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7331 // Instead of creating a copy where src and dst are the same register
7332 // class, we just replace all uses of dst with src. These kinds of
7333 // copies interfere with the heuristics MachineSink uses to decide
7334 // whether or not to split a critical edge, since the pass assumes
7335 // that copies will end up as machine instructions and not be
7336 // eliminated.
7337 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7338 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7339 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7340 Inst.getOperand(0).setReg(DstReg);
7341 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7342 // these are deleted later, but at -O0 it would leave a suspicious
7343 // looking illegal copy of an undef register.
7344 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7345 Inst.removeOperand(I);
7346 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7347 return;
7348 }
7349 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7350 MRI.replaceRegWith(DstReg, NewDstReg);
7351 legalizeOperands(Inst, MDT);
7352 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7353 return;
7354 }
7355
7356 // Use the new VALU Opcode.
7357 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7358 .setMIFlags(Inst.getFlags());
7359 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7360 // Intersperse VOP3 modifiers among the SALU operands.
7361 NewInstr->addOperand(Inst.getOperand(0));
7362 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7363 AMDGPU::OpName::src0_modifiers) >= 0)
7364 NewInstr.addImm(0);
7365 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7366 MachineOperand Src = Inst.getOperand(1);
7367 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7368 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7369 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7370 else
7371 NewInstr->addOperand(Src);
7372 }
7373
7374 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7375 // We are converting these to a BFE, so we need to add the missing
7376 // operands for the size and offset.
7377 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7378 NewInstr.addImm(0);
7379 NewInstr.addImm(Size);
7380 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7381 // The VALU version adds the second operand to the result, so insert an
7382 // extra 0 operand.
7383 NewInstr.addImm(0);
7384 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7385 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7386 // If we need to move this to VGPRs, we need to unpack the second
7387 // operand back into the 2 separate ones for bit offset and width.
7388 assert(OffsetWidthOp.isImm() &&
7389 "Scalar BFE is only implemented for constant width and offset");
7390 uint32_t Imm = OffsetWidthOp.getImm();
7391
7392 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7393 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
7394 NewInstr.addImm(Offset);
7395 NewInstr.addImm(BitWidth);
7396 } else {
7397 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7398 AMDGPU::OpName::src1_modifiers) >= 0)
7399 NewInstr.addImm(0);
7400 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7401 NewInstr->addOperand(Inst.getOperand(2));
7402 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7403 AMDGPU::OpName::src2_modifiers) >= 0)
7404 NewInstr.addImm(0);
7405 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7406 NewInstr->addOperand(Inst.getOperand(3));
7407 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7408 NewInstr.addImm(0);
7409 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7410 NewInstr.addImm(0);
7411 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7412 NewInstr.addImm(0);
7413 }
7414 } else {
7415 // Just copy the SALU operands.
7416 for (const MachineOperand &Op : Inst.explicit_operands())
7417 NewInstr->addOperand(Op);
7418 }
7419
7420 // Remove any references to SCC. Vector instructions can't read from it, and
7421 // we're just about to add the implicit use / defs of VCC, so we don't want
7422 // both.
7423 for (MachineOperand &Op : Inst.implicit_operands()) {
7424 if (Op.getReg() == AMDGPU::SCC) {
7425 // Only propagate through live-def of SCC.
7426 if (Op.isDef() && !Op.isDead())
7427 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7428 if (Op.isUse())
7429 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7430 }
7431 }
7432 Inst.eraseFromParent();
7433 Register NewDstReg;
7434 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7435 Register DstReg = NewInstr->getOperand(0).getReg();
7436 assert(DstReg.isVirtual());
7437 // Update the destination register class.
7438 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7439 assert(NewDstRC);
7440 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7441 MRI.replaceRegWith(DstReg, NewDstReg);
7442 }
7443 fixImplicitOperands(*NewInstr);
7444 // Legalize the operands
7445 legalizeOperands(*NewInstr, MDT);
7446 if (NewDstReg)
7447 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7448}
7449
7450// Add/sub require special handling to deal with carry outs.
7451std::pair<bool, MachineBasicBlock *>
7452SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7453 MachineDominatorTree *MDT) const {
7454 if (ST.hasAddNoCarry()) {
7455 // Assume there is no user of scc since we don't select this in that case.
7456 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7457 // is used.
7458
7459 MachineBasicBlock &MBB = *Inst.getParent();
7460 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7461
7462 Register OldDstReg = Inst.getOperand(0).getReg();
7463 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7464
7465 unsigned Opc = Inst.getOpcode();
7466 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7467
7468 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7469 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7470
7471 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7472 Inst.removeOperand(3);
7473
7474 Inst.setDesc(get(NewOpc));
7475 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7477 MRI.replaceRegWith(OldDstReg, ResultReg);
7478 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7479
7480 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7481 return std::pair(true, NewBB);
7482 }
7483
7484 return std::pair(false, nullptr);
7485}
7486
7487void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7488 MachineDominatorTree *MDT) const {
7489
7490 MachineBasicBlock &MBB = *Inst.getParent();
7491 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7492 MachineBasicBlock::iterator MII = Inst;
7493 DebugLoc DL = Inst.getDebugLoc();
7494
7495 MachineOperand &Dest = Inst.getOperand(0);
7496 MachineOperand &Src0 = Inst.getOperand(1);
7497 MachineOperand &Src1 = Inst.getOperand(2);
7498 MachineOperand &Cond = Inst.getOperand(3);
7499
7500 Register CondReg = Cond.getReg();
7501 bool IsSCC = (CondReg == AMDGPU::SCC);
7502
7503 // If this is a trivial select where the condition is effectively not SCC
7504 // (CondReg is a source of copy to SCC), then the select is semantically
7505 // equivalent to copying CondReg. Hence, there is no need to create
7506 // V_CNDMASK, we can just use that and bail out.
7507 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7508 (Src1.getImm() == 0)) {
7509 MRI.replaceRegWith(Dest.getReg(), CondReg);
7510 return;
7511 }
7512
7513 Register NewCondReg = CondReg;
7514 if (IsSCC) {
7515 const TargetRegisterClass *TC =
7516 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7517 NewCondReg = MRI.createVirtualRegister(TC);
7518
7519 // Now look for the closest SCC def if it is a copy
7520 // replacing the CondReg with the COPY source register
7521 bool CopyFound = false;
7522 for (MachineInstr &CandI :
7523 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7524 Inst.getParent()->rend())) {
7525 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
7526 -1) {
7527 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7528 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7529 .addReg(CandI.getOperand(1).getReg());
7530 CopyFound = true;
7531 }
7532 break;
7533 }
7534 }
7535 if (!CopyFound) {
7536 // SCC def is not a copy
7537 // Insert a trivial select instead of creating a copy, because a copy from
7538 // SCC would semantically mean just copying a single bit, but we may need
7539 // the result to be a vector condition mask that needs preserving.
7540 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
7541 : AMDGPU::S_CSELECT_B32;
7542 auto NewSelect =
7543 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7544 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7545 }
7546 }
7547
7548 Register NewDestReg = MRI.createVirtualRegister(
7549 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7550 MachineInstr *NewInst;
7551 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7552 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7553 .addImm(0)
7554 .add(Src1) // False
7555 .addImm(0)
7556 .add(Src0) // True
7557 .addReg(NewCondReg);
7558 } else {
7559 NewInst =
7560 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7561 .add(Src1) // False
7562 .add(Src0) // True
7563 .addReg(NewCondReg);
7564 }
7565 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7566 legalizeOperands(*NewInst, MDT);
7567 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7568}
7569
7570void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7571 MachineInstr &Inst) const {
7572 MachineBasicBlock &MBB = *Inst.getParent();
7573 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7574 MachineBasicBlock::iterator MII = Inst;
7575 DebugLoc DL = Inst.getDebugLoc();
7576
7577 MachineOperand &Dest = Inst.getOperand(0);
7578 MachineOperand &Src = Inst.getOperand(1);
7579 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7580 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7581
7582 unsigned SubOp = ST.hasAddNoCarry() ?
7583 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7584
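// abs(x) is computed as max(x, 0 - x).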
7585 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7586 .addImm(0)
7587 .addReg(Src.getReg());
7588
7589 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7590 .addReg(Src.getReg())
7591 .addReg(TmpReg);
7592
7593 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7594 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7595}
7596
7597void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7598 MachineInstr &Inst) const {
7599 MachineBasicBlock &MBB = *Inst.getParent();
7600 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7601 MachineBasicBlock::iterator MII = Inst;
7602 const DebugLoc &DL = Inst.getDebugLoc();
7603
7604 MachineOperand &Dest = Inst.getOperand(0);
7605 MachineOperand &Src0 = Inst.getOperand(1);
7606 MachineOperand &Src1 = Inst.getOperand(2);
7607
7608 if (ST.hasDLInsts()) {
7609 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7610 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7611 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7612
7613 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7614 .add(Src0)
7615 .add(Src1);
7616
7617 MRI.replaceRegWith(Dest.getReg(), NewDest);
7618 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7619 } else {
7620 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7621 // invert either source and then perform the XOR. If either source is a
7622 // scalar register, then we can leave the inversion on the scalar unit to
7623 // achieve a better distribution of scalar and vector instructions.
7624 bool Src0IsSGPR = Src0.isReg() &&
7625 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7626 bool Src1IsSGPR = Src1.isReg() &&
7627 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7628 MachineInstr *Xor;
7629 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7630 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7631
7632 // Build a pair of scalar instructions and add them to the work list.
7633 // The next iteration over the work list will lower these to the vector
7634 // unit as necessary.
7635 if (Src0IsSGPR) {
7636 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7637 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7638 .addReg(Temp)
7639 .add(Src1);
7640 } else if (Src1IsSGPR) {
7641 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7642 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7643 .add(Src0)
7644 .addReg(Temp);
7645 } else {
7646 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7647 .add(Src0)
7648 .add(Src1);
7649 MachineInstr *Not =
7650 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7651 Worklist.insert(Not);
7652 }
7653
7654 MRI.replaceRegWith(Dest.getReg(), NewDest);
7655
7656 Worklist.insert(Xor);
7657
7658 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7659 }
7660}
7661
7662void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7663 MachineInstr &Inst,
7664 unsigned Opcode) const {
7665 MachineBasicBlock &MBB = *Inst.getParent();
7666 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7667 MachineBasicBlock::iterator MII = Inst;
7668 const DebugLoc &DL = Inst.getDebugLoc();
7669
7670 MachineOperand &Dest = Inst.getOperand(0);
7671 MachineOperand &Src0 = Inst.getOperand(1);
7672 MachineOperand &Src1 = Inst.getOperand(2);
7673
7674 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7675 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7676
7677 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7678 .add(Src0)
7679 .add(Src1);
7680
7681 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7682 .addReg(Interm);
7683
7684 Worklist.insert(&Op);
7685 Worklist.insert(&Not);
7686
7687 MRI.replaceRegWith(Dest.getReg(), NewDest);
7688 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7689}
7690
7691void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7692 MachineInstr &Inst,
7693 unsigned Opcode) const {
7694 MachineBasicBlock &MBB = *Inst.getParent();
7695 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7696 MachineBasicBlock::iterator MII = Inst;
7697 const DebugLoc &DL = Inst.getDebugLoc();
7698
7699 MachineOperand &Dest = Inst.getOperand(0);
7700 MachineOperand &Src0 = Inst.getOperand(1);
7701 MachineOperand &Src1 = Inst.getOperand(2);
7702
7703 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7704 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7705
7706 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7707 .add(Src1);
7708
7709 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7710 .add(Src0)
7711 .addReg(Interm);
7712
7713 Worklist.insert(&Not);
7714 Worklist.insert(&Op);
7715
7716 MRI.replaceRegWith(Dest.getReg(), NewDest);
7717 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7718}
7719
7720void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7721 MachineInstr &Inst, unsigned Opcode,
7722 bool Swap) const {
7723 MachineBasicBlock &MBB = *Inst.getParent();
7724 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7725
7726 MachineOperand &Dest = Inst.getOperand(0);
7727 MachineOperand &Src0 = Inst.getOperand(1);
7728 DebugLoc DL = Inst.getDebugLoc();
7729
7730 MachineBasicBlock::iterator MII = Inst;
7731
7732 const MCInstrDesc &InstDesc = get(Opcode);
7733 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7734 MRI.getRegClass(Src0.getReg()) :
7735 &AMDGPU::SGPR_32RegClass;
7736
7737 const TargetRegisterClass *Src0SubRC =
7738 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7739
7740 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7741 AMDGPU::sub0, Src0SubRC);
7742
7743 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7744 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7745 const TargetRegisterClass *NewDestSubRC =
7746 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7747
7748 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7749 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7750
7751 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7752 AMDGPU::sub1, Src0SubRC);
7753
7754 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7755 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7756
7757 if (Swap)
7758 std::swap(DestSub0, DestSub1);
7759
7760 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7761 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7762 .addReg(DestSub0)
7763 .addImm(AMDGPU::sub0)
7764 .addReg(DestSub1)
7765 .addImm(AMDGPU::sub1);
7766
7767 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7768
7769 Worklist.insert(&LoHalf);
7770 Worklist.insert(&HiHalf);
7771
7772 // We don't need to legalizeOperands here because for a single operand, src0
7773 // will support any kind of input.
7774
7775 // Move all users of this moved value.
7776 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7777}
7778
7779// There is not a vector equivalent of s_mul_u64. For this reason, we need to
7780// split the s_mul_u64 in 32-bit vector multiplications.
7781void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7782 MachineInstr &Inst,
7783 MachineDominatorTree *MDT) const {
7784 MachineBasicBlock &MBB = *Inst.getParent();
7785 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7786
7787 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7788 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7789 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7790
7791 MachineOperand &Dest = Inst.getOperand(0);
7792 MachineOperand &Src0 = Inst.getOperand(1);
7793 MachineOperand &Src1 = Inst.getOperand(2);
7794 const DebugLoc &DL = Inst.getDebugLoc();
7795 MachineBasicBlock::iterator MII = Inst;
7796
7797 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7798 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7799 const TargetRegisterClass *Src0SubRC =
7800 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7801 if (RI.isSGPRClass(Src0SubRC))
7802 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7803 const TargetRegisterClass *Src1SubRC =
7804 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7805 if (RI.isSGPRClass(Src1SubRC))
7806 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7807
7808 // First, we extract the low 32-bit and high 32-bit values from each of the
7809 // operands.
7810 MachineOperand Op0L =
7811 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7812 MachineOperand Op1L =
7813 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7814 MachineOperand Op0H =
7815 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7816 MachineOperand Op1H =
7817 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7818
7819 // The multiplication is done as follows:
7820 //
7821 // Op1H Op1L
7822 // * Op0H Op0L
7823 // --------------------
7824 // Op1H*Op0L Op1L*Op0L
7825 // + Op1H*Op0H Op1L*Op0H
7826 // -----------------------------------------
7827 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7828 //
7829 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7830 // value and that would overflow.
7831 // The low 32-bit value is Op1L*Op0L.
7832 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
7833
7834 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7835 MachineInstr *Op1L_Op0H =
7836 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
7837 .add(Op1L)
7838 .add(Op0H);
7839
7840 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7841 MachineInstr *Op1H_Op0L =
7842 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
7843 .add(Op1H)
7844 .add(Op0L);
7845
7846 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7847 MachineInstr *Carry =
7848 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
7849 .add(Op1L)
7850 .add(Op0L);
7851
7852 MachineInstr *LoHalf =
7853 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7854 .add(Op1L)
7855 .add(Op0L);
7856
7857 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7858 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
7859 .addReg(Op1L_Op0H_Reg)
7860 .addReg(Op1H_Op0L_Reg);
7861
7862 MachineInstr *HiHalf =
7863 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
7864 .addReg(AddReg)
7865 .addReg(CarryReg);
7866
7867 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7868 .addReg(DestSub0)
7869 .addImm(AMDGPU::sub0)
7870 .addReg(DestSub1)
7871 .addImm(AMDGPU::sub1);
7872
7873 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7874
7875 // Try to legalize the operands in case we need to swap the order to keep it
7876 // valid.
7877 legalizeOperands(*Op1L_Op0H, MDT);
7878 legalizeOperands(*Op1H_Op0L, MDT);
7879 legalizeOperands(*Carry, MDT);
7880 legalizeOperands(*LoHalf, MDT);
7881 legalizeOperands(*Add, MDT);
7882 legalizeOperands(*HiHalf, MDT);
7883
7884 // Move all users of this moved value.
7885 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7886}
7887
7888// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector
7889// multiplications.
7890void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
7891 MachineInstr &Inst,
7892 MachineDominatorTree *MDT) const {
7893 MachineBasicBlock &MBB = *Inst.getParent();
7894 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7895
7896 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7897 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7898 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7899
7900 MachineOperand &Dest = Inst.getOperand(0);
7901 MachineOperand &Src0 = Inst.getOperand(1);
7902 MachineOperand &Src1 = Inst.getOperand(2);
7903 const DebugLoc &DL = Inst.getDebugLoc();
7904 MachineBasicBlock::iterator MII = Inst;
7905
7906 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7907 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7908 const TargetRegisterClass *Src0SubRC =
7909 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7910 if (RI.isSGPRClass(Src0SubRC))
7911 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7912 const TargetRegisterClass *Src1SubRC =
7913 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7914 if (RI.isSGPRClass(Src1SubRC))
7915 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7916
7917 // First, we extract the low 32-bit and high 32-bit values from each of the
7918 // operands.
7919 MachineOperand Op0L =
7920 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7921 MachineOperand Op1L =
7922 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7923
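// Because both operands are already zero- or sign-extended from 32 bits, only
// the low halves are needed: lo = mul_lo(Op0L, Op1L) and hi = mul_hi (unsigned
// or signed) of the same operands.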
7924 unsigned Opc = Inst.getOpcode();
7925 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
7926 ? AMDGPU::V_MUL_HI_U32_e64
7927 : AMDGPU::V_MUL_HI_I32_e64;
7928 MachineInstr *HiHalf =
7929 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
7930
7931 MachineInstr *LoHalf =
7932 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7933 .add(Op1L)
7934 .add(Op0L);
7935
7936 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7937 .addReg(DestSub0)
7938 .addImm(AMDGPU::sub0)
7939 .addReg(DestSub1)
7940 .addImm(AMDGPU::sub1);
7941
7942 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7943
7944 // Try to legalize the operands in case we need to swap the order to keep it
7945 // valid.
7946 legalizeOperands(*HiHalf, MDT);
7947 legalizeOperands(*LoHalf, MDT);
7948
7949 // Move all users of this moved value.
7950 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7951}
7952
7953void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
7954 MachineInstr &Inst, unsigned Opcode,
7955 MachineDominatorTree *MDT) const {
7956 MachineBasicBlock &MBB = *Inst.getParent();
7957 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7958
7959 MachineOperand &Dest = Inst.getOperand(0);
7960 MachineOperand &Src0 = Inst.getOperand(1);
7961 MachineOperand &Src1 = Inst.getOperand(2);
7962 DebugLoc DL = Inst.getDebugLoc();
7963
7964 MachineBasicBlock::iterator MII = Inst;
7965
7966 const MCInstrDesc &InstDesc = get(Opcode);
7967 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7968 MRI.getRegClass(Src0.getReg()) :
7969 &AMDGPU::SGPR_32RegClass;
7970
7971 const TargetRegisterClass *Src0SubRC =
7972 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7973 const TargetRegisterClass *Src1RC = Src1.isReg() ?
7974 MRI.getRegClass(Src1.getReg()) :
7975 &AMDGPU::SGPR_32RegClass;
7976
7977 const TargetRegisterClass *Src1SubRC =
7978 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7979
7980 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7981 AMDGPU::sub0, Src0SubRC);
7982 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
7983 AMDGPU::sub0, Src1SubRC);
7984 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7985 AMDGPU::sub1, Src0SubRC);
7986 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
7987 AMDGPU::sub1, Src1SubRC);
7988
7989 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7990 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7991 const TargetRegisterClass *NewDestSubRC =
7992 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7993
7994 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7995 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
7996 .add(SrcReg0Sub0)
7997 .add(SrcReg1Sub0);
7998
7999 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8000 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8001 .add(SrcReg0Sub1)
8002 .add(SrcReg1Sub1);
8003
8004 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8005 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8006 .addReg(DestSub0)
8007 .addImm(AMDGPU::sub0)
8008 .addReg(DestSub1)
8009 .addImm(AMDGPU::sub1);
8010
8011 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8012
8013 Worklist.insert(&LoHalf);
8014 Worklist.insert(&HiHalf);
8015
8016 // Move all users of this moved value.
8017 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8018}
8019
8020void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8021 MachineInstr &Inst,
8022 MachineDominatorTree *MDT) const {
8023 MachineBasicBlock &MBB = *Inst.getParent();
8024 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8025
8026 MachineOperand &Dest = Inst.getOperand(0);
8027 MachineOperand &Src0 = Inst.getOperand(1);
8028 MachineOperand &Src1 = Inst.getOperand(2);
8029 const DebugLoc &DL = Inst.getDebugLoc();
8030
8031 MachineBasicBlock::iterator MII = Inst;
8032
8033 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8034
8035 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8036
8037 MachineOperand* Op0;
8038 MachineOperand* Op1;
8039
8040 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8041 Op0 = &Src0;
8042 Op1 = &Src1;
8043 } else {
8044 Op0 = &Src1;
8045 Op1 = &Src0;
8046 }
8047
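// Prefer to apply the NOT to an operand that is known to be an SGPR so the
// S_NOT_B64 can stay on the SALU; only the XOR is queued to be moved to the
// VALU.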
8048 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8049 .add(*Op0);
8050
8051 Register NewDest = MRI.createVirtualRegister(DestRC);
8052
8053 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8054 .addReg(Interm)
8055 .add(*Op1);
8056
8057 MRI.replaceRegWith(Dest.getReg(), NewDest);
8058
8059 Worklist.insert(&Xor);
8060}
8061
8062void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8063 MachineInstr &Inst) const {
8064 MachineBasicBlock &MBB = *Inst.getParent();
8065 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8066
8067 MachineBasicBlock::iterator MII = Inst;
8068 const DebugLoc &DL = Inst.getDebugLoc();
8069
8070 MachineOperand &Dest = Inst.getOperand(0);
8071 MachineOperand &Src = Inst.getOperand(1);
8072
8073 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8074 const TargetRegisterClass *SrcRC = Src.isReg() ?
8075 MRI.getRegClass(Src.getReg()) :
8076 &AMDGPU::SGPR_32RegClass;
8077
8078 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8079 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8080
8081 const TargetRegisterClass *SrcSubRC =
8082 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8083
8084 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8085 AMDGPU::sub0, SrcSubRC);
8086 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8087 AMDGPU::sub1, SrcSubRC);
8088
8089 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8090
8091 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8092
8093 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8094
8095 // We don't need to legalize operands here. src0 for either instruction can be
8096 // an SGPR, and the second input is unused or determined here.
8097 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8098}
8099
8100void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8101 MachineInstr &Inst) const {
8102 MachineBasicBlock &MBB = *Inst.getParent();
8103 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8104 MachineBasicBlock::iterator MII = Inst;
8105 const DebugLoc &DL = Inst.getDebugLoc();
8106
8107 MachineOperand &Dest = Inst.getOperand(0);
8108 uint32_t Imm = Inst.getOperand(2).getImm();
8109 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8110 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8111
8112 (void) Offset;
8113
8114 // Only sext_inreg cases handled.
8115 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8116 Offset == 0 && "Not implemented");
8117
8118 if (BitWidth < 32) {
8119 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8120 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8121 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8122
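// Sign-extend the field out of the low half with V_BFE_I32, then fill the
// high half with copies of the sign bit (arithmetic shift right by 31).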
8123 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8124 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8125 .addImm(0)
8126 .addImm(BitWidth);
8127
8128 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8129 .addImm(31)
8130 .addReg(MidRegLo);
8131
8132 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8133 .addReg(MidRegLo)
8134 .addImm(AMDGPU::sub0)
8135 .addReg(MidRegHi)
8136 .addImm(AMDGPU::sub1);
8137
8138 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8139 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8140 return;
8141 }
8142
8143 MachineOperand &Src = Inst.getOperand(1);
8144 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8145 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8146
8147 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8148 .addImm(31)
8149 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8150
8151 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8152 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8153 .addImm(AMDGPU::sub0)
8154 .addReg(TmpReg)
8155 .addImm(AMDGPU::sub1);
8156
8157 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8158 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8159}
8160
8161void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8162 MachineInstr &Inst, unsigned Opcode,
8163 MachineDominatorTree *MDT) const {
8164 // (S_FLBIT_I32_B64 hi:lo) ->
8165 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8166 // (S_FF1_I32_B64 hi:lo) ->
8167 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
8168
8169 MachineBasicBlock &MBB = *Inst.getParent();
8170 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8171 MachineBasicBlock::iterator MII = Inst;
8172 const DebugLoc &DL = Inst.getDebugLoc();
8173
8174 MachineOperand &Dest = Inst.getOperand(0);
8175 MachineOperand &Src = Inst.getOperand(1);
8176
8177 const MCInstrDesc &InstDesc = get(Opcode);
8178
8179 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8180 unsigned OpcodeAdd =
8181 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8182
8183 const TargetRegisterClass *SrcRC =
8184 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8185 const TargetRegisterClass *SrcSubRC =
8186 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8187
8188 MachineOperand SrcRegSub0 =
8189 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8190 MachineOperand SrcRegSub1 =
8191 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8192
8193 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8194 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8195 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8196 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8197
8198 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8199
8200 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8201
8202 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8203 .addReg(IsCtlz ? MidReg1 : MidReg2)
8204 .addImm(32)
8205 .addImm(1); // enable clamp
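// With the clamp bit set, the add saturates at 0xffffffff, implementing the
// uaddsat from the formulas above.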
8206
8207 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8208 .addReg(MidReg3)
8209 .addReg(IsCtlz ? MidReg2 : MidReg1);
8210
8211 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8212
8213 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8214}
8215
8216void SIInstrInfo::addUsersToMoveToVALUWorklist(
8217 Register DstReg, MachineRegisterInfo &MRI,
8218 SIInstrWorklist &Worklist) const {
8219 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8220 E = MRI.use_end(); I != E;) {
8221 MachineInstr &UseMI = *I->getParent();
8222
8223 unsigned OpNo = 0;
8224
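// For the pass-through operations below, check the register class of the
// result (operand 0); for anything else, check the class of this particular
// use operand.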
8225 switch (UseMI.getOpcode()) {
8226 case AMDGPU::COPY:
8227 case AMDGPU::WQM:
8228 case AMDGPU::SOFT_WQM:
8229 case AMDGPU::STRICT_WWM:
8230 case AMDGPU::STRICT_WQM:
8231 case AMDGPU::REG_SEQUENCE:
8232 case AMDGPU::PHI:
8233 case AMDGPU::INSERT_SUBREG:
8234 break;
8235 default:
8236 OpNo = I.getOperandNo();
8237 break;
8238 }
8239
8240 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8241 Worklist.insert(&UseMI);
8242
8243 do {
8244 ++I;
8245 } while (I != E && I->getParent() == &UseMI);
8246 } else {
8247 ++I;
8248 }
8249 }
8250}
8251
8252void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8253 MachineRegisterInfo &MRI,
8254 MachineInstr &Inst) const {
8255 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8256 MachineBasicBlock *MBB = Inst.getParent();
8257 MachineOperand &Src0 = Inst.getOperand(1);
8258 MachineOperand &Src1 = Inst.getOperand(2);
8259 const DebugLoc &DL = Inst.getDebugLoc();
8260
8261 switch (Inst.getOpcode()) {
8262 case AMDGPU::S_PACK_LL_B32_B16: {
8263 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8264 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8265
8266 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8267 // 0.
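// ResultReg = (Src1 << 16) | (Src0 & 0xffff)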
8268 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8269 .addImm(0xffff);
8270
8271 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8272 .addReg(ImmReg, RegState::Kill)
8273 .add(Src0);
8274
8275 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8276 .add(Src1)
8277 .addImm(16)
8278 .addReg(TmpReg, RegState::Kill);
8279 break;
8280 }
8281 case AMDGPU::S_PACK_LH_B32_B16: {
8282 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
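// ResultReg = (Src0 & 0xffff) | (Src1 & 0xffff0000), via V_BFI with mask 0xffff.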
8283 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8284 .addImm(0xffff);
8285 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8286 .addReg(ImmReg, RegState::Kill)
8287 .add(Src0)
8288 .add(Src1);
8289 break;
8290 }
8291 case AMDGPU::S_PACK_HL_B32_B16: {
8292 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
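// ResultReg = (Src1 << 16) | (Src0 >> 16)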
8293 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8294 .addImm(16)
8295 .add(Src0);
8296 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8297 .add(Src1)
8298 .addImm(16)
8299 .addReg(TmpReg, RegState::Kill);
8300 break;
8301 }
8302 case AMDGPU::S_PACK_HH_B32_B16: {
8303 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8304 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
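// ResultReg = (Src1 & 0xffff0000) | (Src0 >> 16)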
8305 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8306 .addImm(16)
8307 .add(Src0);
8308 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8309 .addImm(0xffff0000);
8310 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8311 .add(Src1)
8312 .addReg(ImmReg, RegState::Kill)
8313 .addReg(TmpReg, RegState::Kill);
8314 break;
8315 }
8316 default:
8317 llvm_unreachable("unhandled s_pack_* instruction");
8318 }
8319
8320 MachineOperand &Dest = Inst.getOperand(0);
8321 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8322 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8323}
8324
8325void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8326 MachineInstr &SCCDefInst,
8327 SIInstrWorklist &Worklist,
8328 Register NewCond) const {
8329
8330 // Ensure that def inst defines SCC, which is still live.
8331 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8332 !Op.isDead() && Op.getParent() == &SCCDefInst);
8333 SmallVector<MachineInstr *, 4> CopyToDelete;
8334 // This assumes that all the users of SCC are in the same block
8335 // as the SCC def.
8336 for (MachineInstr &MI : // Skip the def inst itself.
8337 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8338 SCCDefInst.getParent()->end())) {
8339 // Check if SCC is used first.
8340 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI);
8341 if (SCCIdx != -1) {
8342 if (MI.isCopy()) {
8343 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8344 Register DestReg = MI.getOperand(0).getReg();
8345
8346 MRI.replaceRegWith(DestReg, NewCond);
8347 CopyToDelete.push_back(&MI);
8348 } else {
8349
8350 if (NewCond.isValid())
8351 MI.getOperand(SCCIdx).setReg(NewCond);
8352
8353 Worklist.insert(&MI);
8354 }
8355 }
8356 // Exit if we find another SCC def.
8357 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
8358 break;
8359 }
8360 for (auto &Copy : CopyToDelete)
8361 Copy->eraseFromParent();
8362}
8363
8364// Instructions that use SCC may be converted to VALU instructions. When that
8365// happens, the SCC register is changed to VCC_LO. The instruction that defines
8366// SCC must be changed to an instruction that defines VCC. This function makes
8367// sure that the instruction that defines SCC is added to the moveToVALU
8368// worklist.
8369void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8370 SIInstrWorklist &Worklist) const {
8371 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8372 // then there is nothing to do because the defining instruction has been
8373 // converted to a VALU already. If SCC then that instruction needs to be
8374 // converted to a VALU.
8375 for (MachineInstr &MI :
8376 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8377 SCCUseInst->getParent()->rend())) {
8378 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8379 break;
8380 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8381 Worklist.insert(&MI);
8382 break;
8383 }
8384 }
8385}
8386
8387const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8388 const MachineInstr &Inst) const {
8389 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8390
8391 switch (Inst.getOpcode()) {
8392 // For target instructions, getOpRegClass just returns the virtual register
8393 // class associated with the operand, so we need to find an equivalent VGPR
8394 // register class in order to move the instruction to the VALU.
8395 case AMDGPU::COPY:
8396 case AMDGPU::PHI:
8397 case AMDGPU::REG_SEQUENCE:
8398 case AMDGPU::INSERT_SUBREG:
8399 case AMDGPU::WQM:
8400 case AMDGPU::SOFT_WQM:
8401 case AMDGPU::STRICT_WWM:
8402 case AMDGPU::STRICT_WQM: {
8403 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8404 if (RI.isAGPRClass(SrcRC)) {
8405 if (RI.isAGPRClass(NewDstRC))
8406 return nullptr;
8407
8408 switch (Inst.getOpcode()) {
8409 case AMDGPU::PHI:
8410 case AMDGPU::REG_SEQUENCE:
8411 case AMDGPU::INSERT_SUBREG:
8412 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8413 break;
8414 default:
8415 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8416 }
8417
8418 if (!NewDstRC)
8419 return nullptr;
8420 } else {
8421 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8422 return nullptr;
8423
8424 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8425 if (!NewDstRC)
8426 return nullptr;
8427 }
8428
8429 return NewDstRC;
8430 }
8431 default:
8432 return NewDstRC;
8433 }
8434}
8435
8436// Find the one SGPR operand we are allowed to use.
8437Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8438 int OpIndices[3]) const {
8439 const MCInstrDesc &Desc = MI.getDesc();
8440
8441 // Find the one SGPR operand we are allowed to use.
8442 //
8443 // First we need to consider the instruction's operand requirements before
8444 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8445 // of VCC, but we are still bound by the constant bus requirement to only use
8446 // one.
8447 //
8448 // If the operand's class is an SGPR, we can never move it.
8449
8450 Register SGPRReg = findImplicitSGPRRead(MI);
8451 if (SGPRReg)
8452 return SGPRReg;
8453
8454 Register UsedSGPRs[3] = {Register()};
8455 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8456
8457 for (unsigned i = 0; i < 3; ++i) {
8458 int Idx = OpIndices[i];
8459 if (Idx == -1)
8460 break;
8461
8462 const MachineOperand &MO = MI.getOperand(Idx);
8463 if (!MO.isReg())
8464 continue;
8465
8466 // Is this operand statically required to be an SGPR based on the operand
8467 // constraints?
8468 const TargetRegisterClass *OpRC =
8469 RI.getRegClass(Desc.operands()[Idx].RegClass);
8470 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8471 if (IsRequiredSGPR)
8472 return MO.getReg();
8473
8474 // If this could be a VGPR or an SGPR, check the dynamic register class.
8475 Register Reg = MO.getReg();
8476 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8477 if (RI.isSGPRClass(RegRC))
8478 UsedSGPRs[i] = Reg;
8479 }
8480
8481 // We don't have a required SGPR operand, so we have a bit more freedom in
8482 // selecting operands to move.
8483
8484 // Try to select the most used SGPR. If an SGPR is equal to one of the
8485 // others, we choose that.
8486 //
8487 // e.g.
8488 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8489 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8490
8491 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8492 // prefer those.
8493
8494 if (UsedSGPRs[0]) {
8495 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8496 SGPRReg = UsedSGPRs[0];
8497 }
8498
8499 if (!SGPRReg && UsedSGPRs[1]) {
8500 if (UsedSGPRs[1] == UsedSGPRs[2])
8501 SGPRReg = UsedSGPRs[1];
8502 }
8503
8504 return SGPRReg;
8505}
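The tie-breaking rule spelled out above (keep an SGPR that is repeated among the up-to-three candidate operands, since a repeated SGPR occupies only one constant-bus slot) can be illustrated with a small standalone sketch. Everything below is a hypothetical illustration, not part of SIInstrInfo.cpp; plain integers stand in for llvm::Register.

#include <cstdint>
#include <cstdio>

// 0 plays the role of an invalid/absent register.
using Reg = uint32_t;

// Mirrors the preference order used by findUsedSGPR: pick an SGPR that is
// equal to one of the other SGPR operands, otherwise report "no preference".
static Reg pickRepeatedSGPR(Reg S0, Reg S1, Reg S2) {
  if (S0 && (S0 == S1 || S0 == S2))
    return S0;
  if (S1 && S1 == S2)
    return S1;
  return 0;
}

int main() {
  // V_FMA_F32 v0, s0, s0, s0 -> s0 is repeated, keep it; no moves needed.
  std::printf("%u\n", (unsigned)pickRepeatedSGPR(10, 10, 10)); // 10
  // V_FMA_F32 v0, s0, s1, s0 -> keep s0; s1 has to be moved to a VGPR.
  std::printf("%u\n", (unsigned)pickRepeatedSGPR(10, 11, 10)); // 10
  // All distinct -> no repeated SGPR to prefer.
  std::printf("%u\n", (unsigned)pickRepeatedSGPR(10, 11, 12)); // 0
  return 0;
}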
8506
8507MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
8508 unsigned OperandName) const {
8509 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8510 if (Idx == -1)
8511 return nullptr;
8512
8513 return &MI.getOperand(Idx);
8514}
8515
8521 return (Format << 44) |
8522 (1ULL << 56) | // RESOURCE_LEVEL = 1
8523 (3ULL << 60); // OOB_SELECT = 3
8524 }
8525
8526 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8527 if (ST.isAmdHsaOS()) {
8528 // Set ATC = 1. GFX9 doesn't have this bit.
8530 RsrcDataFormat |= (1ULL << 56);
8531
8532 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8533 // BTW, it disables TC L2 and therefore decreases performance.
8535 RsrcDataFormat |= (2ULL << 59);
8536 }
8537
8538 return RsrcDataFormat;
8539}
8540
8544 0xffffffff; // Size;
8545
8546 // GFX9 doesn't have ELEMENT_SIZE.
8548 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8549 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8550 }
8551
8552 // IndexStride = 64 for wave64, 32 for wave32 (encoded as 3 and 2 respectively).
8553 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
8554 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8555
8556 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8557 // Clear them unless we want a huge stride.
8560 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8561
8562 return Rsrc23;
8563}
8564
8566 unsigned Opc = MI.getOpcode();
8567
8568 return isSMRD(Opc);
8569}
8570
8572 return get(Opc).mayLoad() &&
8573 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8574}
8575
8577 int &FrameIndex) const {
8578 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8579 if (!Addr || !Addr->isFI())
8580 return Register();
8581
8582 assert(!MI.memoperands_empty() &&
8583 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8584
8585 FrameIndex = Addr->getIndex();
8586 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8587}
8588
8590 int &FrameIndex) const {
8591 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8592 assert(Addr && Addr->isFI());
8593 FrameIndex = Addr->getIndex();
8594 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8595}
8596
8598 int &FrameIndex) const {
8599 if (!MI.mayLoad())
8600 return Register();
8601
8602 if (isMUBUF(MI) || isVGPRSpill(MI))
8603 return isStackAccess(MI, FrameIndex);
8604
8605 if (isSGPRSpill(MI))
8606 return isSGPRStackAccess(MI, FrameIndex);
8607
8608 return Register();
8609}
8610
8612 int &FrameIndex) const {
8613 if (!MI.mayStore())
8614 return Register();
8615
8616 if (isMUBUF(MI) || isVGPRSpill(MI))
8617 return isStackAccess(MI, FrameIndex);
8618
8619 if (isSGPRSpill(MI))
8620 return isSGPRStackAccess(MI, FrameIndex);
8621
8622 return Register();
8623}
8624
8626 unsigned Size = 0;
8628 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8629 while (++I != E && I->isInsideBundle()) {
8630 assert(!I->isBundle() && "No nested bundle!");
8632 }
8633
8634 return Size;
8635}
8636
8638 unsigned Opc = MI.getOpcode();
8640 unsigned DescSize = Desc.getSize();
8641
8642 // If we have a definitive size, we can use it. Otherwise we need to inspect
8643 // the operands to know the size.
8644 if (isFixedSize(MI)) {
8645 unsigned Size = DescSize;
8646
8647 // If we hit the buggy offset, an extra nop will be inserted in MC so
8648 // estimate the worst case.
8649 if (MI.isBranch() && ST.hasOffset3fBug())
8650 Size += 4;
8651
8652 return Size;
8653 }
8654
8655 // Instructions may have a 32-bit literal encoded after them. Check
8656 // operands that could ever be literals.
8657 if (isVALU(MI) || isSALU(MI)) {
8658 if (isDPP(MI))
8659 return DescSize;
8660 bool HasLiteral = false;
8661 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8662 const MachineOperand &Op = MI.getOperand(I);
8663 const MCOperandInfo &OpInfo = Desc.operands()[I];
8664 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8665 HasLiteral = true;
8666 break;
8667 }
8668 }
8669 return HasLiteral ? DescSize + 4 : DescSize;
8670 }
8671
8672 // Check whether we have extra NSA words.
8673 if (isMIMG(MI)) {
8674 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8675 if (VAddr0Idx < 0)
8676 return 8;
8677
8678 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8679 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8680 }
8681
8682 switch (Opc) {
8683 case TargetOpcode::BUNDLE:
8684 return getInstBundleSize(MI);
8685 case TargetOpcode::INLINEASM:
8686 case TargetOpcode::INLINEASM_BR: {
8687 const MachineFunction *MF = MI.getParent()->getParent();
8688 const char *AsmStr = MI.getOperand(0).getSymbolName();
8689 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8690 }
8691 default:
8692 if (MI.isMetaInstruction())
8693 return 0;
8694 return DescSize;
8695 }
8696}
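As a rough illustration of the arithmetic above: a fixed descriptor size, plus 4 bytes whenever a VALU/SALU instruction carries a 32-bit literal, and for MIMG an extra NSA dword per group of up to four address registers beyond the first. A standalone sketch under those assumptions (the parameters are stand-ins for what is read from the MachineInstr and its MCInstrDesc):

#include <cstdio>

// Size of a VALU/SALU instruction: base descriptor size plus 4 bytes when a
// 32-bit literal follows the encoding (DPP forms never take a literal).
static unsigned estimateSize(unsigned DescSize, bool IsDPP, bool HasLiteral) {
  if (!IsDPP && HasLiteral)
    return DescSize + 4;
  return DescSize;
}

// MIMG with NSA: 8 bytes of base encoding plus one extra dword for each group
// of up to four address registers beyond the first (vaddr0 .. srsrc-1).
static unsigned estimateMIMGSize(int VAddr0Idx, int RSrcIdx) {
  if (VAddr0Idx < 0)
    return 8;
  return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
}

int main() {
  std::printf("%u\n", estimateSize(4, false, true)); // 8: e.g. a VOP2 with a literal
  std::printf("%u\n", estimateSize(8, true, true));  // 8: DPP keeps the descriptor size
  std::printf("%u\n", estimateMIMGSize(1, 2));       // 8: a single address, no NSA dword
  std::printf("%u\n", estimateMIMGSize(1, 6));       // 12: five addresses, one NSA dword
  return 0;
}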
8697
8699 if (!isFLAT(MI))
8700 return false;
8701
8702 if (MI.memoperands_empty())
8703 return true;
8704
8705 for (const MachineMemOperand *MMO : MI.memoperands()) {
8706 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8707 return true;
8708 }
8709 return false;
8710}
8711
8713 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
8714}
8715
8717 MachineBasicBlock *IfEnd) const {
8719 assert(TI != IfEntry->end());
8720
8721 MachineInstr *Branch = &(*TI);
8722 MachineFunction *MF = IfEntry->getParent();
8724
8725 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8726 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8727 MachineInstr *SIIF =
8728 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8729 .add(Branch->getOperand(0))
8730 .add(Branch->getOperand(1));
8731 MachineInstr *SIEND =
8732 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8733 .addReg(DstReg);
8734
8735 IfEntry->erase(TI);
8736 IfEntry->insert(IfEntry->end(), SIIF);
8737 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8738 }
8739}
8740
8742 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
8744 // We expect 2 terminators, one conditional and one unconditional.
8745 assert(TI != LoopEnd->end());
8746
8747 MachineInstr *Branch = &(*TI);
8748 MachineFunction *MF = LoopEnd->getParent();
8750
8751 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8752
8753 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8754 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
8755 MachineInstrBuilder HeaderPHIBuilder =
8756 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8757 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8758 if (PMBB == LoopEnd) {
8759 HeaderPHIBuilder.addReg(BackEdgeReg);
8760 } else {
8761 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
8762 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8763 ZeroReg, 0);
8764 HeaderPHIBuilder.addReg(ZeroReg);
8765 }
8766 HeaderPHIBuilder.addMBB(PMBB);
8767 }
8768 MachineInstr *HeaderPhi = HeaderPHIBuilder;
8769 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8770 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
8771 .addReg(DstReg)
8772 .add(Branch->getOperand(0));
8773 MachineInstr *SILOOP =
8774 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8775 .addReg(BackEdgeReg)
8776 .addMBB(LoopEntry);
8777
8778 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8779 LoopEnd->erase(TI);
8780 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8781 LoopEnd->insert(LoopEnd->end(), SILOOP);
8782 }
8783}
8784
8787 static const std::pair<int, const char *> TargetIndices[] = {
8788 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8789 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8790 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8791 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8792 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8793 return ArrayRef(TargetIndices);
8794}
8795
8796/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
8797/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8800 const ScheduleDAG *DAG) const {
8801 return new GCNHazardRecognizer(DAG->MF);
8802}
8803
8804/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8805/// pass.
8808 return new GCNHazardRecognizer(MF);
8809}
8810
8811// Called during:
8812// - pre-RA scheduling and post-RA scheduling
8815 const ScheduleDAGMI *DAG) const {
8816 // Borrowed from Arm Target
8817 // We would like to restrict this hazard recognizer to only
8818 // post-RA scheduling; we can tell that we're post-RA because we don't
8819 // track VRegLiveness.
8820 if (!DAG->hasVRegLiveness())
8821 return new GCNHazardRecognizer(DAG->MF);
8823}
8824
8825std::pair<unsigned, unsigned>
8827 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8828}
8829
8832 static const std::pair<unsigned, const char *> TargetFlags[] = {
8833 { MO_GOTPCREL, "amdgpu-gotprel" },
8834 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8835 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8836 { MO_REL32_LO, "amdgpu-rel32-lo" },
8837 { MO_REL32_HI, "amdgpu-rel32-hi" },
8838 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8839 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8840 };
8841
8842 return ArrayRef(TargetFlags);
8843}
8844
8847 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8848 {
8849 {MONoClobber, "amdgpu-noclobber"},
8850 {MOLastUse, "amdgpu-last-use"},
8851 };
8852
8853 return ArrayRef(TargetFlags);
8854}
8855
8857 const MachineFunction &MF) const {
8859 assert(SrcReg.isVirtual());
8860 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8861 return AMDGPU::WWM_COPY;
8862
8863 return AMDGPU::COPY;
8864}
8865
8867 Register Reg) const {
8868 // We need to handle instructions which may be inserted during register
8869 // allocation to handle the prolog. The initial prolog instruction may have
8870 // been separated from the start of the block by spills and copies inserted
8871 // for the prolog. However, the insertions for scalar registers can
8872 // always be placed at the BB top as they are independent of the exec mask
8873 // value.
8874 bool IsNullOrVectorRegister = true;
8875 if (Reg) {
8876 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8877 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8878 }
8879
8880 uint16_t Opcode = MI.getOpcode();
8881 // FIXME: Copies inserted in the block prolog for live-range split should also
8882 // be included.
8883 return IsNullOrVectorRegister &&
8884 (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8885 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8886}
8887
8891 const DebugLoc &DL,
8892 Register DestReg) const {
8893 if (ST.hasAddNoCarry())
8894 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8895
8897 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8898 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8899
8900 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8901 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8902}
8903
8906 const DebugLoc &DL,
8907 Register DestReg,
8908 RegScavenger &RS) const {
8909 if (ST.hasAddNoCarry())
8910 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
8911
8912 // If available, prefer to use vcc.
8913 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
8914 ? Register(RI.getVCC())
8916 *RI.getBoolRC(), I, /* RestoreAfter */ false,
8917 0, /* AllowSpill */ false);
8918
8919 // TODO: Users need to deal with this.
8920 if (!UnusedCarry.isValid())
8921 return MachineInstrBuilder();
8922
8923 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8924 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8925}
8926
8927bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
8928 switch (Opcode) {
8929 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
8930 case AMDGPU::SI_KILL_I1_TERMINATOR:
8931 return true;
8932 default:
8933 return false;
8934 }
8935}
8936
8938 switch (Opcode) {
8939 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
8940 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
8941 case AMDGPU::SI_KILL_I1_PSEUDO:
8942 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
8943 default:
8944 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
8945 }
8946}
8947
8948bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
8949 return Imm <= getMaxMUBUFImmOffset(ST);
8950}
8951
8953 // The GFX12 field is a 24-bit signed byte offset, but only the non-negative range is legal here.
8954 const unsigned OffsetBits =
8955 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
8956 return (1 << OffsetBits) - 1;
8957}
8958
8960 if (!ST.isWave32())
8961 return;
8962
8963 if (MI.isInlineAsm())
8964 return;
8965
8966 for (auto &Op : MI.implicit_operands()) {
8967 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
8968 Op.setReg(AMDGPU::VCC_LO);
8969 }
8970}
8971
8973 if (!isSMRD(MI))
8974 return false;
8975
8976 // Check that it is using a buffer resource.
8977 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
8978 if (Idx == -1) // e.g. s_memtime
8979 return false;
8980
8981 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
8982 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
8983}
8984
8985// Given Imm, split it into the values to put into the SOffset and ImmOffset
8986// fields in an MUBUF instruction. Return false if it is not possible (due to a
8987// hardware bug needing a workaround).
8988//
8989// The required alignment ensures that individual address components remain
8990// aligned if they are aligned to begin with. It also ensures that additional
8991// offsets within the given alignment can be added to the resulting ImmOffset.
8993 uint32_t &ImmOffset, Align Alignment) const {
8994 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
8995 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
8996 uint32_t Overflow = 0;
8997
8998 if (Imm > MaxImm) {
8999 if (Imm <= MaxImm + 64) {
9000 // Use an SOffset inline constant for 4..64
9001 Overflow = Imm - MaxImm;
9002 Imm = MaxImm;
9003 } else {
9004 // Try to keep the same value in SOffset for adjacent loads, so that
9005 // the corresponding register contents can be re-used.
9006 //
9007 // Load values with all low-bits (except for alignment bits) set into
9008 // SOffset, so that a larger range of values can be covered using
9009 // s_movk_i32.
9010 //
9011 // Atomic operations fail to work correctly when individual address
9012 // components are unaligned, even if their sum is aligned.
9013 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9014 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9015 Imm = Low;
9016 Overflow = High - Alignment.value();
9017 }
9018 }
9019
9020 if (Overflow > 0) {
9021 // There is a hardware bug in SI and CI which prevents address clamping in
9022 // MUBUF instructions from working correctly with SOffsets. The immediate
9023 // offset is unaffected.
9025 return false;
9026
9027 // It is not possible to set immediate in SOffset field on some targets.
9028 if (ST.hasRestrictedSOffset())
9029 return false;
9030 }
9031
9032 ImmOffset = Imm;
9033 SOffset = Overflow;
9034 return true;
9035}
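To make the arithmetic above concrete, here is a standalone sketch of the split, assuming a 12-bit immediate field (MaxOffset = 0xFFF), a power-of-two alignment, and none of the SI/CI or restricted-SOffset bail-outs; the names are illustrative, not the real API.

#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <utility>

// Split Imm into {ImmOffset, SOffset} the way splitMUBUFOffset does, for a
// hypothetical 12-bit immediate field. Align must be a power of two.
static std::pair<uint32_t, uint32_t> splitOffset(uint32_t Imm, uint32_t Align) {
  const uint32_t MaxOffset = 0xFFF;
  const uint32_t MaxImm = MaxOffset & ~(Align - 1); // alignDown(MaxOffset, Align)
  uint32_t Overflow = 0;

  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      // A small overflow fits an SOffset inline constant (4..64).
      Overflow = Imm - MaxImm;
      Imm = MaxImm;
    } else {
      // Keep all low bits (except alignment bits) in the immediate so that
      // adjacent accesses can share the same SOffset register value.
      uint32_t High = (Imm + Align) & ~MaxOffset;
      uint32_t Low = (Imm + Align) & MaxOffset;
      Imm = Low;
      Overflow = High - Align;
    }
  }
  return {Imm, Overflow}; // ImmOffset + SOffset == original Imm
}

int main() {
  for (uint32_t Imm : {0x800u, 0x1000u, 0x1234u}) {
    auto [ImmOff, SOff] = splitOffset(Imm, 4);
    std::printf("Imm=0x%x -> ImmOffset=0x%x SOffset=0x%x sum=0x%x\n",
                (unsigned)Imm, (unsigned)ImmOff, (unsigned)SOff,
                (unsigned)(ImmOff + SOff));
  }
  return 0;
}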
9036
9037// Depending on the used address space and instructions, some immediate offsets
9038// are allowed and some are not.
9039// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9040// scratch instruction offsets can also be negative. On GFX12, offsets can be
9041// negative for all variants.
9042//
9043// There are several bugs related to these offsets:
9044// On gfx10.1, flat instructions that go into the global address space cannot
9045// use an offset.
9046//
9047// For scratch instructions, the address can be either an SGPR or a VGPR.
9048// The following offsets can be used, depending on the architecture (x means
9049// cannot be used):
9050// +----------------------------+------+------+
9051// | Address-Mode | SGPR | VGPR |
9052// +----------------------------+------+------+
9053// | gfx9 | | |
9054// | negative, 4-aligned offset | x | ok |
9055// | negative, unaligned offset | x | ok |
9056// +----------------------------+------+------+
9057// | gfx10 | | |
9058// | negative, 4-aligned offset | ok | ok |
9059// | negative, unaligned offset | ok | x |
9060// +----------------------------+------+------+
9061// | gfx10.3 | | |
9062// | negative, 4-aligned offset | ok | ok |
9063// | negative, unaligned offset | ok | ok |
9064// +----------------------------+------+------+
9065//
9066// This function ignores the addressing mode, so if an offset cannot be used in
9067// one addressing mode, it is considered illegal.
9068bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9069 uint64_t FlatVariant) const {
9070 // TODO: Should 0 be special cased?
9071 if (!ST.hasFlatInstOffsets())
9072 return false;
9073
9074 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9075 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9076 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9077 return false;
9078
9080 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9081 (Offset % 4) != 0) {
9082 return false;
9083 }
9084
9085 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9086 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9087 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9088}
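The legality test above boils down to: the offset must fit in the subtarget's signed N-bit field, negative offsets are only allowed where the variant and target support them, and on targets with the unaligned-negative-scratch bug a negative scratch offset must also be a multiple of 4. A small hedged sketch; the bit width, flags and helper names are illustrative stand-ins, not the real API.

#include <cstdint>
#include <cstdio>

// isIntN-style check: does Offset fit in a signed N-bit field?
static bool fitsSignedBits(int64_t Offset, unsigned N) {
  const int64_t Min = -(int64_t(1) << (N - 1));
  const int64_t Max = (int64_t(1) << (N - 1)) - 1;
  return Offset >= Min && Offset <= Max;
}

// Sketch of isLegalFLATOffset for one addressing variant.
static bool isLegalOffset(int64_t Offset, unsigned NumBits, bool AllowNegative,
                          bool HasUnalignedNegScratchBug, bool IsScratch) {
  if (HasUnalignedNegScratchBug && IsScratch && Offset < 0 && (Offset % 4) != 0)
    return false;
  return fitsSignedBits(Offset, NumBits) && (AllowNegative || Offset >= 0);
}

int main() {
  // Assume a 13-bit signed field for illustration: range [-4096, 4095].
  std::printf("%d\n", isLegalOffset(4095, 13, true, false, false)); // 1
  std::printf("%d\n", isLegalOffset(4096, 13, true, false, false)); // 0: out of range
  std::printf("%d\n", isLegalOffset(-6, 13, true, true, true));     // 0: unaligned negative scratch
  std::printf("%d\n", isLegalOffset(-8, 13, true, true, true));     // 1
  return 0;
}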
9089
9090// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9091std::pair<int64_t, int64_t>
9092SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9093 uint64_t FlatVariant) const {
9094 int64_t RemainderOffset = COffsetVal;
9095 int64_t ImmField = 0;
9096
9097 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9098 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9099
9100 if (AllowNegative) {
9101 // Use signed division by a power of two to truncate towards 0.
9102 int64_t D = 1LL << NumBits;
9103 RemainderOffset = (COffsetVal / D) * D;
9104 ImmField = COffsetVal - RemainderOffset;
9105
9107 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9108 (ImmField % 4) != 0) {
9109 // Make ImmField a multiple of 4
9110 RemainderOffset += ImmField % 4;
9111 ImmField -= ImmField % 4;
9112 }
9113 } else if (COffsetVal >= 0) {
9114 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9115 RemainderOffset = COffsetVal - ImmField;
9116 }
9117
9118 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9119 assert(RemainderOffset + ImmField == COffsetVal);
9120 return {ImmField, RemainderOffset};
9121}
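A standalone sketch of the split above: in the signed case, truncating division by a power of two keeps the sign of the immediate and bounds its magnitude, and the remainder is what gets folded into the address. NumBits here plays the role of getNumFlatOffsetBits(ST) - 1; the names are illustrative, and the negative-scratch alignment fixup is omitted.

#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <utility>

// Split COffsetVal into {ImmField, RemainderOffset} the way splitFlatOffset does.
static std::pair<int64_t, int64_t> splitOffset(int64_t COffsetVal, unsigned NumBits,
                                               bool AllowNegative) {
  int64_t RemainderOffset = COffsetVal;
  int64_t ImmField = 0;

  if (AllowNegative) {
    // Signed division by a power of two truncates toward zero, so ImmField
    // keeps the sign of COffsetVal and |ImmField| < 2^NumBits.
    const int64_t D = int64_t(1) << NumBits;
    RemainderOffset = (COffsetVal / D) * D;
    ImmField = COffsetVal - RemainderOffset;
  } else if (COffsetVal >= 0) {
    ImmField = COffsetVal & ((int64_t(1) << NumBits) - 1);
    RemainderOffset = COffsetVal - ImmField;
  }
  return {ImmField, RemainderOffset}; // ImmField + RemainderOffset == COffsetVal
}

int main() {
  // Assume a 13-bit signed offset field, so NumBits = 12 here.
  for (int64_t C : {5000LL, -5000LL, 123456LL}) {
    auto [Imm, Rem] = splitOffset(C, 12, /*AllowNegative=*/true);
    std::printf("%lld -> imm=%lld remainder=%lld\n", (long long)C,
                (long long)Imm, (long long)Rem);
  }
  return 0;
}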
9122
9124 if (ST.hasNegativeScratchOffsetBug() &&
9125 FlatVariant == SIInstrFlags::FlatScratch)
9126 return false;
9127
9128 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9129}
9130
9131static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9132 switch (ST.getGeneration()) {
9133 default:
9134 break;
9137 return SIEncodingFamily::SI;
9140 return SIEncodingFamily::VI;
9147 }
9148 llvm_unreachable("Unknown subtarget generation!");
9149}
9150
9151bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9152 switch(MCOp) {
9153 // These opcodes use indirect register addressing so
9154 // they need special handling by codegen (currently missing).
9155 // Therefore it is too risky to allow these opcodes
9156 // to be selected by dpp combiner or sdwa peepholer.
9157 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9158 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9159 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9160 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9161 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9162 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9163 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9164 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9165 return true;
9166 default:
9167 return false;
9168 }
9169}
9170
9171int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9172 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9173
9174 unsigned Gen = subtargetEncodingFamily(ST);
9175
9176 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
9179
9180 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9181 // subtarget has UnpackedD16VMem feature.
9182 // TODO: remove this when we discard GFX80 encoding.
9183 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9185
9186 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9187 switch (ST.getGeneration()) {
9188 default:
9190 break;
9193 break;
9196 break;
9197 }
9198 }
9199
9200 if (isMAI(Opcode)) {
9201 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9202 if (MFMAOp != -1)
9203 Opcode = MFMAOp;
9204 }
9205
9206 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9207
9208 // -1 means that Opcode is already a native instruction.
9209 if (MCOp == -1)
9210 return Opcode;
9211
9212 if (ST.hasGFX90AInsts()) {
9213 uint16_t NMCOp = (uint16_t)-1;
9214 if (ST.hasGFX940Insts())
9216 if (NMCOp == (uint16_t)-1)
9218 if (NMCOp == (uint16_t)-1)
9220 if (NMCOp != (uint16_t)-1)
9221 MCOp = NMCOp;
9222 }
9223
9224 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9225 // no encoding in the given subtarget generation.
9226 if (MCOp == (uint16_t)-1)
9227 return -1;
9228
9229 if (isAsmOnlyOpcode(MCOp))
9230 return -1;
9231
9232 return MCOp;
9233}
9234
9235 static
9236TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
9237 assert(RegOpnd.isReg());
9238 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9239 getRegSubRegPair(RegOpnd);
9240}
9241
9244 assert(MI.isRegSequence());
9245 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9246 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9247 auto &RegOp = MI.getOperand(1 + 2 * I);
9248 return getRegOrUndef(RegOp);
9249 }
9251}
9252
9253// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9254// Following a subreg of reg:subreg isn't supported
9257 if (!RSR.SubReg)
9258 return false;
9259 switch (MI.getOpcode()) {
9260 default: break;
9261 case AMDGPU::REG_SEQUENCE:
9262 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9263 return true;
9264 // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
9265 case AMDGPU::INSERT_SUBREG:
9266 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9267 // inserted the subreg we're looking for
9268 RSR = getRegOrUndef(MI.getOperand(2));
9269 else { // the subreg in the rest of the reg
9270 auto R1 = getRegOrUndef(MI.getOperand(1));
9271 if (R1.SubReg) // subreg of subreg isn't supported
9272 return false;
9273 RSR.Reg = R1.Reg;
9274 }
9275 return true;
9276 }
9277 return false;
9278}
9279
9282 assert(MRI.isSSA());
9283 if (!P.Reg.isVirtual())
9284 return nullptr;
9285
9286 auto RSR = P;
9287 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9288 while (auto *MI = DefInst) {
9289 DefInst = nullptr;
9290 switch (MI->getOpcode()) {
9291 case AMDGPU::COPY:
9292 case AMDGPU::V_MOV_B32_e32: {
9293 auto &Op1 = MI->getOperand(1);
9294 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9295 if (Op1.isUndef())
9296 return nullptr;
9297 RSR = getRegSubRegPair(Op1);
9298 DefInst = MRI.getVRegDef(RSR.Reg);
9299 }
9300 break;
9301 }
9302 default:
9303 if (followSubRegDef(*MI, RSR)) {
9304 if (!RSR.Reg)
9305 return nullptr;
9306 DefInst = MRI.getVRegDef(RSR.Reg);
9307 }
9308 }
9309 if (!DefInst)
9310 return MI;
9311 }
9312 return nullptr;
9313}
9314
9316 Register VReg,
9317 const MachineInstr &DefMI,
9318 const MachineInstr &UseMI) {
9319 assert(MRI.isSSA() && "Must be run on SSA");
9320
9321 auto *TRI = MRI.getTargetRegisterInfo();
9322 auto *DefBB = DefMI.getParent();
9323
9324 // Don't bother searching between blocks, although it is possible this block
9325 // doesn't modify exec.
9326 if (UseMI.getParent() != DefBB)
9327 return true;
9328
9329 const int MaxInstScan = 20;
9330 int NumInst = 0;
9331
9332 // Stop scan at the use.
9333 auto E = UseMI.getIterator();
9334 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9335 if (I->isDebugInstr())
9336 continue;
9337
9338 if (++NumInst > MaxInstScan)
9339 return true;
9340
9341 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9342 return true;
9343 }
9344
9345 return false;
9346}
9347
9349 Register VReg,
9350 const MachineInstr &DefMI) {
9351 assert(MRI.isSSA() && "Must be run on SSA");
9352
9353 auto *TRI = MRI.getTargetRegisterInfo();
9354 auto *DefBB = DefMI.getParent();
9355
9356 const int MaxUseScan = 10;
9357 int NumUse = 0;
9358
9359 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9360 auto &UseInst = *Use.getParent();
9361 // Don't bother searching between blocks, although it is possible this block
9362 // doesn't modify exec.
9363 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9364 return true;
9365
9366 if (++NumUse > MaxUseScan)
9367 return true;
9368 }
9369
9370 if (NumUse == 0)
9371 return false;
9372
9373 const int MaxInstScan = 20;
9374 int NumInst = 0;
9375
9376 // Stop scan when we have seen all the uses.
9377 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9378 assert(I != DefBB->end());
9379
9380 if (I->isDebugInstr())
9381 continue;
9382
9383 if (++NumInst > MaxInstScan)
9384 return true;
9385
9386 for (const MachineOperand &Op : I->operands()) {
9387 // We don't check reg masks here as they're used only on calls:
9388 // 1. EXEC is only considered const within one BB
9389 // 2. Call should be a terminator instruction if present in a BB
9390
9391 if (!Op.isReg())
9392 continue;
9393
9394 Register Reg = Op.getReg();
9395 if (Op.isUse()) {
9396 if (Reg == VReg && --NumUse == 0)
9397 return false;
9398 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9399 return true;
9400 }
9401 }
9402}
9403
9406 const DebugLoc &DL, Register Src, Register Dst) const {
9407 auto Cur = MBB.begin();
9408 if (Cur != MBB.end())
9409 do {
9410 if (!Cur->isPHI() && Cur->readsRegister(Dst))
9411 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9412 ++Cur;
9413 } while (Cur != MBB.end() && Cur != LastPHIIt);
9414
9415 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9416 Dst);
9417}
9418
9421 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9422 if (InsPt != MBB.end() &&
9423 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9424 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9425 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9426 InsPt->definesRegister(Src)) {
9427 InsPt++;
9428 return BuildMI(MBB, InsPt, DL,
9429 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9430 : AMDGPU::S_MOV_B64_term),
9431 Dst)
9432 .addReg(Src, 0, SrcSubReg)
9433 .addReg(AMDGPU::EXEC, RegState::Implicit);
9434 }
9435 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9436 Dst);
9437}
9438
9439bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9440
9443 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9444 VirtRegMap *VRM) const {
9445 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9446 //
9447 // %0:sreg_32 = COPY $m0
9448 //
9449 // We explicitly chose SReg_32 for the virtual register so such a copy might
9450 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9451 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9452 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9453 // TargetInstrInfo::foldMemoryOperand() is going to try.
9454 // A similar issue also exists with spilling and reloading $exec registers.
9455 //
9456 // To prevent that, constrain the %0 register class here.
9457 if (isFullCopyInstr(MI)) {
9458 Register DstReg = MI.getOperand(0).getReg();
9459 Register SrcReg = MI.getOperand(1).getReg();
9460 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9461 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9463 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9464 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9465 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9466 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9467 return nullptr;
9468 } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9469 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9470 return nullptr;
9471 }
9472 }
9473 }
9474
9475 return nullptr;
9476}
9477
9479 const MachineInstr &MI,
9480 unsigned *PredCost) const {
9481 if (MI.isBundle()) {
9483 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9484 unsigned Lat = 0, Count = 0;
9485 for (++I; I != E && I->isBundledWithPred(); ++I) {
9486 ++Count;
9487 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9488 }
9489 return Lat + Count - 1;
9490 }
9491
9492 return SchedModel.computeInstrLatency(&MI);
9493}
9494
9497 unsigned opcode = MI.getOpcode();
9498 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9499 auto IID = GI->getIntrinsicID();
9504
9505 switch (IID) {
9506 case Intrinsic::amdgcn_if:
9507 case Intrinsic::amdgcn_else:
9508 // FIXME: Uniform if second result
9509 break;
9510 }
9511
9513 }
9514
9515 // Loads from the private and flat address spaces are divergent, because
9516 // threads can execute the load instruction with the same inputs and get
9517 // different results.
9518 //
9519 // All other loads are not divergent, because if threads issue loads with the
9520 // same arguments, they will always get the same result.
9521 if (opcode == AMDGPU::G_LOAD) {
9522 if (MI.memoperands_empty())
9523 return InstructionUniformity::NeverUniform; // conservative assumption
9524
9525 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9526 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9527 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9528 })) {
9529 // At least one MMO in a non-global address space.
9531 }
9533 }
9534
9535 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9536 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9537 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9538 AMDGPU::isGenericAtomic(opcode)) {
9540 }
9542}
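The rule described in the comments above reduces to a check over the memory operands' address spaces. A minimal standalone sketch, with hypothetical enum values standing in for AMDGPUAS::FLAT_ADDRESS and AMDGPUAS::PRIVATE_ADDRESS:

#include <cstdio>
#include <vector>

// Illustrative address-space tags; the numeric values are assumptions here,
// not the real AMDGPUAS enumerators.
enum AddrSpace { Flat = 0, Global = 1, Private = 5 };

// A load is treated as never-uniform if any memory operand may touch the
// flat or private address space, or if nothing at all is known about it.
static bool loadMayBeDivergent(const std::vector<AddrSpace> &MemOperands) {
  if (MemOperands.empty())
    return true; // conservative assumption, as above
  for (AddrSpace AS : MemOperands)
    if (AS == Flat || AS == Private)
      return true;
  return false;
}

int main() {
  std::printf("%d\n", loadMayBeDivergent({Global}));       // 0: same inputs, same result
  std::printf("%d\n", loadMayBeDivergent({Global, Flat})); // 1: may hit private/flat memory
  std::printf("%d\n", loadMayBeDivergent({}));             // 1: no memoperands, be conservative
  return 0;
}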
9543
9546
9547 if (isNeverUniform(MI))
9549
9550 unsigned opcode = MI.getOpcode();
9551 if (opcode == AMDGPU::V_READLANE_B32 ||
9552 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9553 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9555
9556 if (isCopyInstr(MI)) {
9557 const MachineOperand &srcOp = MI.getOperand(1);
9558 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9559 const TargetRegisterClass *regClass =
9560 RI.getPhysRegBaseClass(srcOp.getReg());
9563 }
9565 }
9566
9567 // GMIR handling
9568 if (MI.isPreISelOpcode())
9570
9571 // Atomics are divergent because they are executed sequentially: when an
9572 // atomic operation refers to the same address in each thread, then each
9573 // thread after the first sees the value written by the previous thread as
9574 // the original value.
9575
9576 if (isAtomic(MI))
9578
9579 // Loads from the private and flat address spaces are divergent, because
9580 // threads can execute the load instruction with the same inputs and get
9581 // different results.
9582 if (isFLAT(MI) && MI.mayLoad()) {
9583 if (MI.memoperands_empty())
9584 return InstructionUniformity::NeverUniform; // conservative assumption
9585
9586 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9587 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9588 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9589 })) {
9590 // At least one MMO in a non-global address space.
9592 }
9593
9595 }
9596
9597 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9598 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9599
9600 // FIXME: It's conceptually broken to report this for an instruction, and not
9601 // a specific def operand. For inline asm in particular, there could be mixed
9602 // uniform and divergent results.
9603 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9604 const MachineOperand &SrcOp = MI.getOperand(I);
9605 if (!SrcOp.isReg())
9606 continue;
9607
9608 Register Reg = SrcOp.getReg();
9609 if (!Reg || !SrcOp.readsReg())
9610 continue;
9611
9612 // If RegBank is null, this is unassigned or an unallocatable special
9613 // register, which are all scalars.
9614 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9615 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9617 }
9618
9619 // TODO: Uniformity check conditions above can be rearranged for more
9620 // readability
9621
9622 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9623 // currently turned into no-op COPYs by SelectionDAG ISel and are
9624 // therefore no longer recognizable.
9625
9627}
9628
9630 switch (MF.getFunction().getCallingConv()) {
9632 return 1;
9634 return 2;
9636 return 3;
9640 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9643 case CallingConv::C:
9644 case CallingConv::Fast:
9645 default:
9646 // Assume other calling conventions are various compute callable functions
9647 return 0;
9648 }
9649}
9650
9652 Register &SrcReg2, int64_t &CmpMask,
9653 int64_t &CmpValue) const {
9654 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9655 return false;
9656
9657 switch (MI.getOpcode()) {
9658 default:
9659 break;
9660 case AMDGPU::S_CMP_EQ_U32:
9661 case AMDGPU::S_CMP_EQ_I32:
9662 case AMDGPU::S_CMP_LG_U32:
9663 case AMDGPU::S_CMP_LG_I32:
9664 case AMDGPU::S_CMP_LT_U32:
9665 case AMDGPU::S_CMP_LT_I32:
9666 case AMDGPU::S_CMP_GT_U32:
9667 case AMDGPU::S_CMP_GT_I32:
9668 case AMDGPU::S_CMP_LE_U32:
9669 case AMDGPU::S_CMP_LE_I32:
9670 case AMDGPU::S_CMP_GE_U32:
9671 case AMDGPU::S_CMP_GE_I32:
9672 case AMDGPU::S_CMP_EQ_U64:
9673 case AMDGPU::S_CMP_LG_U64:
9674 SrcReg = MI.getOperand(0).getReg();
9675 if (MI.getOperand(1).isReg()) {
9676 if (MI.getOperand(1).getSubReg())
9677 return false;
9678 SrcReg2 = MI.getOperand(1).getReg();
9679 CmpValue = 0;
9680 } else if (MI.getOperand(1).isImm()) {
9681 SrcReg2 = Register();
9682 CmpValue = MI.getOperand(1).getImm();
9683 } else {
9684 return false;
9685 }
9686 CmpMask = ~0;
9687 return true;
9688 case AMDGPU::S_CMPK_EQ_U32:
9689 case AMDGPU::S_CMPK_EQ_I32:
9690 case AMDGPU::S_CMPK_LG_U32:
9691 case AMDGPU::S_CMPK_LG_I32:
9692 case AMDGPU::S_CMPK_LT_U32:
9693 case AMDGPU::S_CMPK_LT_I32:
9694 case AMDGPU::S_CMPK_GT_U32:
9695 case AMDGPU::S_CMPK_GT_I32:
9696 case AMDGPU::S_CMPK_LE_U32:
9697 case AMDGPU::S_CMPK_LE_I32:
9698 case AMDGPU::S_CMPK_GE_U32:
9699 case AMDGPU::S_CMPK_GE_I32:
9700 SrcReg = MI.getOperand(0).getReg();
9701 SrcReg2 = Register();
9702 CmpValue = MI.getOperand(1).getImm();
9703 CmpMask = ~0;
9704 return true;
9705 }
9706
9707 return false;
9708}
9709
9711 Register SrcReg2, int64_t CmpMask,
9712 int64_t CmpValue,
9713 const MachineRegisterInfo *MRI) const {
9714 if (!SrcReg || SrcReg.isPhysical())
9715 return false;
9716
9717 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9718 return false;
9719
9720 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9721 this](int64_t ExpectedValue, unsigned SrcSize,
9722 bool IsReversible, bool IsSigned) -> bool {
9723 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9724 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9725 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9726 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9727 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9728 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9729 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9730 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9731 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9732 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9733 //
9734 // Signed ge/gt are not used for the sign bit.
9735 //
9736 // If result of the AND is unused except in the compare:
9737 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9738 //
9739 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9740 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9741 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9742 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9743 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9744 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9745
9746 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9747 if (!Def || Def->getParent() != CmpInstr.getParent())
9748 return false;
9749
9750 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9751 Def->getOpcode() != AMDGPU::S_AND_B64)
9752 return false;
9753
9754 int64_t Mask;
9755 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9756 if (MO->isImm())
9757 Mask = MO->getImm();
9758 else if (!getFoldableImm(MO, Mask))
9759 return false;
9760 Mask &= maxUIntN(SrcSize);
9761 return isPowerOf2_64(Mask);
9762 };
9763
9764 MachineOperand *SrcOp = &Def->getOperand(1);
9765 if (isMask(SrcOp))
9766 SrcOp = &Def->getOperand(2);
9767 else if (isMask(&Def->getOperand(2)))
9768 SrcOp = &Def->getOperand(1);
9769 else
9770 return false;
9771
9772 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9773 if (IsSigned && BitNo == SrcSize - 1)
9774 return false;
9775
9776 ExpectedValue <<= BitNo;
9777
9778 bool IsReversedCC = false;
9779 if (CmpValue != ExpectedValue) {
9780 if (!IsReversible)
9781 return false;
9782 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9783 if (!IsReversedCC)
9784 return false;
9785 }
9786
9787 Register DefReg = Def->getOperand(0).getReg();
9788 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9789 return false;
9790
9791 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9792 I != E; ++I) {
9793 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9794 I->killsRegister(AMDGPU::SCC, &RI))
9795 return false;
9796 }
9797
9798 MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC);
9799 SccDef->setIsDead(false);
9800 CmpInstr.eraseFromParent();
9801
9802 if (!MRI->use_nodbg_empty(DefReg)) {
9803 assert(!IsReversedCC);
9804 return true;
9805 }
9806
9807 // Replace the AND, whose result is unused, with an S_BITCMP.
9808 MachineBasicBlock *MBB = Def->getParent();
9809
9810 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9811 : AMDGPU::S_BITCMP1_B32
9812 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9813 : AMDGPU::S_BITCMP1_B64;
9814
9815 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9816 .add(*SrcOp)
9817 .addImm(BitNo);
9818 Def->eraseFromParent();
9819
9820 return true;
9821 };
9822
9823 switch (CmpInstr.getOpcode()) {
9824 default:
9825 break;
9826 case AMDGPU::S_CMP_EQ_U32:
9827 case AMDGPU::S_CMP_EQ_I32:
9828 case AMDGPU::S_CMPK_EQ_U32:
9829 case AMDGPU::S_CMPK_EQ_I32:
9830 return optimizeCmpAnd(1, 32, true, false);
9831 case AMDGPU::S_CMP_GE_U32:
9832 case AMDGPU::S_CMPK_GE_U32:
9833 return optimizeCmpAnd(1, 32, false, false);
9834 case AMDGPU::S_CMP_GE_I32:
9835 case AMDGPU::S_CMPK_GE_I32:
9836 return optimizeCmpAnd(1, 32, false, true);
9837 case AMDGPU::S_CMP_EQ_U64:
9838 return optimizeCmpAnd(1, 64, true, false);
9839 case AMDGPU::S_CMP_LG_U32:
9840 case AMDGPU::S_CMP_LG_I32:
9841 case AMDGPU::S_CMPK_LG_U32:
9842 case AMDGPU::S_CMPK_LG_I32:
9843 return optimizeCmpAnd(0, 32, true, false);
9844 case AMDGPU::S_CMP_GT_U32:
9845 case AMDGPU::S_CMPK_GT_U32:
9846 return optimizeCmpAnd(0, 32, false, false);
9847 case AMDGPU::S_CMP_GT_I32:
9848 case AMDGPU::S_CMPK_GT_I32:
9849 return optimizeCmpAnd(0, 32, false, true);
9850 case AMDGPU::S_CMP_LG_U64:
9851 return optimizeCmpAnd(0, 64, true, false);
9852 }
9853
9854 return false;
9855}
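To make the pattern list above concrete, here is a hedged standalone sketch of just the decision logic: given that the compared value came from (src & Mask) with a single-bit Mask, determine which bit is tested and whether the compare is the reversed form that maps to s_bitcmp0 rather than s_bitcmp1. The names and the std::optional interface are illustrative only.

#include <cstdint>
#include <cstdio>
#include <optional>

struct BitTest {
  unsigned BitNo;   // which bit of the AND source is being tested
  bool TestForZero; // true -> s_bitcmp0-style test, false -> s_bitcmp1-style
};

// ExpectedValue is 1 for the EQ-style compares and 0 for the LG/GT-style
// compares (before shifting by the bit position), as in optimizeCmpAnd above.
static std::optional<BitTest> classify(uint64_t Mask, int64_t CmpValue,
                                       int64_t ExpectedValue, bool IsReversible) {
  if (Mask == 0 || (Mask & (Mask - 1)) != 0)
    return std::nullopt; // not a single-bit mask
  unsigned BitNo = 0;
  for (uint64_t M = Mask; (M & 1) == 0; M >>= 1)
    ++BitNo;
  ExpectedValue <<= BitNo;

  bool Reversed = false;
  if (CmpValue != ExpectedValue) {
    if (!IsReversible || CmpValue != (ExpectedValue ^ (int64_t)Mask))
      return std::nullopt;
    Reversed = true;
  }
  return BitTest{BitNo, /*TestForZero=*/Reversed};
}

int main() {
  // s_cmp_eq_u32 (s_and_b32 x, 0x10), 0x10 -> test bit 4 set (s_bitcmp1).
  if (auto T = classify(0x10, 0x10, /*ExpectedValue=*/1, /*IsReversible=*/true))
    std::printf("bit %u, %s\n", T->BitNo, T->TestForZero ? "bitcmp0" : "bitcmp1");
  // s_cmp_eq_u32 (s_and_b32 x, 0x10), 0 -> reversed: test bit 4 clear (s_bitcmp0).
  if (auto T = classify(0x10, 0x0, /*ExpectedValue=*/1, /*IsReversible=*/true))
    std::printf("bit %u, %s\n", T->BitNo, T->TestForZero ? "bitcmp0" : "bitcmp1");
  return 0;
}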
9856
9858 unsigned OpName) const {
9859 if (!ST.needsAlignedVGPRs())
9860 return;
9861
9862 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
9863 if (OpNo < 0)
9864 return;
9865 MachineOperand &Op = MI.getOperand(OpNo);
9866 if (getOpSize(MI, OpNo) > 4)
9867 return;
9868
9869 // Add implicit aligned super-reg to force alignment on the data operand.
9870 const DebugLoc &DL = MI.getDebugLoc();
9871 MachineBasicBlock *BB = MI.getParent();
9873 Register DataReg = Op.getReg();
9874 bool IsAGPR = RI.isAGPR(MRI, DataReg);
9875 Register Undef = MRI.createVirtualRegister(
9876 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
9877 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
9878 Register NewVR =
9879 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
9880 : &AMDGPU::VReg_64_Align2RegClass);
9881 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
9882 .addReg(DataReg, 0, Op.getSubReg())
9883 .addImm(AMDGPU::sub0)
9884 .addReg(Undef)
9885 .addImm(AMDGPU::sub1);
9886 Op.setReg(NewVR);
9887 Op.setSubReg(AMDGPU::sub0);
9888 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
9889}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
amdgpu AMDGPU Register Bank Select
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:85
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:76
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:76
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:262
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:342
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:731
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:735
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:944
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:376
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:610
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:751
bool hasMAIInsts() const
Definition: GCNSubtarget.h:801
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasPkMovB32() const
Definition: GCNSubtarget.h:993
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:263
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:283
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:747
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:666
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:739
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:329
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
Generation getGeneration() const
Definition: GCNSubtarget.h:302
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:871
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:718
bool hasAddr64() const
Definition: GCNSubtarget.h:366
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:710
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all of the successor blocks of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:541
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:611
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:621
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:194
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:397
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB)
addNewBlock - Add a new node to the dominator tree information.
bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
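A small sketch of the CreateMachineBasicBlock/insert pair in the shape used when a pass has to split control flow (for example when expanding a long branch); MF and MBB are assumed to be in scope.
// Sketch; assumes the relevant LLVM headers and 'using namespace llvm;'.
// Create an empty block, place it right after MBB, and wire it up as a
// successor so later code can move instructions into it.
MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
MF.insert(std::next(MBB.getIterator()), NewBB);
MBB.addSuccessor(NewBB);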
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
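The builder methods above are normally used as one chained expression; a hedged sketch, assuming TII, MBB, an iterator I, a DebugLoc DL, and registers DstReg/SrcReg are already available.
// Sketch; assumes the relevant LLVM headers and 'using namespace llvm;'.
// Emit DstReg = S_ADD_U32 SrcReg, 16 before I, using the chained builder.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), DstReg)
    .addReg(SrcReg)
    .addImm(16);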
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:544
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:327
int findRegisterDefOperandIdx(Register Reg, bool isDead=false, bool Overlap=false, const TargetRegisterInfo *TRI=nullptr) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:547
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:666
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:790
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:775
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:757
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:473
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:674
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:554
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:372
bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound=false)
We have determined MI defined a register without a use.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
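A hedged sketch of ChangeToImmediate in the style of immediate folding: if a use operand still refers to Reg, rewrite it in place to the known constant ImmVal. The helper name and its parameters are assumptions of the example.
// Hypothetical helper; assumes the relevant LLVM headers and 'using namespace llvm;'.
static void foldKnownImm(MachineInstr &UseMI, unsigned UseOpIdx,
                         Register Reg, int64_t ImmVal) {
  MachineOperand &UseMO = UseMI.getOperand(UseOpIdx);
  if (UseMO.isReg() && UseMO.getReg() == Reg)
    UseMO.ChangeToImmediate(ImmVal); // No longer a register use afterwards.
}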
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return whether a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the beginning of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
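A hedged sketch of scavengeRegisterBackwards for finding a temporary SGPR pair without spilling, roughly the pattern a long-branch expansion needs; RS, MBB and the iterator I are assumed.
// Sketch; assumes the relevant LLVM headers and 'using namespace llvm;'.
RS.enterBasicBlockEnd(MBB); // Track liveness backwards from the block's end.
Register Tmp = RS.scavengeRegisterBackwards(AMDGPU::SReg_64RegClass, I,
                                            /*RestoreAfter=*/false,
                                            /*SPAdj=*/0,
                                            /*AllowSpill=*/false);
if (!Tmp.isValid())
  report_fatal_error("no free SGPR pair available");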
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
bool isNonUniformBranchInstr(MachineInstr &Instr) const
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:924
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo for the given instruction opcode.
Definition: SIInstrInfo.h:1094
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1213
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
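A hedged sketch of splitFlatOffset: split a global-address offset into the part that fits the FLAT immediate field and the remainder that must be materialized in a register. TII and COffsetVal are assumed; SIInstrFlags::FlatGlobal is the usual variant tag for global FLAT encodings.
// Sketch; assumes the relevant LLVM headers and 'using namespace llvm;'.
std::pair<int64_t, int64_t> Split = TII->splitFlatOffset(
    COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
int64_t ImmField = Split.first;   // Goes into the instruction's offset field.
int64_t Remainder = Split.second; // Must be added to the address register.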
bool isSpill(uint16_t Opcode) const
Definition: SIInstrInfo.h:740
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
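A hedged sketch of getNamedOperand, which avoids hard-coding operand indices; MI and TII are assumed, and AMDGPU::OpName::offset names the usual immediate offset operand.
// Sketch; assumes the relevant LLVM headers and 'using namespace llvm;'.
const MachineOperand *Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
int64_t ByteOffset = Off ? Off->getImm() : 0; // 0 if MI has no offset operand.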
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:936
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:716
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:959
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineBasicBlock *IfEnd) const
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
Whether we must prevent this instruction from executing with EXEC = 0.
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:681
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:863
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:728
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1226
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:880
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were used as the OpIdx operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:66
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:68
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:240
SlotIndexes pass.
Definition: SlotIndexes.h:300
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:523
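A hedged sketch of why restoreExec and insertScratchExecCopy take an optional SlotIndexes pointer: any instruction created while the analysis is live must be registered with it. Indexes and NewMI are assumed.
// Sketch; assumes the relevant LLVM headers and 'using namespace llvm;'.
if (Indexes)
  Indexes->insertMachineInstrInMaps(*NewMI); // Keep slot indexes consistent.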
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1504
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1505
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1507
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isHi(unsigned Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:409
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:411
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:408
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:410
@ TI_CONSTDATA_START
Definition: AMDGPU.h:407
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1506
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1395
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:665
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:269
int countr_zero(T Val)
Count the number of 0's from the least significant bit upward, stopping at the first 1.
Definition: bit.h:215
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:136
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:141
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:233
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1888
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:428
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:201
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:216
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:85
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
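A hedged sketch of getFixedStack together with getMachineMemOperand, the usual way a spill's memory operand is described; MF and FrameIndex are assumed.
// Sketch; assumes the relevant LLVM headers and 'using namespace llvm;'.
MachineFrameInfo &MFI = MF.getFrameInfo();
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
    MFI.getObjectAlign(FrameIndex));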
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:49
MachineInstr * top() const
Definition: SIInstrInfo.h:54
bool empty() const
Definition: SIInstrInfo.h:64
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:73
void insert(MachineInstr *MI)
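A hedged sketch of how the worklist is typically seeded and drained through moveToVALU; TII, MDT, and a scalar instruction MI that must become vector are assumed.
// Sketch; assumes the relevant LLVM headers and 'using namespace llvm;'.
SIInstrWorklist Worklist;
Worklist.insert(&MI);           // Seed with the instruction to rewrite.
TII->moveToVALU(Worklist, MDT); // Rewrites MI and anything it uncovers.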
A pair composed of a register and a sub-register index.