SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm::AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48} // namespace llvm::AMDGPU
49
50// Must be at least 4 to be able to branch over minimum unconditional branch
51// code. This is only for making it possible to write reasonably small tests for
52// long branches.
53static cl::opt<unsigned>
54BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
56
58 "amdgpu-fix-16-bit-physreg-copies",
59 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60 cl::init(true),
62
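// Typical usage of the knob above in lit tests is to shrink the reachable
// branch range so a small input already triggers the long-branch expansion,
// e.g. (an illustrative invocation; the exact test setup varies):
//   llc -mtriple=amdgcn -amdgpu-s-branch-bits=5 -o - %s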
63SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(&ST);
67}
68
69//===----------------------------------------------------------------------===//
70// TargetInstrInfo callbacks
71//===----------------------------------------------------------------------===//
72
73static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
78}
79
80/// Returns true if both nodes have the same value for the given
81/// operand \p Op, or if both nodes do not have this operand.
82static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
85
86 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
87 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
88
89 if (Op0Idx == -1 && Op1Idx == -1)
90 return true;
91
92
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
95 return false;
96
97 // getNamedOperandIdx returns the index for the MachineInstr's operands,
98 // which includes the result as the first operand. We are indexing into the
99 // MachineSDNode's operands, so we need to skip the result operand to get
100 // the real index.
101 --Op0Idx;
102 --Op1Idx;
103
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
105}
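// Illustration of the adjustment above (hypothetical operand layout): for a
// load whose MachineInstr operands are (vdst, addr, offset), getNamedOperandIdx
// reports offset at index 2, but the MachineSDNode carries vdst as a result
// value rather than an operand, so the same offset lives at index 1 there.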
106
107static bool canRemat(const MachineInstr &MI) {
108
109 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
110 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
111 SIInstrInfo::isSALU(MI))
112 return true;
113
114 if (SIInstrInfo::isSMRD(MI)) {
115 return !MI.memoperands_empty() &&
116 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
117 return MMO->isLoad() && MMO->isInvariant();
118 });
119 }
120
121 return false;
122}
123
124bool SIInstrInfo::isReallyTriviallyReMaterializable(
125 const MachineInstr &MI) const {
126
127 if (canRemat(MI)) {
128 // Normally a VALU use of exec would block rematerialization, but an
129 // implicit exec read is OK in this case, since all VALU instructions have one.
130 // We really want all of the generic logic here except for this one check.
131
132 // Another potential implicit use is mode register. The core logic of
133 // the RA will not attempt rematerialization if mode is set anywhere
134 // in the function, otherwise it is safe since mode is not changed.
135
136 // This differs from the generic method, which does not allow
137 // rematerialization if there are virtual register uses. We allow those,
138 // which is why this method also covers SOP instructions.
139 if (!MI.hasImplicitDef() &&
140 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
141 !MI.mayRaiseFPException())
142 return true;
143 }
144
145 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
146}
147
148// Returns true if the scalar result of a VALU instruction depends on exec.
149static bool resultDependsOnExec(const MachineInstr &MI) {
150 // Ignore comparisons which are only used masked with exec.
151 // This allows some hoisting/sinking of VALU comparisons.
152 if (MI.isCompare()) {
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
154 Register DstReg = MI.getOperand(0).getReg();
155 if (!DstReg.isVirtual())
156 return true;
157 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
158 switch (Use.getOpcode()) {
159 case AMDGPU::S_AND_SAVEEXEC_B32:
160 case AMDGPU::S_AND_SAVEEXEC_B64:
161 break;
162 case AMDGPU::S_AND_B32:
163 case AMDGPU::S_AND_B64:
164 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
165 return true;
166 break;
167 default:
168 return true;
169 }
170 }
171 return false;
172 }
173
174 switch (MI.getOpcode()) {
175 default:
176 break;
177 case AMDGPU::V_READFIRSTLANE_B32:
178 return true;
179 }
180
181 return false;
182}
183
184bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185 // Any implicit use of exec by VALU is not a real register read.
186 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
188}
189
190bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
191 MachineBasicBlock *SuccToSinkTo,
192 MachineCycleInfo *CI) const {
193 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
194 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
195 return true;
196
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
198 // Check if sinking of MI would create temporal divergent use.
199 for (auto Op : MI.uses()) {
200 if (Op.isReg() && Op.getReg().isVirtual() &&
201 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
202 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
203
204 // SgprDef defined inside cycle
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
206 if (FromCycle == nullptr)
207 continue;
208
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
210 // Check if there is a FromCycle that contains SgprDef's basic block but
211 // does not contain SuccToSinkTo and also has divergent exit condition.
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
213 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
214 FromCycle->getExitingBlocks(ExitingBlocks);
215
216 // FromCycle has divergent exit condition.
217 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
218 if (hasDivergentBranch(ExitingBlock))
219 return false;
220 }
221
222 FromCycle = FromCycle->getParentCycle();
223 }
224 }
225 }
226
227 return true;
228}
229
230bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
231 int64_t &Offset0,
232 int64_t &Offset1) const {
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
234 return false;
235
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
238
239 // Make sure both are actually loads.
240 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
241 return false;
242
243 // A mayLoad instruction without a def is not a load. Likely a prefetch.
244 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
245 return false;
246
247 if (isDS(Opc0) && isDS(Opc1)) {
248
249 // FIXME: Handle this case:
250 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
251 return false;
252
253 // Check base reg.
254 if (Load0->getOperand(0) != Load1->getOperand(0))
255 return false;
256
257 // Skip read2 / write2 variants for simplicity.
258 // TODO: We should report true if the used offsets are adjacent (excluding
259 // the st64 versions).
260 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
261 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
262 if (Offset0Idx == -1 || Offset1Idx == -1)
263 return false;
264
265 // XXX - be careful of dataless loads
266 // getNamedOperandIdx returns the index for MachineInstrs. Since they
267 // include the output in the operand list, but SDNodes don't, we need to
268 // subtract the index by one.
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
273 return true;
274 }
275
276 if (isSMRD(Opc0) && isSMRD(Opc1)) {
277 // Skip time and cache invalidation instructions.
278 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
279 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
280 return false;
281
282 unsigned NumOps = getNumOperandsNoGlue(Load0);
283 if (NumOps != getNumOperandsNoGlue(Load1))
284 return false;
285
286 // Check base reg.
287 if (Load0->getOperand(0) != Load1->getOperand(0))
288 return false;
289
290 // Match register offsets, if both register and immediate offsets present.
291 assert(NumOps == 4 || NumOps == 5);
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
293 return false;
294
295 const ConstantSDNode *Load0Offset =
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
297 const ConstantSDNode *Load1Offset =
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
299
300 if (!Load0Offset || !Load1Offset)
301 return false;
302
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
305 return true;
306 }
307
308 // MUBUF and MTBUF can access the same addresses.
309 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
310
311 // MUBUF and MTBUF have vaddr at different indices.
312 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
313 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
314 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
315 return false;
316
317 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
318 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
319
320 if (OffIdx0 == -1 || OffIdx1 == -1)
321 return false;
322
323 // getNamedOperandIdx returns the index for MachineInstrs. Since they
324 // include the output in the operand list, but SDNodes don't, we need to
325 // subtract the index by one.
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
328
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
331
332 // The offset might be a FrameIndexSDNode.
333 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
334 return false;
335
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
338 return true;
339 }
340
341 return false;
342}
343
344static bool isStride64(unsigned Opc) {
345 switch (Opc) {
346 case AMDGPU::DS_READ2ST64_B32:
347 case AMDGPU::DS_READ2ST64_B64:
348 case AMDGPU::DS_WRITE2ST64_B32:
349 case AMDGPU::DS_WRITE2ST64_B64:
350 return true;
351 default:
352 return false;
353 }
354}
355
356bool SIInstrInfo::getMemOperandsWithOffsetWidth(
357 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
358 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
359 const TargetRegisterInfo *TRI) const {
360 if (!LdSt.mayLoadOrStore())
361 return false;
362
363 unsigned Opc = LdSt.getOpcode();
364 OffsetIsScalable = false;
365 const MachineOperand *BaseOp, *OffsetOp;
366 int DataOpIdx;
367
368 if (isDS(LdSt)) {
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
370 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
371 if (OffsetOp) {
372 // Normal, single offset LDS instruction.
373 if (!BaseOp) {
374 // DS_CONSUME/DS_APPEND use M0 for the base address.
375 // TODO: find the implicit use operand for M0 and use that as BaseOp?
376 return false;
377 }
378 BaseOps.push_back(BaseOp);
379 Offset = OffsetOp->getImm();
380 // Get appropriate operand, and compute width accordingly.
381 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
382 if (DataOpIdx == -1)
383 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
384 Width = getOpSize(LdSt, DataOpIdx);
385 } else {
386 // The 2 offset instructions use offset0 and offset1 instead. We can treat
387 // these as a load with a single offset if the 2 offsets are consecutive.
388 // We will use this for some partially aligned loads.
389 const MachineOperand *Offset0Op =
390 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
391 const MachineOperand *Offset1Op =
392 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
393
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
396 if (Offset0 + 1 != Offset1)
397 return false;
398
399 // Each of these offsets is in element sized units, so we need to convert
400 // to bytes of the individual reads.
401
402 unsigned EltSize;
403 if (LdSt.mayLoad())
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
405 else {
406 assert(LdSt.mayStore());
407 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
409 }
410
411 if (isStride64(Opc))
412 EltSize *= 64;
413
414 BaseOps.push_back(BaseOp);
415 Offset = EltSize * Offset0;
416 // Get appropriate operand(s), and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
418 if (DataOpIdx == -1) {
419 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 Width = getOpSize(LdSt, DataOpIdx);
421 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
422 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
423 } else {
424 Width = getOpSize(LdSt, DataOpIdx);
425 }
426 }
427 return true;
428 }
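// Worked example for the two-offset path above (a sketch): a 32-bit read2 with
// offset0 = 4 and offset1 = 5 has EltSize = 64 / 16 = 4 bytes, so the combined
// access is reported as Offset = 4 * 4 = 16 bytes with an 8-byte width; an
// st64 variant would scale EltSize by 64, giving Offset = 1024 bytes.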
429
430 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
431 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
432 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
433 return false;
434 BaseOps.push_back(RSrc);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
436 if (BaseOp && !BaseOp->isFI())
437 BaseOps.push_back(BaseOp);
438 const MachineOperand *OffsetImm =
439 getNamedOperand(LdSt, AMDGPU::OpName::offset);
440 Offset = OffsetImm->getImm();
441 const MachineOperand *SOffset =
442 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
443 if (SOffset) {
444 if (SOffset->isReg())
445 BaseOps.push_back(SOffset);
446 else
447 Offset += SOffset->getImm();
448 }
449 // Get appropriate operand, and compute width accordingly.
450 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
451 if (DataOpIdx == -1)
452 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
453 if (DataOpIdx == -1) // LDS DMA
454 return false;
455 Width = getOpSize(LdSt, DataOpIdx);
456 return true;
457 }
458
459 if (isImage(LdSt)) {
460 auto RsrcOpName =
461 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
462 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
463 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
464 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
465 if (VAddr0Idx >= 0) {
466 // GFX10 possible NSA encoding.
467 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
468 BaseOps.push_back(&LdSt.getOperand(I));
469 } else {
470 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
471 }
472 Offset = 0;
473 // Get appropriate operand, and compute width accordingly.
474 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
475 if (DataOpIdx == -1)
476 return false; // no return sampler
477 Width = getOpSize(LdSt, DataOpIdx);
478 return true;
479 }
480
481 if (isSMRD(LdSt)) {
482 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
483 if (!BaseOp) // e.g. S_MEMTIME
484 return false;
485 BaseOps.push_back(BaseOp);
486 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
487 Offset = OffsetOp ? OffsetOp->getImm() : 0;
488 // Get appropriate operand, and compute width accordingly.
489 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
490 if (DataOpIdx == -1)
491 return false;
492 Width = getOpSize(LdSt, DataOpIdx);
493 return true;
494 }
495
496 if (isFLAT(LdSt)) {
497 // Instructions have either vaddr or saddr or both or none.
498 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
499 if (BaseOp)
500 BaseOps.push_back(BaseOp);
501 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
502 if (BaseOp)
503 BaseOps.push_back(BaseOp);
504 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
507 if (DataOpIdx == -1)
508 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
509 if (DataOpIdx == -1) // LDS DMA
510 return false;
511 Width = getOpSize(LdSt, DataOpIdx);
512 return true;
513 }
514
515 return false;
516}
517
518static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
519 ArrayRef<const MachineOperand *> BaseOps1,
520 const MachineInstr &MI2,
521 ArrayRef<const MachineOperand *> BaseOps2) {
522 // Only examine the first "base" operand of each instruction, on the
523 // assumption that it represents the real base address of the memory access.
524 // Other operands are typically offsets or indices from this base address.
525 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
526 return true;
527
528 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
529 return false;
530
531 auto MO1 = *MI1.memoperands_begin();
532 auto MO2 = *MI2.memoperands_begin();
533 if (MO1->getAddrSpace() != MO2->getAddrSpace())
534 return false;
535
536 auto Base1 = MO1->getValue();
537 auto Base2 = MO2->getValue();
538 if (!Base1 || !Base2)
539 return false;
540 Base1 = getUnderlyingObject(Base1);
541 Base2 = getUnderlyingObject(Base2);
542
543 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
544 return false;
545
546 return Base1 == Base2;
547}
548
549bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
550 int64_t Offset1, bool OffsetIsScalable1,
551 ArrayRef<const MachineOperand *> BaseOps2,
552 int64_t Offset2, bool OffsetIsScalable2,
553 unsigned ClusterSize,
554 unsigned NumBytes) const {
555 // If the mem ops (to be clustered) do not have the same base ptr, then they
556 // should not be clustered
557 if (!BaseOps1.empty() && !BaseOps2.empty()) {
558 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
559 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
560 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
561 return false;
562 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
563 // If only one base op is empty, they do not have the same base ptr
564 return false;
565 }
566
567 // In order to avoid register pressure, on average, the number of DWORDS
568 // loaded together by all clustered mem ops should not exceed 8. This is an
569 // empirical value based on certain observations and performance related
570 // experiments.
571 // The good thing about this heuristic is - it avoids clustering of too many
572 // sub-word loads, and also avoids clustering of wide loads. Below is the
573 // brief summary of how the heuristic behaves for various `LoadSize`.
574 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
575 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
576 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
577 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
578 // (5) LoadSize >= 17: do not cluster
579 const unsigned LoadSize = NumBytes / ClusterSize;
580 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
581 return NumDWORDs <= 8;
582}
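// Worked example of the DWORD budget above: clustering four 6-byte loads gives
// LoadSize = 24 / 4 = 6 and NumDWORDs = ((6 + 3) / 4) * 4 = 8, which is still
// accepted, while three 12-byte loads give NumDWORDs = ((12 + 3) / 4) * 3 = 9
// and are rejected.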
583
584// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
585// the first 16 loads will be interleaved with the stores, and the next 16 will
586// be clustered as expected. It should really split into two batches of 16 stores.
587//
588// Loads are clustered until this returns false, rather than trying to schedule
589// groups of stores. This also means we have to deal with saying different
590// address space loads should be clustered, and ones which might cause bank
591// conflicts.
592//
593// This might be deprecated so it might not be worth that much effort to fix.
594bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
595 int64_t Offset0, int64_t Offset1,
596 unsigned NumLoads) const {
597 assert(Offset1 > Offset0 &&
598 "Second offset should be larger than first offset!");
599 // If we have less than 16 loads in a row, and the offsets are within 64
600 // bytes, then schedule together.
601
602 // A cacheline is 64 bytes (for global memory).
603 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
604}
605
606static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
607 MachineBasicBlock::iterator MI,
608 const DebugLoc &DL, MCRegister DestReg,
609 MCRegister SrcReg, bool KillSrc,
610 const char *Msg = "illegal VGPR to SGPR copy") {
611 MachineFunction *MF = MBB.getParent();
612 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
613 LLVMContext &C = MF->getFunction().getContext();
614 C.diagnose(IllegalCopy);
615
616 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
617 .addReg(SrcReg, getKillRegState(KillSrc));
618}
619
620/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
621/// possible to have a direct copy in these cases on GFX908, so an intermediate
622/// VGPR copy is required.
623static void indirectCopyToAGPR(const SIInstrInfo &TII,
624 MachineBasicBlock &MBB,
625 MachineBasicBlock::iterator MI,
626 const DebugLoc &DL, MCRegister DestReg,
627 MCRegister SrcReg, bool KillSrc,
628 RegScavenger &RS, bool RegsOverlap,
629 Register ImpDefSuperReg = Register(),
630 Register ImpUseSuperReg = Register()) {
631 assert((TII.getSubtarget().hasMAIInsts() &&
632 !TII.getSubtarget().hasGFX90AInsts()) &&
633 "Expected GFX908 subtarget.");
634
635 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
636 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
637 "Source register of the copy should be either an SGPR or an AGPR.");
638
639 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
640 "Destination register of the copy should be an AGPR.");
641
642 const SIRegisterInfo &RI = TII.getRegisterInfo();
643
644 // First try to find defining accvgpr_write to avoid temporary registers.
645 // In the case of copies of overlapping AGPRs, we conservatively do not
646 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
647 // an accvgpr_write used for this same copy due to implicit-defs
648 if (!RegsOverlap) {
649 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
650 --Def;
651
652 if (!Def->modifiesRegister(SrcReg, &RI))
653 continue;
654
655 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
656 Def->getOperand(0).getReg() != SrcReg)
657 break;
658
659 MachineOperand &DefOp = Def->getOperand(1);
660 assert(DefOp.isReg() || DefOp.isImm());
661
662 if (DefOp.isReg()) {
663 bool SafeToPropagate = true;
664 // Check that register source operand is not clobbered before MI.
665 // Immediate operands are always safe to propagate.
666 for (auto I = Def; I != MI && SafeToPropagate; ++I)
667 if (I->modifiesRegister(DefOp.getReg(), &RI))
668 SafeToPropagate = false;
669
670 if (!SafeToPropagate)
671 break;
672
673 DefOp.setIsKill(false);
674 }
675
676 MachineInstrBuilder Builder =
677 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
678 .add(DefOp);
679 if (ImpDefSuperReg)
680 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
681
682 if (ImpUseSuperReg) {
683 Builder.addReg(ImpUseSuperReg,
684 getKillRegState(KillSrc) | RegState::Implicit);
685 }
686
687 return;
688 }
689 }
690
691 RS.enterBasicBlockEnd(MBB);
692 RS.backward(std::next(MI));
693
694 // Ideally we want to have three registers for a long reg_sequence copy
695 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
696 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
697 *MBB.getParent());
698
699 // Registers in the sequence are allocated contiguously so we can just
700 // use register number to pick one of three round-robin temps.
701 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
702 Register Tmp =
703 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
705 "VGPR used for an intermediate copy should have been reserved.");
706
707 // Only loop through if there are any free registers left. We don't want to
708 // spill.
709 while (RegNo--) {
710 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
711 /* RestoreAfter */ false, 0,
712 /* AllowSpill */ false);
713 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
714 break;
715 Tmp = Tmp2;
716 RS.setRegUsed(Tmp);
717 }
718
719 // Insert copy to temporary VGPR.
720 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
721 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
722 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
723 } else {
724 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
725 }
726
727 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
728 .addReg(SrcReg, getKillRegState(KillSrc));
729 if (ImpUseSuperReg) {
730 UseBuilder.addReg(ImpUseSuperReg,
731 getKillRegState(KillSrc) | RegState::Implicit);
732 }
733
734 MachineInstrBuilder DefBuilder
735 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
736 .addReg(Tmp, RegState::Kill);
737
738 if (ImpDefSuperReg)
739 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
740}
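// For example, on GFX908 an SGPR-to-AGPR copy that cannot reuse a preceding
// accvgpr_write is expanded through an intermediate VGPR roughly as follows
// (illustrative register choices):
//   $vgpr32 = V_MOV_B32_e32 $sgpr4, implicit $exec
//   $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr32, implicit $exec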
741
742static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
743 MachineBasicBlock::iterator I, const DebugLoc &DL,
744 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
745 const TargetRegisterClass *RC, bool Forward) {
746 const SIRegisterInfo &RI = TII.getRegisterInfo();
747 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
749 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
750
751 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
752 int16_t SubIdx = BaseIndices[Idx];
753 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
754 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
755 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
756 unsigned Opcode = AMDGPU::S_MOV_B32;
757
758 // Is SGPR aligned? If so try to combine with next.
759 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
760 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
761 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
762 // Can use SGPR64 copy
763 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
764 SubIdx = RI.getSubRegFromChannel(Channel, 2);
765 DestSubReg = RI.getSubReg(DestReg, SubIdx);
766 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
767 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
768 Opcode = AMDGPU::S_MOV_B64;
769 Idx++;
770 }
771
772 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
773 .addReg(SrcSubReg)
774 .addReg(SrcReg, RegState::Implicit);
775
776 if (!FirstMI)
777 FirstMI = LastMI;
778
779 if (!Forward)
780 I--;
781 }
782
783 assert(FirstMI && LastMI);
784 if (!Forward)
785 std::swap(FirstMI, LastMI);
786
787 FirstMI->addOperand(
788 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
789
790 if (KillSrc)
791 LastMI->addRegisterKilled(SrcReg, &RI);
792}
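// For instance, copying sgpr8_sgpr9_sgpr10_sgpr11 into sgpr4_sgpr5_sgpr6_sgpr7
// can be emitted as two S_MOV_B64 instead of four S_MOV_B32, because every
// 32-bit piece of both tuples starts on an even-aligned SGPR (a sketch; the
// actual registers depend on allocation).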
793
794void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
795 MachineBasicBlock::iterator MI,
796 const DebugLoc &DL, MCRegister DestReg,
797 MCRegister SrcReg, bool KillSrc) const {
798 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
799 unsigned Size = RI.getRegSizeInBits(*RC);
800 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
801 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
802
803 // The rest of copyPhysReg assumes Src and Dst are the same size.
804 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
805 // we remove Fix16BitCopies and this code block?
806 if (Fix16BitCopies) {
807 if (((Size == 16) != (SrcSize == 16))) {
808 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
810 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
811 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
812 RegToFix = SubReg;
813
814 if (DestReg == SrcReg) {
815 // Identity copy. Insert empty bundle since ExpandPostRA expects an
816 // instruction here.
817 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
818 return;
819 }
820 RC = RI.getPhysRegBaseClass(DestReg);
821 Size = RI.getRegSizeInBits(*RC);
822 SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 SrcSize = RI.getRegSizeInBits(*SrcRC);
824 }
825 }
826
827 if (RC == &AMDGPU::VGPR_32RegClass) {
828 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
829 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
830 AMDGPU::AGPR_32RegClass.contains(SrcReg));
831 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
832 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
833 BuildMI(MBB, MI, DL, get(Opc), DestReg)
834 .addReg(SrcReg, getKillRegState(KillSrc));
835 return;
836 }
837
838 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
839 RC == &AMDGPU::SReg_32RegClass) {
840 if (SrcReg == AMDGPU::SCC) {
841 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
842 .addImm(1)
843 .addImm(0);
844 return;
845 }
846
847 if (DestReg == AMDGPU::VCC_LO) {
848 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
849 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
850 .addReg(SrcReg, getKillRegState(KillSrc));
851 } else {
852 // FIXME: Hack until VReg_1 removed.
853 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
854 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
855 .addImm(0)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 }
858
859 return;
860 }
861
862 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
863 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
864 return;
865 }
866
867 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
868 .addReg(SrcReg, getKillRegState(KillSrc));
869 return;
870 }
871
872 if (RC == &AMDGPU::SReg_64RegClass) {
873 if (SrcReg == AMDGPU::SCC) {
874 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
875 .addImm(1)
876 .addImm(0);
877 return;
878 }
879
880 if (DestReg == AMDGPU::VCC) {
881 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
883 .addReg(SrcReg, getKillRegState(KillSrc));
884 } else {
885 // FIXME: Hack until VReg_1 removed.
886 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
887 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
888 .addImm(0)
889 .addReg(SrcReg, getKillRegState(KillSrc));
890 }
891
892 return;
893 }
894
895 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
896 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
897 return;
898 }
899
900 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
901 .addReg(SrcReg, getKillRegState(KillSrc));
902 return;
903 }
904
905 if (DestReg == AMDGPU::SCC) {
906 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
907 // but SelectionDAG emits such copies for i1 sources.
908 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
909 // This copy can only be produced by patterns
910 // with explicit SCC, which are known to be enabled
911 // only for subtargets with S_CMP_LG_U64 present.
912 assert(ST.hasScalarCompareEq64());
913 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
914 .addReg(SrcReg, getKillRegState(KillSrc))
915 .addImm(0);
916 } else {
917 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
918 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
919 .addReg(SrcReg, getKillRegState(KillSrc))
920 .addImm(0);
921 }
922
923 return;
924 }
925
926 if (RC == &AMDGPU::AGPR_32RegClass) {
927 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
928 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
929 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
930 .addReg(SrcReg, getKillRegState(KillSrc));
931 return;
932 }
933
934 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
935 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
936 .addReg(SrcReg, getKillRegState(KillSrc));
937 return;
938 }
939
940 // FIXME: Pass should maintain scavenger to avoid scan through the block on
941 // every AGPR spill.
942 RegScavenger RS;
943 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
944 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
945 return;
946 }
947
948 if (Size == 16) {
949 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
950 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
951 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
952
953 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
954 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
955 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
956 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
957 bool DstLow = !AMDGPU::isHi(DestReg, RI);
958 bool SrcLow = !AMDGPU::isHi(SrcReg, RI);
959 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
960 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
961
962 if (IsSGPRDst) {
963 if (!IsSGPRSrc) {
964 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
965 return;
966 }
967
968 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
969 .addReg(NewSrcReg, getKillRegState(KillSrc));
970 return;
971 }
972
973 if (IsAGPRDst || IsAGPRSrc) {
974 if (!DstLow || !SrcLow) {
975 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
976 "Cannot use hi16 subreg with an AGPR!");
977 }
978
979 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
980 return;
981 }
982
983 if (ST.hasTrue16BitInsts()) {
984 if (IsSGPRSrc) {
985 assert(SrcLow);
986 SrcReg = NewSrcReg;
987 }
988 // Use the smaller instruction encoding if possible.
989 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
990 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
991 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
992 .addReg(SrcReg);
993 } else {
994 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
995 .addImm(0) // src0_modifiers
996 .addReg(SrcReg)
997 .addImm(0); // op_sel
998 }
999 return;
1000 }
1001
1002 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1003 if (!DstLow || !SrcLow) {
1004 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1005 "Cannot use hi16 subreg on VI!");
1006 }
1007
1008 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1009 .addReg(NewSrcReg, getKillRegState(KillSrc));
1010 return;
1011 }
1012
1013 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1014 .addImm(0) // src0_modifiers
1015 .addReg(NewSrcReg)
1016 .addImm(0) // clamp
1017 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1018 : AMDGPU::SDWA::SdwaSel::WORD_1)
1019 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1020 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1021 : AMDGPU::SDWA::SdwaSel::WORD_1)
1022 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1023 // First implicit operand is $exec.
1024 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1025 return;
1026 }
1027
1028 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1029 if (ST.hasMovB64()) {
1030 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1031 .addReg(SrcReg, getKillRegState(KillSrc));
1032 return;
1033 }
1034 if (ST.hasPkMovB32()) {
1035 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1036 .addImm(SISrcMods::OP_SEL_1)
1037 .addReg(SrcReg)
1038 .addImm(SISrcMods::OP_SEL_1)
1039 .addReg(SrcReg)
1040 .addImm(0) // op_sel_lo
1041 .addImm(0) // op_sel_hi
1042 .addImm(0) // neg_lo
1043 .addImm(0) // neg_hi
1044 .addImm(0) // clamp
1045 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1046 return;
1047 }
1048 }
1049
1050 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1051 if (RI.isSGPRClass(RC)) {
1052 if (!RI.isSGPRClass(SrcRC)) {
1053 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1054 return;
1055 }
1056 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1057 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1058 Forward);
1059 return;
1060 }
1061
1062 unsigned EltSize = 4;
1063 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1064 if (RI.isAGPRClass(RC)) {
1065 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1066 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1067 else if (RI.hasVGPRs(SrcRC) ||
1068 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1069 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1070 else
1071 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1072 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1073 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1074 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1075 (RI.isProperlyAlignedRC(*RC) &&
1076 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1077 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1078 if (ST.hasMovB64()) {
1079 Opcode = AMDGPU::V_MOV_B64_e32;
1080 EltSize = 8;
1081 } else if (ST.hasPkMovB32()) {
1082 Opcode = AMDGPU::V_PK_MOV_B32;
1083 EltSize = 8;
1084 }
1085 }
1086
1087 // For the cases where we need an intermediate instruction/temporary register
1088 // (destination is an AGPR), we need a scavenger.
1089 //
1090 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1091 // whole block for every handled copy.
1092 std::unique_ptr<RegScavenger> RS;
1093 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1094 RS = std::make_unique<RegScavenger>();
1095
1096 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1097
1098 // If there is an overlap, we can't kill the super-register on the last
1099 // instruction, since it will also kill the components made live by this def.
1100 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1101 const bool CanKillSuperReg = KillSrc && !Overlap;
1102
1103 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1104 unsigned SubIdx;
1105 if (Forward)
1106 SubIdx = SubIndices[Idx];
1107 else
1108 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1109 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1110 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1111 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1112
1113 bool IsFirstSubreg = Idx == 0;
1114 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1115
1116 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1117 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1118 Register ImpUseSuper = SrcReg;
1119 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1120 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1121 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1122 MachineInstrBuilder MIB =
1123 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1124 .addImm(SISrcMods::OP_SEL_1)
1125 .addReg(SrcSubReg)
1126 .addImm(SISrcMods::OP_SEL_1)
1127 .addReg(SrcSubReg)
1128 .addImm(0) // op_sel_lo
1129 .addImm(0) // op_sel_hi
1130 .addImm(0) // neg_lo
1131 .addImm(0) // neg_hi
1132 .addImm(0) // clamp
1133 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1134 if (IsFirstSubreg)
1135 MIB.addReg(DestReg, RegState::Implicit | RegState::Define);
1136 } else {
1137 MachineInstrBuilder Builder =
1138 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1139 if (IsFirstSubreg)
1140 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1141
1142 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1143 }
1144 }
1145}
1146
1147int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1148 int NewOpc;
1149
1150 // Try to map original to commuted opcode
1151 NewOpc = AMDGPU::getCommuteRev(Opcode);
1152 if (NewOpc != -1)
1153 // Check if the commuted (REV) opcode exists on the target.
1154 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1155
1156 // Try to map commuted to original opcode
1157 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1158 if (NewOpc != -1)
1159 // Check if the original (non-REV) opcode exists on the target.
1160 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1161
1162 return Opcode;
1163}
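// Illustration: the commute tables pair an opcode with its operand-swapped
// "REV" form (e.g. V_SUB_F32 <-> V_SUBREV_F32). The lookup above returns the
// partner only if it can be encoded on this subtarget, returns -1 when the
// partner is unavailable, and hands back the original opcode when no REV
// mapping exists at all.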
1164
1165void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1166 MachineBasicBlock::iterator MI,
1167 const DebugLoc &DL, Register DestReg,
1168 int64_t Value) const {
1169 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1170 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1171 if (RegClass == &AMDGPU::SReg_32RegClass ||
1172 RegClass == &AMDGPU::SGPR_32RegClass ||
1173 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1174 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1175 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1176 .addImm(Value);
1177 return;
1178 }
1179
1180 if (RegClass == &AMDGPU::SReg_64RegClass ||
1181 RegClass == &AMDGPU::SGPR_64RegClass ||
1182 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1183 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1184 .addImm(Value);
1185 return;
1186 }
1187
1188 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1189 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1190 .addImm(Value);
1191 return;
1192 }
1193 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1194 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1195 .addImm(Value);
1196 return;
1197 }
1198
1199 unsigned EltSize = 4;
1200 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1201 if (RI.isSGPRClass(RegClass)) {
1202 if (RI.getRegSizeInBits(*RegClass) > 32) {
1203 Opcode = AMDGPU::S_MOV_B64;
1204 EltSize = 8;
1205 } else {
1206 Opcode = AMDGPU::S_MOV_B32;
1207 EltSize = 4;
1208 }
1209 }
1210
1211 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1212 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1213 int64_t IdxValue = Idx == 0 ? Value : 0;
1214
1215 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1216 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1217 Builder.addImm(IdxValue);
1218 }
1219}
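// Worked example (a sketch): materializing 42 into a 128-bit SGPR tuple takes
// the loop above with S_MOV_B64 and EltSize = 8, writing 42 into sub0_sub1 and
// 0 into sub2_sub3, since only the first chunk receives the requested value.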
1220
1221const TargetRegisterClass *
1222SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1223 return &AMDGPU::VGPR_32RegClass;
1224}
1225
1226void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1227 MachineBasicBlock::iterator I,
1228 const DebugLoc &DL, Register DstReg,
1229 ArrayRef<MachineOperand> Cond,
1230 Register TrueReg,
1231 Register FalseReg) const {
1232 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1233 const TargetRegisterClass *BoolXExecRC =
1234 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1235 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1236 "Not a VGPR32 reg");
1237
1238 if (Cond.size() == 1) {
1239 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1240 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1241 .add(Cond[0]);
1242 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1243 .addImm(0)
1244 .addReg(FalseReg)
1245 .addImm(0)
1246 .addReg(TrueReg)
1247 .addReg(SReg);
1248 } else if (Cond.size() == 2) {
1249 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1250 switch (Cond[0].getImm()) {
1251 case SIInstrInfo::SCC_TRUE: {
1252 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1253 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1254 : AMDGPU::S_CSELECT_B64), SReg)
1255 .addImm(1)
1256 .addImm(0);
1257 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1258 .addImm(0)
1259 .addReg(FalseReg)
1260 .addImm(0)
1261 .addReg(TrueReg)
1262 .addReg(SReg);
1263 break;
1264 }
1265 case SIInstrInfo::SCC_FALSE: {
1266 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1267 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1268 : AMDGPU::S_CSELECT_B64), SReg)
1269 .addImm(0)
1270 .addImm(1);
1271 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1272 .addImm(0)
1273 .addReg(FalseReg)
1274 .addImm(0)
1275 .addReg(TrueReg)
1276 .addReg(SReg);
1277 break;
1278 }
1279 case SIInstrInfo::VCCNZ: {
1280 MachineOperand RegOp = Cond[1];
1281 RegOp.setImplicit(false);
1282 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1283 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1284 .add(RegOp);
1285 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1286 .addImm(0)
1287 .addReg(FalseReg)
1288 .addImm(0)
1289 .addReg(TrueReg)
1290 .addReg(SReg);
1291 break;
1292 }
1293 case SIInstrInfo::VCCZ: {
1294 MachineOperand RegOp = Cond[1];
1295 RegOp.setImplicit(false);
1296 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1297 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1298 .add(RegOp);
1299 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1300 .addImm(0)
1301 .addReg(TrueReg)
1302 .addImm(0)
1303 .addReg(FalseReg)
1304 .addReg(SReg);
1305 break;
1306 }
1307 case SIInstrInfo::EXECNZ: {
1308 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1309 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1310 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1311 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1312 .addImm(0);
1313 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1314 : AMDGPU::S_CSELECT_B64), SReg)
1315 .addImm(1)
1316 .addImm(0);
1317 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1318 .addImm(0)
1319 .addReg(FalseReg)
1320 .addImm(0)
1321 .addReg(TrueReg)
1322 .addReg(SReg);
1323 break;
1324 }
1325 case SIInstrInfo::EXECZ: {
1326 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1327 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1328 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1329 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1330 .addImm(0);
1331 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1332 : AMDGPU::S_CSELECT_B64), SReg)
1333 .addImm(0)
1334 .addImm(1);
1335 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1336 .addImm(0)
1337 .addReg(FalseReg)
1338 .addImm(0)
1339 .addReg(TrueReg)
1340 .addReg(SReg);
1341 llvm_unreachable("Unhandled branch predicate EXECZ");
1342 break;
1343 }
1344 default:
1345 llvm_unreachable("invalid branch predicate");
1346 }
1347 } else {
1348 llvm_unreachable("Can only handle Cond size 1 or 2");
1349 }
1350}
1351
1352Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1353 MachineBasicBlock::iterator I,
1354 const DebugLoc &DL,
1355 Register SrcReg, int Value) const {
1356 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1357 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1358 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1359 .addImm(Value)
1360 .addReg(SrcReg);
1361
1362 return Reg;
1363}
1364
1365Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1366 MachineBasicBlock::iterator I,
1367 const DebugLoc &DL,
1368 Register SrcReg, int Value) const {
1369 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1370 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1371 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1372 .addImm(Value)
1373 .addReg(SrcReg);
1374
1375 return Reg;
1376}
1377
1378unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1379
1380 if (RI.isAGPRClass(DstRC))
1381 return AMDGPU::COPY;
1382 if (RI.getRegSizeInBits(*DstRC) == 16) {
1383 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1384 // before RA.
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1386 }
1387 if (RI.getRegSizeInBits(*DstRC) == 32)
1388 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1389 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1390 return AMDGPU::S_MOV_B64;
1391 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1392 return AMDGPU::V_MOV_B64_PSEUDO;
1393 return AMDGPU::COPY;
1394}
1395
1396const MCInstrDesc &
1397SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1398 bool IsIndirectSrc) const {
1399 if (IsIndirectSrc) {
1400 if (VecSize <= 32) // 4 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1402 if (VecSize <= 64) // 8 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1404 if (VecSize <= 96) // 12 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1406 if (VecSize <= 128) // 16 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1408 if (VecSize <= 160) // 20 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1410 if (VecSize <= 256) // 32 bytes
1411 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1412 if (VecSize <= 288) // 36 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1414 if (VecSize <= 320) // 40 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1416 if (VecSize <= 352) // 44 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1418 if (VecSize <= 384) // 48 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1420 if (VecSize <= 512) // 64 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1422 if (VecSize <= 1024) // 128 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1424
1425 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1426 }
1427
1428 if (VecSize <= 32) // 4 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1430 if (VecSize <= 64) // 8 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1432 if (VecSize <= 96) // 12 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1434 if (VecSize <= 128) // 16 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1436 if (VecSize <= 160) // 20 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1438 if (VecSize <= 256) // 32 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1440 if (VecSize <= 288) // 36 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1442 if (VecSize <= 320) // 40 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1444 if (VecSize <= 352) // 44 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1446 if (VecSize <= 384) // 48 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1448 if (VecSize <= 512) // 64 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1450 if (VecSize <= 1024) // 128 bytes
1451 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1452
1453 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1454}
1455
1456static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1457 if (VecSize <= 32) // 4 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1459 if (VecSize <= 64) // 8 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1461 if (VecSize <= 96) // 12 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1463 if (VecSize <= 128) // 16 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1465 if (VecSize <= 160) // 20 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1467 if (VecSize <= 256) // 32 bytes
1468 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1469 if (VecSize <= 288) // 36 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1471 if (VecSize <= 320) // 40 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1473 if (VecSize <= 352) // 44 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1475 if (VecSize <= 384) // 48 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1477 if (VecSize <= 512) // 64 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1479 if (VecSize <= 1024) // 128 bytes
1480 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1481
1482 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1483}
1484
1485static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1486 if (VecSize <= 32) // 4 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1488 if (VecSize <= 64) // 8 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1490 if (VecSize <= 96) // 12 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1492 if (VecSize <= 128) // 16 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1494 if (VecSize <= 160) // 20 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1496 if (VecSize <= 256) // 32 bytes
1497 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1498 if (VecSize <= 288) // 36 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1500 if (VecSize <= 320) // 40 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1502 if (VecSize <= 352) // 44 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1504 if (VecSize <= 384) // 48 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1506 if (VecSize <= 512) // 64 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1508 if (VecSize <= 1024) // 128 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1510
1511 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1512}
1513
1514static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1515 if (VecSize <= 64) // 8 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1517 if (VecSize <= 128) // 16 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1519 if (VecSize <= 256) // 32 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1521 if (VecSize <= 512) // 64 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1523 if (VecSize <= 1024) // 128 bytes
1524 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1525
1526 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1527}
1528
1529const MCInstrDesc &
1530SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1531 bool IsSGPR) const {
1532 if (IsSGPR) {
1533 switch (EltSize) {
1534 case 32:
1535 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1536 case 64:
1537 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1538 default:
1539 llvm_unreachable("invalid reg indexing elt size");
1540 }
1541 }
1542
1543 assert(EltSize == 32 && "invalid reg indexing elt size");
1544 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1545}
1546
1547static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1548 switch (Size) {
1549 case 4:
1550 return AMDGPU::SI_SPILL_S32_SAVE;
1551 case 8:
1552 return AMDGPU::SI_SPILL_S64_SAVE;
1553 case 12:
1554 return AMDGPU::SI_SPILL_S96_SAVE;
1555 case 16:
1556 return AMDGPU::SI_SPILL_S128_SAVE;
1557 case 20:
1558 return AMDGPU::SI_SPILL_S160_SAVE;
1559 case 24:
1560 return AMDGPU::SI_SPILL_S192_SAVE;
1561 case 28:
1562 return AMDGPU::SI_SPILL_S224_SAVE;
1563 case 32:
1564 return AMDGPU::SI_SPILL_S256_SAVE;
1565 case 36:
1566 return AMDGPU::SI_SPILL_S288_SAVE;
1567 case 40:
1568 return AMDGPU::SI_SPILL_S320_SAVE;
1569 case 44:
1570 return AMDGPU::SI_SPILL_S352_SAVE;
1571 case 48:
1572 return AMDGPU::SI_SPILL_S384_SAVE;
1573 case 64:
1574 return AMDGPU::SI_SPILL_S512_SAVE;
1575 case 128:
1576 return AMDGPU::SI_SPILL_S1024_SAVE;
1577 default:
1578 llvm_unreachable("unknown register size");
1579 }
1580}
1581
1582static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1583 switch (Size) {
1584 case 4:
1585 return AMDGPU::SI_SPILL_V32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_V64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_V96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_V128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_V160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_V192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_V224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_V256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_V288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_V320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_V352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_V384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_V512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_V1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1614 }
1615}
1616
1617static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 4:
1620 return AMDGPU::SI_SPILL_A32_SAVE;
1621 case 8:
1622 return AMDGPU::SI_SPILL_A64_SAVE;
1623 case 12:
1624 return AMDGPU::SI_SPILL_A96_SAVE;
1625 case 16:
1626 return AMDGPU::SI_SPILL_A128_SAVE;
1627 case 20:
1628 return AMDGPU::SI_SPILL_A160_SAVE;
1629 case 24:
1630 return AMDGPU::SI_SPILL_A192_SAVE;
1631 case 28:
1632 return AMDGPU::SI_SPILL_A224_SAVE;
1633 case 32:
1634 return AMDGPU::SI_SPILL_A256_SAVE;
1635 case 36:
1636 return AMDGPU::SI_SPILL_A288_SAVE;
1637 case 40:
1638 return AMDGPU::SI_SPILL_A320_SAVE;
1639 case 44:
1640 return AMDGPU::SI_SPILL_A352_SAVE;
1641 case 48:
1642 return AMDGPU::SI_SPILL_A384_SAVE;
1643 case 64:
1644 return AMDGPU::SI_SPILL_A512_SAVE;
1645 case 128:
1646 return AMDGPU::SI_SPILL_A1024_SAVE;
1647 default:
1648 llvm_unreachable("unknown register size");
1649 }
1650}
1651
1652static unsigned getAVSpillSaveOpcode(unsigned Size) {
1653 switch (Size) {
1654 case 4:
1655 return AMDGPU::SI_SPILL_AV32_SAVE;
1656 case 8:
1657 return AMDGPU::SI_SPILL_AV64_SAVE;
1658 case 12:
1659 return AMDGPU::SI_SPILL_AV96_SAVE;
1660 case 16:
1661 return AMDGPU::SI_SPILL_AV128_SAVE;
1662 case 20:
1663 return AMDGPU::SI_SPILL_AV160_SAVE;
1664 case 24:
1665 return AMDGPU::SI_SPILL_AV192_SAVE;
1666 case 28:
1667 return AMDGPU::SI_SPILL_AV224_SAVE;
1668 case 32:
1669 return AMDGPU::SI_SPILL_AV256_SAVE;
1670 case 36:
1671 return AMDGPU::SI_SPILL_AV288_SAVE;
1672 case 40:
1673 return AMDGPU::SI_SPILL_AV320_SAVE;
1674 case 44:
1675 return AMDGPU::SI_SPILL_AV352_SAVE;
1676 case 48:
1677 return AMDGPU::SI_SPILL_AV384_SAVE;
1678 case 64:
1679 return AMDGPU::SI_SPILL_AV512_SAVE;
1680 case 128:
1681 return AMDGPU::SI_SPILL_AV1024_SAVE;
1682 default:
1683 llvm_unreachable("unknown register size");
1684 }
1685}
1686
1687static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1688 bool IsVectorSuperClass) {
1689 // Currently, there is only 32-bit WWM register spills needed.
1690 if (Size != 4)
1691 llvm_unreachable("unknown wwm register spill size");
1692
1693 if (IsVectorSuperClass)
1694 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1695
1696 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1697}
1698
1699static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1700 const TargetRegisterClass *RC,
1701 unsigned Size,
1702 const SIRegisterInfo &TRI,
1703 const SIMachineFunctionInfo &MFI) {
1704 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1705
1706 // Choose the right opcode if spilling a WWM register.
1707 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1708 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1709
1710 if (IsVectorSuperClass)
1711 return getAVSpillSaveOpcode(Size);
1712
1713 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1714 : getVGPRSpillSaveOpcode(Size);
1715}
1716
1717void SIInstrInfo::storeRegToStackSlot(
1718 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1719 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1720 const TargetRegisterInfo *TRI, Register VReg) const {
1721 MachineFunction *MF = MBB.getParent();
1722 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1723 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1724 const DebugLoc &DL = MBB.findDebugLoc(MI);
1725
1726 MachinePointerInfo PtrInfo
1727 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1728 MachineMemOperand *MMO = MF->getMachineMemOperand(
1729 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1730 FrameInfo.getObjectAlign(FrameIndex));
1731 unsigned SpillSize = TRI->getSpillSize(*RC);
1732
1733 MachineRegisterInfo &MRI = MF->getRegInfo();
1734 if (RI.isSGPRClass(RC)) {
1735 MFI->setHasSpilledSGPRs();
1736 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1737 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1738 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1739
1740 // We are only allowed to create one new instruction when spilling
1741 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1742 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1743
1744 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1745 // need to make sure we are using the correct register class.
1746 if (SrcReg.isVirtual() && SpillSize == 4) {
1747 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1748 }
1749
1750 BuildMI(MBB, MI, DL, OpDesc)
1751 .addReg(SrcReg, getKillRegState(isKill)) // data
1752 .addFrameIndex(FrameIndex) // addr
1753 .addMemOperand(MMO)
1754 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1755
1756 if (RI.spillSGPRToVGPR())
1757 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1758 return;
1759 }
1760
1761 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1762 SpillSize, RI, *MFI);
1763 MFI->setHasSpilledVGPRs();
1764
1765 BuildMI(MBB, MI, DL, get(Opcode))
1766 .addReg(SrcReg, getKillRegState(isKill)) // data
1767 .addFrameIndex(FrameIndex) // addr
1768 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1769 .addImm(0) // offset
1770 .addMemOperand(MMO);
1771}
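// Roughly, the two forms built above look like this (illustrative MIR, operand
// order per the builders above):
//   SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $sgpr32
//   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0
// The SGPR pseudo is typically lowered to V_WRITELANE_B32 lane writes when
// SGPRs are spilled to VGPR lanes; the vector pseudo is expanded when its
// frame index is eliminated.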
1772
1773static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1774 switch (Size) {
1775 case 4:
1776 return AMDGPU::SI_SPILL_S32_RESTORE;
1777 case 8:
1778 return AMDGPU::SI_SPILL_S64_RESTORE;
1779 case 12:
1780 return AMDGPU::SI_SPILL_S96_RESTORE;
1781 case 16:
1782 return AMDGPU::SI_SPILL_S128_RESTORE;
1783 case 20:
1784 return AMDGPU::SI_SPILL_S160_RESTORE;
1785 case 24:
1786 return AMDGPU::SI_SPILL_S192_RESTORE;
1787 case 28:
1788 return AMDGPU::SI_SPILL_S224_RESTORE;
1789 case 32:
1790 return AMDGPU::SI_SPILL_S256_RESTORE;
1791 case 36:
1792 return AMDGPU::SI_SPILL_S288_RESTORE;
1793 case 40:
1794 return AMDGPU::SI_SPILL_S320_RESTORE;
1795 case 44:
1796 return AMDGPU::SI_SPILL_S352_RESTORE;
1797 case 48:
1798 return AMDGPU::SI_SPILL_S384_RESTORE;
1799 case 64:
1800 return AMDGPU::SI_SPILL_S512_RESTORE;
1801 case 128:
1802 return AMDGPU::SI_SPILL_S1024_RESTORE;
1803 default:
1804 llvm_unreachable("unknown register size");
1805 }
1806}
1807
1808static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1809 switch (Size) {
1810 case 4:
1811 return AMDGPU::SI_SPILL_V32_RESTORE;
1812 case 8:
1813 return AMDGPU::SI_SPILL_V64_RESTORE;
1814 case 12:
1815 return AMDGPU::SI_SPILL_V96_RESTORE;
1816 case 16:
1817 return AMDGPU::SI_SPILL_V128_RESTORE;
1818 case 20:
1819 return AMDGPU::SI_SPILL_V160_RESTORE;
1820 case 24:
1821 return AMDGPU::SI_SPILL_V192_RESTORE;
1822 case 28:
1823 return AMDGPU::SI_SPILL_V224_RESTORE;
1824 case 32:
1825 return AMDGPU::SI_SPILL_V256_RESTORE;
1826 case 36:
1827 return AMDGPU::SI_SPILL_V288_RESTORE;
1828 case 40:
1829 return AMDGPU::SI_SPILL_V320_RESTORE;
1830 case 44:
1831 return AMDGPU::SI_SPILL_V352_RESTORE;
1832 case 48:
1833 return AMDGPU::SI_SPILL_V384_RESTORE;
1834 case 64:
1835 return AMDGPU::SI_SPILL_V512_RESTORE;
1836 case 128:
1837 return AMDGPU::SI_SPILL_V1024_RESTORE;
1838 default:
1839 llvm_unreachable("unknown register size");
1840 }
1841}
1842
1843static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1844 switch (Size) {
1845 case 4:
1846 return AMDGPU::SI_SPILL_A32_RESTORE;
1847 case 8:
1848 return AMDGPU::SI_SPILL_A64_RESTORE;
1849 case 12:
1850 return AMDGPU::SI_SPILL_A96_RESTORE;
1851 case 16:
1852 return AMDGPU::SI_SPILL_A128_RESTORE;
1853 case 20:
1854 return AMDGPU::SI_SPILL_A160_RESTORE;
1855 case 24:
1856 return AMDGPU::SI_SPILL_A192_RESTORE;
1857 case 28:
1858 return AMDGPU::SI_SPILL_A224_RESTORE;
1859 case 32:
1860 return AMDGPU::SI_SPILL_A256_RESTORE;
1861 case 36:
1862 return AMDGPU::SI_SPILL_A288_RESTORE;
1863 case 40:
1864 return AMDGPU::SI_SPILL_A320_RESTORE;
1865 case 44:
1866 return AMDGPU::SI_SPILL_A352_RESTORE;
1867 case 48:
1868 return AMDGPU::SI_SPILL_A384_RESTORE;
1869 case 64:
1870 return AMDGPU::SI_SPILL_A512_RESTORE;
1871 case 128:
1872 return AMDGPU::SI_SPILL_A1024_RESTORE;
1873 default:
1874 llvm_unreachable("unknown register size");
1875 }
1876}
1877
1878static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1879 switch (Size) {
1880 case 4:
1881 return AMDGPU::SI_SPILL_AV32_RESTORE;
1882 case 8:
1883 return AMDGPU::SI_SPILL_AV64_RESTORE;
1884 case 12:
1885 return AMDGPU::SI_SPILL_AV96_RESTORE;
1886 case 16:
1887 return AMDGPU::SI_SPILL_AV128_RESTORE;
1888 case 20:
1889 return AMDGPU::SI_SPILL_AV160_RESTORE;
1890 case 24:
1891 return AMDGPU::SI_SPILL_AV192_RESTORE;
1892 case 28:
1893 return AMDGPU::SI_SPILL_AV224_RESTORE;
1894 case 32:
1895 return AMDGPU::SI_SPILL_AV256_RESTORE;
1896 case 36:
1897 return AMDGPU::SI_SPILL_AV288_RESTORE;
1898 case 40:
1899 return AMDGPU::SI_SPILL_AV320_RESTORE;
1900 case 44:
1901 return AMDGPU::SI_SPILL_AV352_RESTORE;
1902 case 48:
1903 return AMDGPU::SI_SPILL_AV384_RESTORE;
1904 case 64:
1905 return AMDGPU::SI_SPILL_AV512_RESTORE;
1906 case 128:
1907 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1908 default:
1909 llvm_unreachable("unknown register size");
1910 }
1911}
1912
1913static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1914 bool IsVectorSuperClass) {
1915 // Currently, only 32-bit WWM register spills are needed.
1916 if (Size != 4)
1917 llvm_unreachable("unknown wwm register spill size");
1918
1919 if (IsVectorSuperClass)
1920 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1921
1922 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1923}
1924
1925static unsigned
1926getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1927 unsigned Size, const SIRegisterInfo &TRI,
1928 const SIMachineFunctionInfo &MFI) {
1929 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1930
1931 // Choose the right opcode if restoring a WWM register.
1932 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1933 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1934
1935 if (IsVectorSuperClass)
1936 return getAVSpillRestoreOpcode(Size);
1937
1938 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1939 : getVGPRSpillRestoreOpcode(Size);
1940}
1941
1942void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1943 MachineBasicBlock::iterator MI,
1944 Register DestReg, int FrameIndex,
1945 const TargetRegisterClass *RC,
1946 const TargetRegisterInfo *TRI,
1947 Register VReg) const {
1948 MachineFunction *MF = MBB.getParent();
1949 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1950 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1951 const DebugLoc &DL = MBB.findDebugLoc(MI);
1952 unsigned SpillSize = TRI->getSpillSize(*RC);
1953
1954 MachinePointerInfo PtrInfo
1955 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1956
1957 MachineMemOperand *MMO = MF->getMachineMemOperand(
1958 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1959 FrameInfo.getObjectAlign(FrameIndex));
1960
1961 if (RI.isSGPRClass(RC)) {
1962 MFI->setHasSpilledSGPRs();
1963 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1964 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1965 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1966
1967 // FIXME: Maybe this should not include a memoperand because it will be
1968 // lowered to non-memory instructions.
1969 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1970 if (DestReg.isVirtual() && SpillSize == 4) {
1971 MachineRegisterInfo &MRI = MF->getRegInfo();
1972 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1973 }
1974
1975 if (RI.spillSGPRToVGPR())
1976 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1977 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1978 .addFrameIndex(FrameIndex) // addr
1979 .addMemOperand(MMO)
1980 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1981
1982 return;
1983 }
1984
1985 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1986 SpillSize, RI, *MFI);
1987 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1988 .addFrameIndex(FrameIndex) // vaddr
1989 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1990 .addImm(0) // offset
1991 .addMemOperand(MMO);
1992}
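// The reload path mirrors storeRegToStackSlot: SGPR reloads use the
// SI_SPILL_S*_RESTORE pseudos (with the SGPRSpill stack ID when SGPRs are
// spilled to VGPR lanes), while vector reloads use the
// SI_SPILL_{V,A,AV}*_RESTORE pseudos with an explicit scratch offset operand.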
1993
1994void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1995 MachineBasicBlock::iterator MI) const {
1996 insertNoops(MBB, MI, 1);
1997}
1998
1999void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2000 MachineBasicBlock::iterator MI,
2001 unsigned Quantity) const {
2002 DebugLoc DL;
2003 while (Quantity > 0) {
2004 unsigned Arg = std::min(Quantity, 8u);
2005 Quantity -= Arg;
2006 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2007 }
2008}
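// For example, insertNoops(MBB, MI, 10) emits "s_nop 7" followed by "s_nop 1":
// each S_NOP provides (imm + 1) wait states, so a single instruction covers at
// most 8 of the requested wait states.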
2009
2010void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2011 auto MF = MBB.getParent();
2012 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2013
2014 assert(Info->isEntryFunction());
2015
2016 if (MBB.succ_empty()) {
2017 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2018 if (HasNoTerminator) {
2019 if (Info->returnsVoid()) {
2020 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2021 } else {
2022 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2023 }
2024 }
2025 }
2026}
2027
2028MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2029 MachineBasicBlock &MBB,
2030 MachineInstr &MI,
2031 const DebugLoc &DL) const {
2032 MachineFunction *MF = MBB.getParent();
2033 constexpr unsigned DoorbellIDMask = 0x3ff;
2034 constexpr unsigned ECQueueWaveAbort = 0x400;
2035
2036 MachineBasicBlock *TrapBB = &MBB;
2037 MachineBasicBlock *ContBB = &MBB;
2038 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2039
2040 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2041 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2042 TrapBB = MF->CreateMachineBasicBlock();
2043 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2044 MF->push_back(TrapBB);
2045 MBB.addSuccessor(TrapBB);
2046 }
2047
2048 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2049 // this will be a nop.
2050 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2051 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2052 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2053 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2054 DoorbellReg)
2055 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2056 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2057 .addUse(AMDGPU::M0);
2058 Register DoorbellRegMasked =
2059 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2060 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2061 .addUse(DoorbellReg)
2062 .addImm(DoorbellIDMask);
2063 Register SetWaveAbortBit =
2064 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2065 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2066 .addUse(DoorbellRegMasked)
2067 .addImm(ECQueueWaveAbort);
2068 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2069 .addUse(SetWaveAbortBit);
2070 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2071 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2072 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2073 .addUse(AMDGPU::TTMP2);
2074 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2075 TrapBB->addSuccessor(HaltLoopBB);
2076
2077 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2078 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2079 .addMBB(HaltLoopBB);
2080 MF->push_back(HaltLoopBB);
2081 HaltLoopBB->addSuccessor(HaltLoopBB);
2082
2083 return ContBB;
2084}
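// Shape of the code built above: the original block conditionally branches to
// TrapBB, which raises the trap, reads the queue doorbell ID, sets the
// queue-wave-abort bit, signals the interrupt message, and then branches to
// HaltLoopBB, an infinite "s_sethalt 5" loop. ContBB keeps whatever followed
// the trap instruction.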
2085
2086unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2087 switch (MI.getOpcode()) {
2088 default:
2089 if (MI.isMetaInstruction())
2090 return 0;
2091 return 1; // FIXME: Do wait states equal cycles?
2092
2093 case AMDGPU::S_NOP:
2094 return MI.getOperand(0).getImm() + 1;
2095 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2096 // hazard, even if one exists, won't really be visible. Should we handle it?
2097 }
2098}
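// For example, "s_nop 7" reports 8 wait states here; every other non-meta
// instruction is conservatively counted as a single wait state.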
2099
2100bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2101 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2102 MachineBasicBlock &MBB = *MI.getParent();
2103 DebugLoc DL = MBB.findDebugLoc(MI);
2104 switch (MI.getOpcode()) {
2105 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2106 case AMDGPU::S_MOV_B64_term:
2107 // This is only a terminator to get the correct spill code placement during
2108 // register allocation.
2109 MI.setDesc(get(AMDGPU::S_MOV_B64));
2110 break;
2111
2112 case AMDGPU::S_MOV_B32_term:
2113 // This is only a terminator to get the correct spill code placement during
2114 // register allocation.
2115 MI.setDesc(get(AMDGPU::S_MOV_B32));
2116 break;
2117
2118 case AMDGPU::S_XOR_B64_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(AMDGPU::S_XOR_B64));
2122 break;
2123
2124 case AMDGPU::S_XOR_B32_term:
2125 // This is only a terminator to get the correct spill code placement during
2126 // register allocation.
2127 MI.setDesc(get(AMDGPU::S_XOR_B32));
2128 break;
2129 case AMDGPU::S_OR_B64_term:
2130 // This is only a terminator to get the correct spill code placement during
2131 // register allocation.
2132 MI.setDesc(get(AMDGPU::S_OR_B64));
2133 break;
2134 case AMDGPU::S_OR_B32_term:
2135 // This is only a terminator to get the correct spill code placement during
2136 // register allocation.
2137 MI.setDesc(get(AMDGPU::S_OR_B32));
2138 break;
2139
2140 case AMDGPU::S_ANDN2_B64_term:
2141 // This is only a terminator to get the correct spill code placement during
2142 // register allocation.
2143 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2144 break;
2145
2146 case AMDGPU::S_ANDN2_B32_term:
2147 // This is only a terminator to get the correct spill code placement during
2148 // register allocation.
2149 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2150 break;
2151
2152 case AMDGPU::S_AND_B64_term:
2153 // This is only a terminator to get the correct spill code placement during
2154 // register allocation.
2155 MI.setDesc(get(AMDGPU::S_AND_B64));
2156 break;
2157
2158 case AMDGPU::S_AND_B32_term:
2159 // This is only a terminator to get the correct spill code placement during
2160 // register allocation.
2161 MI.setDesc(get(AMDGPU::S_AND_B32));
2162 break;
2163
2164 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2165 // This is only a terminator to get the correct spill code placement during
2166 // register allocation.
2167 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2168 break;
2169
2170 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2171 // This is only a terminator to get the correct spill code placement during
2172 // register allocation.
2173 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2174 break;
2175
2176 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2177 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2178 break;
2179
2180 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2181 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2182 break;
2183
2184 case AMDGPU::V_MOV_B64_PSEUDO: {
2185 Register Dst = MI.getOperand(0).getReg();
2186 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2187 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2188
2189 const MachineOperand &SrcOp = MI.getOperand(1);
2190 // FIXME: Will this work for 64-bit floating point immediates?
2191 assert(!SrcOp.isFPImm());
2192 if (ST.hasMovB64()) {
2193 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2194 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2195 isUInt<32>(SrcOp.getImm()))
2196 break;
2197 }
2198 if (SrcOp.isImm()) {
2199 APInt Imm(64, SrcOp.getImm());
2200 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2201 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2202 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2203 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2204 .addImm(SISrcMods::OP_SEL_1)
2205 .addImm(Lo.getSExtValue())
2206 .addImm(SISrcMods::OP_SEL_1)
2207 .addImm(Lo.getSExtValue())
2208 .addImm(0) // op_sel_lo
2209 .addImm(0) // op_sel_hi
2210 .addImm(0) // neg_lo
2211 .addImm(0) // neg_hi
2212 .addImm(0); // clamp
2213 } else {
2214 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2215 .addImm(Lo.getSExtValue())
2216 .addReg(Dst, RegState::Implicit | RegState::Define);
2217 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2218 .addImm(Hi.getSExtValue())
2219 .addReg(Dst, RegState::Implicit | RegState::Define);
2220 }
2221 } else {
2222 assert(SrcOp.isReg());
2223 if (ST.hasPkMovB32() &&
2224 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2225 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2226 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2227 .addReg(SrcOp.getReg())
2228 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2229 .addReg(SrcOp.getReg())
2230 .addImm(0) // op_sel_lo
2231 .addImm(0) // op_sel_hi
2232 .addImm(0) // neg_lo
2233 .addImm(0) // neg_hi
2234 .addImm(0); // clamp
2235 } else {
2236 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2237 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2238 .addReg(Dst, RegState::Implicit | RegState::Define);
2239 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2240 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2241 .addReg(Dst, RegState::Implicit | RegState::Define);
2242 }
2243 }
2244 MI.eraseFromParent();
2245 break;
2246 }
2247 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2248 expandMovDPP64(MI);
2249 break;
2250 }
2251 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2252 const MachineOperand &SrcOp = MI.getOperand(1);
2253 assert(!SrcOp.isFPImm());
2254 APInt Imm(64, SrcOp.getImm());
2255 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2256 MI.setDesc(get(AMDGPU::S_MOV_B64));
2257 break;
2258 }
2259
2260 Register Dst = MI.getOperand(0).getReg();
2261 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2262 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2263
2264 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2265 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2266 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2267 .addImm(Lo.getSExtValue())
2268 .addReg(Dst, RegState::Implicit | RegState::Define);
2269 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2270 .addImm(Hi.getSExtValue())
2271 .addReg(Dst, RegState::Implicit | RegState::Define);
2272 MI.eraseFromParent();
2273 break;
2274 }
2275 case AMDGPU::V_SET_INACTIVE_B32: {
2276 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2277 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2278 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
2279 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
2280 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2281 .add(MI.getOperand(1));
2282 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2283 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2284 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2285 .add(MI.getOperand(2));
2286 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2287 .addReg(Exec);
2288 MI.eraseFromParent();
2289 break;
2290 }
2291 case AMDGPU::V_SET_INACTIVE_B64: {
2292 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2293 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2294 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2295 MI.getOperand(0).getReg())
2296 .add(MI.getOperand(1));
2297 expandPostRAPseudo(*Copy);
2298 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2299 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2300 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2301 MI.getOperand(0).getReg())
2302 .add(MI.getOperand(2));
2303 expandPostRAPseudo(*Copy);
2304 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2305 .addReg(Exec);
2306 MI.eraseFromParent();
2307 break;
2308 }
2309 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2332 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2333 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2334 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2335 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2336 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2337 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2338 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2339
2340 unsigned Opc;
2341 if (RI.hasVGPRs(EltRC)) {
2342 Opc = AMDGPU::V_MOVRELD_B32_e32;
2343 } else {
2344 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2345 : AMDGPU::S_MOVRELD_B32;
2346 }
2347
2348 const MCInstrDesc &OpDesc = get(Opc);
2349 Register VecReg = MI.getOperand(0).getReg();
2350 bool IsUndef = MI.getOperand(1).isUndef();
2351 unsigned SubReg = MI.getOperand(3).getImm();
2352 assert(VecReg == MI.getOperand(1).getReg());
2353
2354 MachineInstrBuilder MIB =
2355 BuildMI(MBB, MI, DL, OpDesc)
2356 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2357 .add(MI.getOperand(2))
2358 .addReg(VecReg, RegState::ImplicitDefine)
2359 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2360
2361 const int ImpDefIdx =
2362 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2363 const int ImpUseIdx = ImpDefIdx + 1;
2364 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2365 MI.eraseFromParent();
2366 break;
2367 }
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2374 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2375 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2376 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2377 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2378 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2379 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2380 assert(ST.useVGPRIndexMode());
2381 Register VecReg = MI.getOperand(0).getReg();
2382 bool IsUndef = MI.getOperand(1).isUndef();
2383 Register Idx = MI.getOperand(3).getReg();
2384 Register SubReg = MI.getOperand(4).getImm();
2385
2386 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2387 .addReg(Idx)
2388 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2389 SetOn->getOperand(3).setIsUndef();
2390
2391 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2392 MachineInstrBuilder MIB =
2393 BuildMI(MBB, MI, DL, OpDesc)
2394 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2395 .add(MI.getOperand(2))
2396 .addReg(VecReg, RegState::ImplicitDefine)
2397 .addReg(VecReg,
2398 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2399
2400 const int ImpDefIdx =
2401 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2402 const int ImpUseIdx = ImpDefIdx + 1;
2403 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2404
2405 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2406
2407 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2408
2409 MI.eraseFromParent();
2410 break;
2411 }
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2418 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2419 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2420 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2421 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2422 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2423 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2424 assert(ST.useVGPRIndexMode());
2425 Register Dst = MI.getOperand(0).getReg();
2426 Register VecReg = MI.getOperand(1).getReg();
2427 bool IsUndef = MI.getOperand(1).isUndef();
2428 Register Idx = MI.getOperand(2).getReg();
2429 Register SubReg = MI.getOperand(3).getImm();
2430
2431 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2432 .addReg(Idx)
2433 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2434 SetOn->getOperand(3).setIsUndef();
2435
2436 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2437 .addDef(Dst)
2438 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2439 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2440
2441 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2442
2443 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2444
2445 MI.eraseFromParent();
2446 break;
2447 }
2448 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2449 MachineFunction &MF = *MBB.getParent();
2450 Register Reg = MI.getOperand(0).getReg();
2451 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2452 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2453 MachineOperand OpLo = MI.getOperand(1);
2454 MachineOperand OpHi = MI.getOperand(2);
2455
2456 // Create a bundle so these instructions won't be re-ordered by the
2457 // post-RA scheduler.
2458 MIBundleBuilder Bundler(MBB, MI);
2459 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2460
2461 // What we want here is an offset from the value returned by s_getpc (which
2462 // is the address of the s_add_u32 instruction) to the global variable, but
2463 // since the encoding of $symbol starts 4 bytes after the start of the
2464 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2465 // small. This requires us to add 4 to the global variable offset in order
2466 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2467 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2468 // instruction.
2469
2470 int64_t Adjust = 0;
2471 if (ST.hasGetPCZeroExtension()) {
2472 // Fix up hardware that does not sign-extend the 48-bit PC value by
2473 // inserting: s_sext_i32_i16 reghi, reghi
2474 Bundler.append(
2475 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2476 Adjust += 4;
2477 }
2478
2479 if (OpLo.isGlobal())
2480 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2481 Bundler.append(
2482 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2483
2484 if (OpHi.isGlobal())
2485 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2486 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2487 .addReg(RegHi)
2488 .add(OpHi));
2489
2490 finalizeBundle(MBB, Bundler.begin());
2491
2492 MI.eraseFromParent();
2493 break;
2494 }
2495 case AMDGPU::ENTER_STRICT_WWM: {
2496 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2497 // Whole Wave Mode is entered.
2498 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2499 : AMDGPU::S_OR_SAVEEXEC_B64));
2500 break;
2501 }
2502 case AMDGPU::ENTER_STRICT_WQM: {
2503 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2504 // STRICT_WQM is entered.
2505 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2506 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2507 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2508 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2509 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2510
2511 MI.eraseFromParent();
2512 break;
2513 }
2514 case AMDGPU::EXIT_STRICT_WWM:
2515 case AMDGPU::EXIT_STRICT_WQM: {
2516 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2517 // WWM/STRICT_WQM is exited.
2518 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2519 break;
2520 }
2521 case AMDGPU::SI_RETURN: {
2522 const MachineFunction *MF = MBB.getParent();
2523 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2524 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2525 // Hiding the return address use with SI_RETURN may lead to extra kills in
2526 // the function and missing live-ins. We are fine in practice because callee
2527 // saved register handling ensures the register value is restored before
2528 // RET, but we need the undef flag here to appease the MachineVerifier
2529 // liveness checks.
2530 MachineInstrBuilder MIB =
2531 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2532 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2533
2534 MIB.copyImplicitOps(MI);
2535 MI.eraseFromParent();
2536 break;
2537 }
2538
2539 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2540 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2541 MI.setDesc(get(AMDGPU::S_MUL_U64));
2542 break;
2543
2544 case AMDGPU::S_GETPC_B64_pseudo:
2545 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2546 if (ST.hasGetPCZeroExtension()) {
2547 Register Dst = MI.getOperand(0).getReg();
2548 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2549 // Fix up hardware that does not sign-extend the 48-bit PC value by
2550 // inserting: s_sext_i32_i16 dsthi, dsthi
2551 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2552 DstHi)
2553 .addReg(DstHi);
2554 }
2555 break;
2556 }
2557 return true;
2558}
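// Each case above either retargets MI in place via setDesc() or builds the
// replacement sequence and erases MI; returning true signals that the pseudo
// has been fully expanded.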
2559
2560void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2561 MachineBasicBlock::iterator I, Register DestReg,
2562 unsigned SubIdx, const MachineInstr &Orig,
2563 const TargetRegisterInfo &RI) const {
2564
2565 // Try shrinking the instruction to remat only the part needed for current
2566 // context.
2567 // TODO: Handle more cases.
2568 unsigned Opcode = Orig.getOpcode();
2569 switch (Opcode) {
2570 case AMDGPU::S_LOAD_DWORDX16_IMM:
2571 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2572 if (SubIdx != 0)
2573 break;
2574
2575 if (I == MBB.end())
2576 break;
2577
2578 if (I->isBundled())
2579 break;
2580
2581 // Look for a single use of the register that is also a subreg.
2582 Register RegToFind = Orig.getOperand(0).getReg();
2583 MachineOperand *UseMO = nullptr;
2584 for (auto &CandMO : I->operands()) {
2585 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2586 continue;
2587 if (UseMO) {
2588 UseMO = nullptr;
2589 break;
2590 }
2591 UseMO = &CandMO;
2592 }
2593 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2594 break;
2595
2596 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2597 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2598
2599 MachineFunction *MF = MBB.getParent();
2600 MachineRegisterInfo &MRI = MF->getRegInfo();
2601 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2602
2603 unsigned NewOpcode = -1;
2604 if (SubregSize == 256)
2605 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2606 else if (SubregSize == 128)
2607 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2608 else
2609 break;
2610
2611 const MCInstrDesc &TID = get(NewOpcode);
2612 const TargetRegisterClass *NewRC =
2613 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2614 MRI.setRegClass(DestReg, NewRC);
2615
2616 UseMO->setReg(DestReg);
2617 UseMO->setSubReg(AMDGPU::NoSubRegister);
2618
2619 // Use a smaller load with the desired size, possibly with updated offset.
2620 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2621 MI->setDesc(TID);
2622 MI->getOperand(0).setReg(DestReg);
2623 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2624 if (Offset) {
2625 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2626 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2627 OffsetMO->setImm(FinalOffset);
2628 }
2629 SmallVector<MachineMemOperand *> NewMMOs;
2630 for (const MachineMemOperand *MemOp : Orig.memoperands())
2631 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2632 SubregSize / 8));
2633 MI->setMemRefs(*MF, NewMMOs);
2634
2635 MBB.insert(I, MI);
2636 return;
2637 }
2638
2639 default:
2640 break;
2641 }
2642
2643 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2644}
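// Example of the shrinking rematerialization above: if the single use reads
// sub4_sub5_sub6_sub7 of an S_LOAD_DWORDX16_IMM result, the clone becomes an
// S_LOAD_DWORDX4_IMM whose immediate offset is advanced by 16 bytes (the
// 128-bit subregister offset divided by 8), so only the needed quarter of the
// original load is rematerialized.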
2645
2646std::pair<MachineInstr*, MachineInstr*>
2647SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2648 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2649
2650 if (ST.hasMovB64() &&
2651 AMDGPU::isLegal64BitDPPControl(
2652 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2653 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2654 return std::pair(&MI, nullptr);
2655 }
2656
2657 MachineBasicBlock &MBB = *MI.getParent();
2658 DebugLoc DL = MBB.findDebugLoc(MI);
2659 MachineFunction *MF = MBB.getParent();
2660 MachineRegisterInfo &MRI = MF->getRegInfo();
2661 Register Dst = MI.getOperand(0).getReg();
2662 unsigned Part = 0;
2663 MachineInstr *Split[2];
2664
2665 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2666 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2667 if (Dst.isPhysical()) {
2668 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2669 } else {
2670 assert(MRI.isSSA());
2671 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2672 MovDPP.addDef(Tmp);
2673 }
2674
2675 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2676 const MachineOperand &SrcOp = MI.getOperand(I);
2677 assert(!SrcOp.isFPImm());
2678 if (SrcOp.isImm()) {
2679 APInt Imm(64, SrcOp.getImm());
2680 Imm.ashrInPlace(Part * 32);
2681 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2682 } else {
2683 assert(SrcOp.isReg());
2684 Register Src = SrcOp.getReg();
2685 if (Src.isPhysical())
2686 MovDPP.addReg(RI.getSubReg(Src, Sub));
2687 else
2688 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2689 }
2690 }
2691
2692 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2693 MovDPP.addImm(MO.getImm());
2694
2695 Split[Part] = MovDPP;
2696 ++Part;
2697 }
2698
2699 if (Dst.isVirtual())
2700 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2701 .addReg(Split[0]->getOperand(0).getReg())
2702 .addImm(AMDGPU::sub0)
2703 .addReg(Split[1]->getOperand(0).getReg())
2704 .addImm(AMDGPU::sub1);
2705
2706 MI.eraseFromParent();
2707 return std::pair(Split[0], Split[1]);
2708}
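// Example: a V_MOV_B64_DPP_PSEUDO whose DPP control cannot be encoded on a
// 64-bit mov is split into two V_MOV_B32_dpp instructions operating on the
// sub0/sub1 halves; for a virtual destination a REG_SEQUENCE then reassembles
// the 64-bit value from the two temporaries.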
2709
2710std::optional<DestSourcePair>
2711SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2712 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2713 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2714
2715 return std::nullopt;
2716}
2717
2718bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2719 MachineOperand &Src0,
2720 unsigned Src0OpName,
2721 MachineOperand &Src1,
2722 unsigned Src1OpName) const {
2723 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2724 if (!Src0Mods)
2725 return false;
2726
2727 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2728 assert(Src1Mods &&
2729 "All commutable instructions have both src0 and src1 modifiers");
2730
2731 int Src0ModsVal = Src0Mods->getImm();
2732 int Src1ModsVal = Src1Mods->getImm();
2733
2734 Src1Mods->setImm(Src0ModsVal);
2735 Src0Mods->setImm(Src1ModsVal);
2736 return true;
2737}
2738
2739static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2740 MachineOperand &RegOp,
2741 MachineOperand &NonRegOp) {
2742 Register Reg = RegOp.getReg();
2743 unsigned SubReg = RegOp.getSubReg();
2744 bool IsKill = RegOp.isKill();
2745 bool IsDead = RegOp.isDead();
2746 bool IsUndef = RegOp.isUndef();
2747 bool IsDebug = RegOp.isDebug();
2748
2749 if (NonRegOp.isImm())
2750 RegOp.ChangeToImmediate(NonRegOp.getImm());
2751 else if (NonRegOp.isFI())
2752 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2753 else if (NonRegOp.isGlobal()) {
2754 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2755 NonRegOp.getTargetFlags());
2756 } else
2757 return nullptr;
2758
2759 // Make sure we don't reinterpret a subreg index in the target flags.
2760 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2761
2762 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2763 NonRegOp.setSubReg(SubReg);
2764
2765 return &MI;
2766}
2767
2768MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2769 unsigned Src0Idx,
2770 unsigned Src1Idx) const {
2771 assert(!NewMI && "this should never be used");
2772
2773 unsigned Opc = MI.getOpcode();
2774 int CommutedOpcode = commuteOpcode(Opc);
2775 if (CommutedOpcode == -1)
2776 return nullptr;
2777
2778 if (Src0Idx > Src1Idx)
2779 std::swap(Src0Idx, Src1Idx);
2780
2781 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2782 static_cast<int>(Src0Idx) &&
2783 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2784 static_cast<int>(Src1Idx) &&
2785 "inconsistency with findCommutedOpIndices");
2786
2787 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2788 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2789
2790 MachineInstr *CommutedMI = nullptr;
2791 if (Src0.isReg() && Src1.isReg()) {
2792 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2793 // Be sure to copy the source modifiers to the right place.
2794 CommutedMI
2795 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2796 }
2797
2798 } else if (Src0.isReg() && !Src1.isReg()) {
2799 // src0 should always be able to support any operand type, so no need to
2800 // check operand legality.
2801 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2802 } else if (!Src0.isReg() && Src1.isReg()) {
2803 if (isOperandLegal(MI, Src1Idx, &Src0))
2804 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2805 } else {
2806 // FIXME: Found two non registers to commute. This does happen.
2807 return nullptr;
2808 }
2809
2810 if (CommutedMI) {
2811 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2812 Src1, AMDGPU::OpName::src1_modifiers);
2813
2814 CommutedMI->setDesc(get(CommutedOpcode));
2815 }
2816
2817 return CommutedMI;
2818}
2819
2820// This needs to be implemented because the source modifiers may be inserted
2821// between the true commutable operands, and the base
2822// TargetInstrInfo::commuteInstruction uses it.
2823bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2824 unsigned &SrcOpIdx0,
2825 unsigned &SrcOpIdx1) const {
2826 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2827}
2828
2829bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2830 unsigned &SrcOpIdx0,
2831 unsigned &SrcOpIdx1) const {
2832 if (!Desc.isCommutable())
2833 return false;
2834
2835 unsigned Opc = Desc.getOpcode();
2836 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2837 if (Src0Idx == -1)
2838 return false;
2839
2840 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2841 if (Src1Idx == -1)
2842 return false;
2843
2844 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2845}
2846
2847bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2848 int64_t BrOffset) const {
2849 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2850 // block is unanalyzable.
2851 assert(BranchOp != AMDGPU::S_SETPC_B64);
2852
2853 // Convert to dwords.
2854 BrOffset /= 4;
2855
2856 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2857 // from the next instruction.
2858 BrOffset -= 1;
2859
2860 return isIntN(BranchOffsetBits, BrOffset);
2861}
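// Worked example with the default 16 branch-offset bits: a 131072-byte forward
// branch becomes 32768 dwords, or 32767 after subtracting the implicit advance
// past the branch, which still fits a signed 16-bit immediate; anything
// farther must be relaxed into an indirect branch.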
2862
2863MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
2864 const MachineInstr &MI) const {
2865 return MI.getOperand(0).getMBB();
2866}
2867
2868bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2869 for (const MachineInstr &MI : MBB->terminators()) {
2870 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2871 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2872 MI.getOpcode() == AMDGPU::SI_LOOP)
2873 return true;
2874 }
2875 return false;
2876}
2877
2878void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2879 MachineBasicBlock &DestBB,
2880 MachineBasicBlock &RestoreBB,
2881 const DebugLoc &DL, int64_t BrOffset,
2882 RegScavenger *RS) const {
2883 assert(RS && "RegScavenger required for long branching");
2884 assert(MBB.empty() &&
2885 "new block should be inserted for expanding unconditional branch");
2886 assert(MBB.pred_size() == 1);
2887 assert(RestoreBB.empty() &&
2888 "restore block should be inserted for restoring clobbered registers");
2889
2890 MachineFunction *MF = MBB.getParent();
2891 MachineRegisterInfo &MRI = MF->getRegInfo();
2892 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2893
2894 // FIXME: Virtual register workaround for RegScavenger not working with empty
2895 // blocks.
2896 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2897
2898 auto I = MBB.end();
2899
2900 // We need to compute the offset relative to the instruction immediately after
2901 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2902 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2903
2904 auto &MCCtx = MF->getContext();
2905 MCSymbol *PostGetPCLabel =
2906 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2907 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2908
2909 MCSymbol *OffsetLo =
2910 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2911 MCSymbol *OffsetHi =
2912 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2913 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2914 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2915 .addReg(PCReg, 0, AMDGPU::sub0)
2916 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2917 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2918 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2919 .addReg(PCReg, 0, AMDGPU::sub1)
2920 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2921
2922 // Insert the indirect branch after the other terminator.
2923 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2924 .addReg(PCReg);
2925
2926 // If a spill is needed for the pc register pair, we need to insert a spill
2927 // restore block right before the destination block, and insert a short branch
2928 // into the old destination block's fallthrough predecessor.
2929 // e.g.:
2930 //
2931 // s_cbranch_scc0 skip_long_branch:
2932 //
2933 // long_branch_bb:
2934 // spill s[8:9]
2935 // s_getpc_b64 s[8:9]
2936 // s_add_u32 s8, s8, restore_bb
2937 // s_addc_u32 s9, s9, 0
2938 // s_setpc_b64 s[8:9]
2939 //
2940 // skip_long_branch:
2941 // foo;
2942 //
2943 // .....
2944 //
2945 // dest_bb_fallthrough_predecessor:
2946 // bar;
2947 // s_branch dest_bb
2948 //
2949 // restore_bb:
2950 // restore s[8:9]
2951 // fallthrough dest_bb
2952 //
2953 // dest_bb:
2954 // buzz;
2955
2956 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2957 Register Scav;
2958
2959 // If we've previously reserved a register for long branches
2960 // avoid running the scavenger and just use those registers
2961 if (LongBranchReservedReg) {
2962 RS->enterBasicBlock(MBB);
2963 Scav = LongBranchReservedReg;
2964 } else {
2965 RS->enterBasicBlockEnd(MBB);
2966 Scav = RS->scavengeRegisterBackwards(
2967 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2968 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2969 }
2970 if (Scav) {
2971 RS->setRegUsed(Scav);
2972 MRI.replaceRegWith(PCReg, Scav);
2973 MRI.clearVirtRegs();
2974 } else {
2975 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
2976 // SGPR spill.
2977 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2978 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2979 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2980 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2981 MRI.clearVirtRegs();
2982 }
2983
2984 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2985 // Now, the distance could be defined.
2986 auto *Offset = MCBinaryExpr::createSub(
2987 MCSymbolRefExpr::create(DestLabel, MCCtx),
2988 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2989 // Add offset assignments.
2990 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2991 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2992 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2993 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2994}
2995
2996unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2997 switch (Cond) {
2998 case SIInstrInfo::SCC_TRUE:
2999 return AMDGPU::S_CBRANCH_SCC1;
3000 case SIInstrInfo::SCC_FALSE:
3001 return AMDGPU::S_CBRANCH_SCC0;
3002 case SIInstrInfo::VCCNZ:
3003 return AMDGPU::S_CBRANCH_VCCNZ;
3004 case SIInstrInfo::VCCZ:
3005 return AMDGPU::S_CBRANCH_VCCZ;
3006 case SIInstrInfo::EXECNZ:
3007 return AMDGPU::S_CBRANCH_EXECNZ;
3008 case SIInstrInfo::EXECZ:
3009 return AMDGPU::S_CBRANCH_EXECZ;
3010 default:
3011 llvm_unreachable("invalid branch predicate");
3012 }
3013}
3014
3015SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3016 switch (Opcode) {
3017 case AMDGPU::S_CBRANCH_SCC0:
3018 return SCC_FALSE;
3019 case AMDGPU::S_CBRANCH_SCC1:
3020 return SCC_TRUE;
3021 case AMDGPU::S_CBRANCH_VCCNZ:
3022 return VCCNZ;
3023 case AMDGPU::S_CBRANCH_VCCZ:
3024 return VCCZ;
3025 case AMDGPU::S_CBRANCH_EXECNZ:
3026 return EXECNZ;
3027 case AMDGPU::S_CBRANCH_EXECZ:
3028 return EXECZ;
3029 default:
3030 return INVALID_BR;
3031 }
3032}
3033
3034bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3035 MachineBasicBlock::iterator I,
3036 MachineBasicBlock *&TBB,
3037 MachineBasicBlock *&FBB,
3038 SmallVectorImpl<MachineOperand> &Cond,
3039 bool AllowModify) const {
3040 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3041 // Unconditional Branch
3042 TBB = I->getOperand(0).getMBB();
3043 return false;
3044 }
3045
3046 MachineBasicBlock *CondBB = nullptr;
3047
3048 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
3049 CondBB = I->getOperand(1).getMBB();
3050 Cond.push_back(I->getOperand(0));
3051 } else {
3052 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3053 if (Pred == INVALID_BR)
3054 return true;
3055
3056 CondBB = I->getOperand(0).getMBB();
3057 Cond.push_back(MachineOperand::CreateImm(Pred));
3058 Cond.push_back(I->getOperand(1)); // Save the branch register.
3059 }
3060 ++I;
3061
3062 if (I == MBB.end()) {
3063 // Conditional branch followed by fall-through.
3064 TBB = CondBB;
3065 return false;
3066 }
3067
3068 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3069 TBB = CondBB;
3070 FBB = I->getOperand(0).getMBB();
3071 return false;
3072 }
3073
3074 return true;
3075}
3076
3077bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3078 MachineBasicBlock *&FBB,
3079 SmallVectorImpl<MachineOperand> &Cond,
3080 bool AllowModify) const {
3081 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3082 auto E = MBB.end();
3083 if (I == E)
3084 return false;
3085
3086 // Skip over the instructions that are artificially terminators for special
3087 // exec management.
3088 while (I != E && !I->isBranch() && !I->isReturn()) {
3089 switch (I->getOpcode()) {
3090 case AMDGPU::S_MOV_B64_term:
3091 case AMDGPU::S_XOR_B64_term:
3092 case AMDGPU::S_OR_B64_term:
3093 case AMDGPU::S_ANDN2_B64_term:
3094 case AMDGPU::S_AND_B64_term:
3095 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3096 case AMDGPU::S_MOV_B32_term:
3097 case AMDGPU::S_XOR_B32_term:
3098 case AMDGPU::S_OR_B32_term:
3099 case AMDGPU::S_ANDN2_B32_term:
3100 case AMDGPU::S_AND_B32_term:
3101 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3102 break;
3103 case AMDGPU::SI_IF:
3104 case AMDGPU::SI_ELSE:
3105 case AMDGPU::SI_KILL_I1_TERMINATOR:
3106 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3107 // FIXME: It's messy that these need to be considered here at all.
3108 return true;
3109 default:
3110 llvm_unreachable("unexpected non-branch terminator inst");
3111 }
3112
3113 ++I;
3114 }
3115
3116 if (I == E)
3117 return false;
3118
3119 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3120}
3121
3122unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3123 int *BytesRemoved) const {
3124 unsigned Count = 0;
3125 unsigned RemovedSize = 0;
3126 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3127 // Skip over artificial terminators when removing instructions.
3128 if (MI.isBranch() || MI.isReturn()) {
3129 RemovedSize += getInstSizeInBytes(MI);
3130 MI.eraseFromParent();
3131 ++Count;
3132 }
3133 }
3134
3135 if (BytesRemoved)
3136 *BytesRemoved = RemovedSize;
3137
3138 return Count;
3139}
3140
3141// Copy the flags onto the implicit condition register operand.
3142static void preserveCondRegFlags(MachineOperand &CondReg,
3143 const MachineOperand &OrigCond) {
3144 CondReg.setIsUndef(OrigCond.isUndef());
3145 CondReg.setIsKill(OrigCond.isKill());
3146}
3147
3148unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3149 MachineBasicBlock *TBB,
3150 MachineBasicBlock *FBB,
3151 ArrayRef<MachineOperand> Cond,
3152 const DebugLoc &DL,
3153 int *BytesAdded) const {
3154 if (!FBB && Cond.empty()) {
3155 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3156 .addMBB(TBB);
3157 if (BytesAdded)
3158 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3159 return 1;
3160 }
3161
3162 if (Cond.size() == 1 && Cond[0].isReg()) {
3163 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
3164 .add(Cond[0])
3165 .addMBB(TBB);
3166 return 1;
3167 }
3168
3169 assert(TBB && Cond[0].isImm());
3170
3171 unsigned Opcode
3172 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3173
3174 if (!FBB) {
3175 MachineInstr *CondBr =
3176 BuildMI(&MBB, DL, get(Opcode))
3177 .addMBB(TBB);
3178
3179 // Copy the flags onto the implicit condition register operand.
3180 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3181 fixImplicitOperands(*CondBr);
3182
3183 if (BytesAdded)
3184 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3185 return 1;
3186 }
3187
3188 assert(TBB && FBB);
3189
3190 MachineInstr *CondBr =
3191 BuildMI(&MBB, DL, get(Opcode))
3192 .addMBB(TBB);
3193 fixImplicitOperands(*CondBr);
3194 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3195 .addMBB(FBB);
3196
3197 MachineOperand &CondReg = CondBr->getOperand(1);
3198 CondReg.setIsUndef(Cond[1].isUndef());
3199 CondReg.setIsKill(Cond[1].isKill());
3200
3201 if (BytesAdded)
3202 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3203
3204 return 2;
3205}
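// The BytesAdded accounting assumes 4 bytes per emitted branch, or 8 when the
// subtarget has the offset-0x3f hardware bug and a workaround instruction may
// have to be inserted after the branch.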
3206
3207bool SIInstrInfo::reverseBranchCondition(
3208 SmallVectorImpl<MachineOperand> &Cond) const {
3209 if (Cond.size() != 2) {
3210 return true;
3211 }
3212
3213 if (Cond[0].isImm()) {
3214 Cond[0].setImm(-Cond[0].getImm());
3215 return false;
3216 }
3217
3218 return true;
3219}
3220
3221bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3222 ArrayRef<MachineOperand> Cond,
3223 Register DstReg, Register TrueReg,
3224 Register FalseReg, int &CondCycles,
3225 int &TrueCycles, int &FalseCycles) const {
3226 switch (Cond[0].getImm()) {
3227 case VCCNZ:
3228 case VCCZ: {
3229 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3230 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3231 if (MRI.getRegClass(FalseReg) != RC)
3232 return false;
3233
3234 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3235 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3236
3237 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3238 return RI.hasVGPRs(RC) && NumInsts <= 6;
3239 }
3240 case SCC_TRUE:
3241 case SCC_FALSE: {
3242 // FIXME: We could insert for VGPRs if we could replace the original compare
3243 // with a vector one.
3244 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3245 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3246 if (MRI.getRegClass(FalseReg) != RC)
3247 return false;
3248
3249 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3250
3251 // Multiples of 8 can do s_cselect_b64
3252 if (NumInsts % 2 == 0)
3253 NumInsts /= 2;
3254
3255 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3256 return RI.isSGPRClass(RC);
3257 }
3258 default:
3259 return false;
3260 }
3261}
3262
3263void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3264 MachineBasicBlock::iterator I, const DebugLoc &DL,
3265 Register DstReg, ArrayRef<MachineOperand> Cond,
3266 Register TrueReg, Register FalseReg) const {
3267 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3268 if (Pred == VCCZ || Pred == SCC_FALSE) {
3269 Pred = static_cast<BranchPredicate>(-Pred);
3270 std::swap(TrueReg, FalseReg);
3271 }
3272
3273 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3274 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3275 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3276
3277 if (DstSize == 32) {
3278 MachineInstr *Select;
3279 if (Pred == SCC_TRUE) {
3280 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3281 .addReg(TrueReg)
3282 .addReg(FalseReg);
3283 } else {
3284 // Instruction's operands are backwards from what is expected.
3285 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3286 .addReg(FalseReg)
3287 .addReg(TrueReg);
3288 }
3289
3290 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3291 return;
3292 }
3293
3294 if (DstSize == 64 && Pred == SCC_TRUE) {
3295 MachineInstr *Select =
3296 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3297 .addReg(TrueReg)
3298 .addReg(FalseReg);
3299
3300 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3301 return;
3302 }
3303
3304 static const int16_t Sub0_15[] = {
3305 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3306 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3307 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3308 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3309 };
3310
3311 static const int16_t Sub0_15_64[] = {
3312 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3313 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3314 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3315 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3316 };
3317
3318 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3319 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3320 const int16_t *SubIndices = Sub0_15;
3321 int NElts = DstSize / 32;
3322
3323 // 64-bit select is only available for SALU.
3324 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3325 if (Pred == SCC_TRUE) {
3326 if (NElts % 2) {
3327 SelOp = AMDGPU::S_CSELECT_B32;
3328 EltRC = &AMDGPU::SGPR_32RegClass;
3329 } else {
3330 SelOp = AMDGPU::S_CSELECT_B64;
3331 EltRC = &AMDGPU::SGPR_64RegClass;
3332 SubIndices = Sub0_15_64;
3333 NElts /= 2;
3334 }
3335 }
3336
3337 MachineInstrBuilder MIB = BuildMI(
3338 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3339
3340 I = MIB->getIterator();
3341
3343 for (int Idx = 0; Idx != NElts; ++Idx) {
3344 Register DstElt = MRI.createVirtualRegister(EltRC);
3345 Regs.push_back(DstElt);
3346
3347 unsigned SubIdx = SubIndices[Idx];
3348
3350 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3351 Select =
3352 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3353 .addReg(FalseReg, 0, SubIdx)
3354 .addReg(TrueReg, 0, SubIdx);
3355 } else {
3356 Select =
3357 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3358 .addReg(TrueReg, 0, SubIdx)
3359 .addReg(FalseReg, 0, SubIdx);
3360 }
3361
3362 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3363 fixImplicitOperands(*Select);
3364
3365 MIB.addReg(DstElt)
3366 .addImm(SubIdx);
3367 }
3368}
3369
3370bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3371 switch (MI.getOpcode()) {
3372 case AMDGPU::V_MOV_B32_e32:
3373 case AMDGPU::V_MOV_B32_e64:
3374 case AMDGPU::V_MOV_B64_PSEUDO:
3375 case AMDGPU::V_MOV_B64_e32:
3376 case AMDGPU::V_MOV_B64_e64:
3377 case AMDGPU::S_MOV_B32:
3378 case AMDGPU::S_MOV_B64:
3379 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3380 case AMDGPU::COPY:
3381 case AMDGPU::WWM_COPY:
3382 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3383 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3384 case AMDGPU::V_ACCVGPR_MOV_B32:
3385 return true;
3386 default:
3387 return false;
3388 }
3389}
3390
3391static constexpr unsigned ModifierOpNames[] = {
3392 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3393 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3394 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3395
3396void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3397 unsigned Opc = MI.getOpcode();
3398 for (unsigned Name : reverse(ModifierOpNames)) {
3399 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3400 if (Idx >= 0)
3401 MI.removeOperand(Idx);
3402 }
3403}
3404
3405bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3406 Register Reg, MachineRegisterInfo *MRI) const {
3407 if (!MRI->hasOneNonDBGUse(Reg))
3408 return false;
3409
3410 switch (DefMI.getOpcode()) {
3411 default:
3412 return false;
3413 case AMDGPU::V_MOV_B64_e32:
3414 case AMDGPU::S_MOV_B64:
3415 case AMDGPU::V_MOV_B64_PSEUDO:
3416 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3417 case AMDGPU::V_MOV_B32_e32:
3418 case AMDGPU::S_MOV_B32:
3419 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3420 break;
3421 }
3422
3423 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3424 assert(ImmOp);
3425 // FIXME: We could handle FrameIndex values here.
3426 if (!ImmOp->isImm())
3427 return false;
3428
3429 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3430 int64_t Imm = ImmOp->getImm();
3431 switch (UseOp.getSubReg()) {
3432 default:
3433 return Imm;
3434 case AMDGPU::sub0:
3435 return Lo_32(Imm);
3436 case AMDGPU::sub1:
3437 return Hi_32(Imm);
3438 case AMDGPU::lo16:
3439 return APInt(16, Imm).getSExtValue();
3440 case AMDGPU::hi16:
3441 return APInt(32, Imm).ashr(16).getSExtValue();
3442 case AMDGPU::sub1_lo16:
3443 return APInt(16, Hi_32(Imm)).getSExtValue();
3444 case AMDGPU::sub1_hi16:
3445 return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
3446 }
3447 };
3448
3449 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3450
3451 unsigned Opc = UseMI.getOpcode();
3452 if (Opc == AMDGPU::COPY) {
3453 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3454
3455 Register DstReg = UseMI.getOperand(0).getReg();
3456 unsigned OpSize = getOpSize(UseMI, 0);
3457 bool Is16Bit = OpSize == 2;
3458 bool Is64Bit = OpSize == 8;
3459 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3460 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3461 : AMDGPU::V_MOV_B32_e32
3462 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3463 : AMDGPU::S_MOV_B32;
3464 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
3465
3466 if (RI.isAGPR(*MRI, DstReg)) {
3467 if (Is64Bit || !isInlineConstant(Imm))
3468 return false;
3469 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3470 }
3471
3472 if (Is16Bit) {
3473 if (isVGPRCopy)
3474 return false; // Do not clobber vgpr_hi16
3475
3476 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3477 return false;
3478
3479 UseMI.getOperand(0).setSubReg(0);
3480 if (DstReg.isPhysical()) {
3481 DstReg = RI.get32BitRegister(DstReg);
3482 UseMI.getOperand(0).setReg(DstReg);
3483 }
3484 assert(UseMI.getOperand(1).getReg().isVirtual());
3485 }
3486
3487 const MCInstrDesc &NewMCID = get(NewOpc);
3488 if (DstReg.isPhysical() &&
3489 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3490 return false;
3491
3492 UseMI.setDesc(NewMCID);
3493 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3494 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3495 return true;
3496 }
3497
3498 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3499 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3500 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3501 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3502 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3503 // Don't fold if we are using source or output modifiers. The new VOP2
3504 // instructions don't have them.
3505 if (hasAnyModifiersSet(UseMI))
3506 return false;
3507
3508 // If this is a free constant, there's no reason to do this.
3509 // TODO: We could fold this here instead of letting SIFoldOperands do it
3510 // later.
3511 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3512
3513 // Any src operand can be used for the legality check.
3514 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3515 return false;
3516
3517 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3518 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3519 bool IsFMA =
3520 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3521 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3522 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3523 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3524 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3525
3526 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3527 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3528 (Src1->isReg() && Src1->getReg() == Reg)) {
3529 MachineOperand *RegSrc =
3530 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3531 if (!RegSrc->isReg())
3532 return false;
3533 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3534 ST.getConstantBusLimit(Opc) < 2)
3535 return false;
3536
3537 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3538 return false;
3539
3540 // If src2 is also a literal constant then we have to choose which one to
3541 // fold. In general it is better to choose madak so that the other literal
3542 // can be materialized in an sgpr instead of a vgpr:
3543 // s_mov_b32 s0, literal
3544 // v_madak_f32 v0, s0, v0, literal
3545 // Instead of:
3546 // v_mov_b32 v1, literal
3547 // v_madmk_f32 v0, v0, literal, v1
3548 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3549 if (Def && Def->isMoveImmediate() &&
3550 !isInlineConstant(Def->getOperand(1)))
3551 return false;
3552
3553 unsigned NewOpc =
3554 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3555 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3556 : AMDGPU::V_FMAMK_F16)
3557 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3558 if (pseudoToMCOpcode(NewOpc) == -1)
3559 return false;
3560
3561 // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3562 // would also require restricting their register classes. For now
3563 // just bail out.
3564 if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3565 return false;
3566
3567 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3568
3569 // FIXME: This would be a lot easier if we could return a new instruction
3570 // instead of having to modify in place.
3571
3572 Register SrcReg = RegSrc->getReg();
3573 unsigned SrcSubReg = RegSrc->getSubReg();
3574 Src0->setReg(SrcReg);
3575 Src0->setSubReg(SrcSubReg);
3576 Src0->setIsKill(RegSrc->isKill());
3577
3578 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3579 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3580 Opc == AMDGPU::V_FMAC_F16_e64)
3581 UseMI.untieRegOperand(
3582 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3583
3584 Src1->ChangeToImmediate(Imm);
3585
3586       removeModOperands(UseMI);
3587       UseMI.setDesc(get(NewOpc));
3588
3589 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3590 if (DeleteDef)
3591 DefMI.eraseFromParent();
3592
3593 return true;
3594 }
3595
3596 // Added part is the constant: Use v_madak_{f16, f32}.
3597 if (Src2->isReg() && Src2->getReg() == Reg) {
3598 if (ST.getConstantBusLimit(Opc) < 2) {
3599 // Not allowed to use constant bus for another operand.
3600 // We can however allow an inline immediate as src0.
3601 bool Src0Inlined = false;
3602 if (Src0->isReg()) {
3603           // Try to inline the constant if possible.
3604           // If the def is a move-immediate and this is its only use,
3605           // folding it here saves a VGPR.
3606 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3607 if (Def && Def->isMoveImmediate() &&
3608 isInlineConstant(Def->getOperand(1)) &&
3609 MRI->hasOneUse(Src0->getReg())) {
3610 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3611 Src0Inlined = true;
3612 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3613 RI.isSGPRReg(*MRI, Src0->getReg())) {
3614 return false;
3615 }
3616 // VGPR is okay as Src0 - fallthrough
3617 }
3618
3619 if (Src1->isReg() && !Src0Inlined) {
3620 // We have one slot for inlinable constant so far - try to fill it
3621 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3622 if (Def && Def->isMoveImmediate() &&
3623 isInlineConstant(Def->getOperand(1)) &&
3624 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3625 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3626 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3627 return false;
3628 // VGPR is okay as Src1 - fallthrough
3629 }
3630 }
3631
3632 unsigned NewOpc =
3633 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3634 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3635 : AMDGPU::V_FMAAK_F16)
3636 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3637 if (pseudoToMCOpcode(NewOpc) == -1)
3638 return false;
3639
3640 // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3641 // would also require restricting their register classes. For now
3642 // just bail out.
3643 if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3644 return false;
3645
3646 // FIXME: This would be a lot easier if we could return a new instruction
3647 // instead of having to modify in place.
3648
3649 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3650 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3651 Opc == AMDGPU::V_FMAC_F16_e64)
3652 UseMI.untieRegOperand(
3653 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3654
3655       // ChangeToImmediate adds Src2 back to the instruction.
3656 Src2->ChangeToImmediate(getImmFor(*Src2));
3657
3658       // The modifier operands come before src2, so remove them first.
3659       removeModOperands(UseMI);
3660       UseMI.setDesc(get(NewOpc));
3661       // UseMI may have been commuted, in which case an SGPR may now be
3662       // src1. Two inline constants plus an SGPR would be illegal, so
3663       // re-legalize the operands.
3664       legalizeOperands(UseMI);
3665
3666 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3667 if (DeleteDef)
3668 DefMI.eraseFromParent();
3669
3670 return true;
3671 }
3672 }
3673
3674 return false;
3675}
3676
3677static bool
3678 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3679                            ArrayRef<const MachineOperand *> BaseOps2) {
3680   if (BaseOps1.size() != BaseOps2.size())
3681 return false;
3682 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3683 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3684 return false;
3685 }
3686 return true;
3687}
3688
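// A quick worked example of the check below: accesses at [0, 4) and [4, 8) do
// not overlap, since LowOffset (0) + LowWidth (4) <= HighOffset (4); accesses
// at [0, 4) and [2, 6) do overlap, since 0 + 4 > 2.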
3689static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3690 LocationSize WidthB, int OffsetB) {
3691 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3692 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3693 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3694 return LowWidth.hasValue() &&
3695 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3696}
3697
3698bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3699 const MachineInstr &MIb) const {
3700 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3701 int64_t Offset0, Offset1;
3702 LocationSize Dummy0 = 0, Dummy1 = 0;
3703 bool Offset0IsScalable, Offset1IsScalable;
3704 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3705 Dummy0, &RI) ||
3706 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3707 Dummy1, &RI))
3708 return false;
3709
3710 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3711 return false;
3712
3713 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3714 // FIXME: Handle ds_read2 / ds_write2.
3715 return false;
3716 }
3717 LocationSize Width0 = MIa.memoperands().front()->getSize();
3718 LocationSize Width1 = MIb.memoperands().front()->getSize();
3719 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3720}
3721
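// Conservatively prove two memory instructions disjoint using only their
// instruction class (DS vs. MUBUF/MTBUF vs. SMRD vs. FLAT) and, within the
// same class, identical base operands with non-overlapping offset ranges.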
3722 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3723                                                   const MachineInstr &MIb) const {
3724 assert(MIa.mayLoadOrStore() &&
3725 "MIa must load from or modify a memory location");
3726 assert(MIb.mayLoadOrStore() &&
3727 "MIb must load from or modify a memory location");
3728
3729   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3730     return false;
3731
3732 // XXX - Can we relax this between address spaces?
3733 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3734 return false;
3735
3736 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3737 return false;
3738
3739 // TODO: Should we check the address space from the MachineMemOperand? That
3740 // would allow us to distinguish objects we know don't alias based on the
3741 // underlying address space, even if it was lowered to a different one,
3742 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3743 // buffer.
3744 if (isDS(MIa)) {
3745 if (isDS(MIb))
3746 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3747
3748 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3749 }
3750
3751 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3752 if (isMUBUF(MIb) || isMTBUF(MIb))
3753 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3754
3755 if (isFLAT(MIb))
3756 return isFLATScratch(MIb);
3757
3758 return !isSMRD(MIb);
3759 }
3760
3761 if (isSMRD(MIa)) {
3762 if (isSMRD(MIb))
3763 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3764
3765 if (isFLAT(MIb))
3766 return isFLATScratch(MIb);
3767
3768 return !isMUBUF(MIb) && !isMTBUF(MIb);
3769 }
3770
3771 if (isFLAT(MIa)) {
3772 if (isFLAT(MIb)) {
3773 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3774 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3775 return true;
3776
3777 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3778 }
3779
3780 return false;
3781 }
3782
3783 return false;
3784}
3785
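// Return true if Reg is a virtual register whose unique definition is a
// foldable move-immediate, reporting the value in Imm and, optionally, the
// defining instruction so the caller can erase it once the use is folded.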
3786 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3787                            int64_t &Imm, MachineInstr **DefMI = nullptr) {
3788 if (Reg.isPhysical())
3789 return false;
3790 auto *Def = MRI.getUniqueVRegDef(Reg);
3791 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3792 Imm = Def->getOperand(1).getImm();
3793 if (DefMI)
3794 *DefMI = Def;
3795 return true;
3796 }
3797 return false;
3798}
3799
3800static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3801 MachineInstr **DefMI = nullptr) {
3802 if (!MO->isReg())
3803 return false;
3804 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3805 const MachineRegisterInfo &MRI = MF->getRegInfo();
3806 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3807}
3808
3809 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3810                                 MachineInstr &NewMI) {
3811 if (LV) {
3812 unsigned NumOps = MI.getNumOperands();
3813 for (unsigned I = 1; I < NumOps; ++I) {
3814 MachineOperand &Op = MI.getOperand(I);
3815 if (Op.isReg() && Op.isKill())
3816 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3817 }
3818 }
3819}
3820
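// Rough outline of the conversion below: MFMA and WMMA opcodes are remapped
// directly to their three-address variants; MAC/FMAC first try the
// madak/madmk (fmaak/fmamk) forms when one source is a foldable immediate
// move, and otherwise fall back to the full VOP3 mad/fma encoding.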
3821 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3822                                                  LiveVariables *LV,
3823 LiveIntervals *LIS) const {
3824 MachineBasicBlock &MBB = *MI.getParent();
3825 unsigned Opc = MI.getOpcode();
3826
3827 // Handle MFMA.
3828 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3829 if (NewMFMAOpc != -1) {
3830     MachineInstrBuilder MIB =
3831         BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3832 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3833 MIB.add(MI.getOperand(I));
3834 updateLiveVariables(LV, MI, *MIB);
3835 if (LIS) {
3836 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3837 // SlotIndex of defs needs to be updated when converting to early-clobber
3838 MachineOperand &Def = MIB->getOperand(0);
3839 if (Def.isEarlyClobber() && Def.isReg() &&
3840 LIS->hasInterval(Def.getReg())) {
3841 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3842 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3843 auto &LI = LIS->getInterval(Def.getReg());
3844 auto UpdateDefIndex = [&](LiveRange &LR) {
3845 auto S = LR.find(OldIndex);
3846 if (S != LR.end() && S->start == OldIndex) {
3847 assert(S->valno && S->valno->def == OldIndex);
3848 S->start = NewIndex;
3849 S->valno->def = NewIndex;
3850 }
3851 };
3852 UpdateDefIndex(LI);
3853 for (auto &SR : LI.subranges())
3854 UpdateDefIndex(SR);
3855 }
3856 }
3857 return MIB;
3858 }
3859
3860 if (SIInstrInfo::isWMMA(MI)) {
3861 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3862 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3863 .setMIFlags(MI.getFlags());
3864 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3865 MIB->addOperand(MI.getOperand(I));
3866
3867 updateLiveVariables(LV, MI, *MIB);
3868 if (LIS)
3869 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3870
3871 return MIB;
3872 }
3873
3874 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3875 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3876 "pre-RA");
3877
3878 // Handle MAC/FMAC.
3879 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3880 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3881 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3882 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3883 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3884 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3885 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3886 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3887 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3888 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3889 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3890 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3891 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3892 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3893 bool Src0Literal = false;
3894
3895 switch (Opc) {
3896 default:
3897 return nullptr;
3898 case AMDGPU::V_MAC_F16_e64:
3899 case AMDGPU::V_FMAC_F16_e64:
3900 case AMDGPU::V_FMAC_F16_t16_e64:
3901 case AMDGPU::V_MAC_F32_e64:
3902 case AMDGPU::V_MAC_LEGACY_F32_e64:
3903 case AMDGPU::V_FMAC_F32_e64:
3904 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3905 case AMDGPU::V_FMAC_F64_e64:
3906 break;
3907 case AMDGPU::V_MAC_F16_e32:
3908 case AMDGPU::V_FMAC_F16_e32:
3909 case AMDGPU::V_MAC_F32_e32:
3910 case AMDGPU::V_MAC_LEGACY_F32_e32:
3911 case AMDGPU::V_FMAC_F32_e32:
3912 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3913 case AMDGPU::V_FMAC_F64_e32: {
3914 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3915 AMDGPU::OpName::src0);
3916 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3917 if (!Src0->isReg() && !Src0->isImm())
3918 return nullptr;
3919
3920 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3921 Src0Literal = true;
3922
3923 break;
3924 }
3925 }
3926
3927   MachineInstrBuilder MIB;
3928   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3929 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3930 const MachineOperand *Src0Mods =
3931 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3932 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3933 const MachineOperand *Src1Mods =
3934 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3935 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3936 const MachineOperand *Src2Mods =
3937 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3938 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3939 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3940 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3941
3942 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3943 !IsLegacy &&
3944 // If we have an SGPR input, we will violate the constant bus restriction.
3945 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3946 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3947     MachineInstr *DefMI;
3948     const auto killDef = [&]() -> void {
3949       MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3950       // The only user is the instruction which will be killed.
3951 Register DefReg = DefMI->getOperand(0).getReg();
3952 if (!MRI.hasOneNonDBGUse(DefReg))
3953 return;
3954       // We cannot simply remove DefMI here; the calling pass would crash.
3955 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3956 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3957         DefMI->removeOperand(I);
3958       if (LV)
3959 LV->getVarInfo(DefReg).AliveBlocks.clear();
3960 };
3961
3962 int64_t Imm;
3963 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3964 unsigned NewOpc =
3965 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3966 : AMDGPU::V_FMAAK_F16)
3967 : AMDGPU::V_FMAAK_F32)
3968 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3969 if (pseudoToMCOpcode(NewOpc) != -1) {
3970 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3971 .add(*Dst)
3972 .add(*Src0)
3973 .add(*Src1)
3974 .addImm(Imm)
3975 .setMIFlags(MI.getFlags());
3976 updateLiveVariables(LV, MI, *MIB);
3977 if (LIS)
3978 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3979 killDef();
3980 return MIB;
3981 }
3982 }
3983 unsigned NewOpc =
3984 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3985 : AMDGPU::V_FMAMK_F16)
3986 : AMDGPU::V_FMAMK_F32)
3987 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3988 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3989 if (pseudoToMCOpcode(NewOpc) != -1) {
3990 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3991 .add(*Dst)
3992 .add(*Src0)
3993 .addImm(Imm)
3994 .add(*Src2)
3995 .setMIFlags(MI.getFlags());
3996 updateLiveVariables(LV, MI, *MIB);
3997 if (LIS)
3998 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3999 killDef();
4000 return MIB;
4001 }
4002 }
4003 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4004 if (Src0Literal) {
4005 Imm = Src0->getImm();
4006 DefMI = nullptr;
4007 }
4008 if (pseudoToMCOpcode(NewOpc) != -1 &&
4009           isOperandLegal(
4010               MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4011 Src1)) {
4012 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4013 .add(*Dst)
4014 .add(*Src1)
4015 .addImm(Imm)
4016 .add(*Src2)
4017 .setMIFlags(MI.getFlags());
4018 updateLiveVariables(LV, MI, *MIB);
4019 if (LIS)
4020 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4021 if (DefMI)
4022 killDef();
4023 return MIB;
4024 }
4025 }
4026 }
4027
4028 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4029 // if VOP3 does not allow a literal operand.
4030 if (Src0Literal && !ST.hasVOP3Literal())
4031 return nullptr;
4032
4033 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4034 : IsF64 ? AMDGPU::V_FMA_F64_e64
4035 : IsLegacy
4036 ? AMDGPU::V_FMA_LEGACY_F32_e64
4037 : AMDGPU::V_FMA_F32_e64
4038 : IsF16 ? AMDGPU::V_MAD_F16_e64
4039 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4040 : AMDGPU::V_MAD_F32_e64;
4041 if (pseudoToMCOpcode(NewOpc) == -1)
4042 return nullptr;
4043
4044 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4045 .add(*Dst)
4046 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4047 .add(*Src0)
4048 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4049 .add(*Src1)
4050 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4051 .add(*Src2)
4052 .addImm(Clamp ? Clamp->getImm() : 0)
4053 .addImm(Omod ? Omod->getImm() : 0)
4054 .setMIFlags(MI.getFlags());
4055 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4056 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4057 updateLiveVariables(LV, MI, *MIB);
4058 if (LIS)
4059 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4060 return MIB;
4061}
4062
4063// It's not generally safe to move VALU instructions across these since it will
4064// start using the register as a base index rather than directly.
4065// XXX - Why isn't hasSideEffects sufficient for these?
4066 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4067   switch (MI.getOpcode()) {
4068 case AMDGPU::S_SET_GPR_IDX_ON:
4069 case AMDGPU::S_SET_GPR_IDX_MODE:
4070 case AMDGPU::S_SET_GPR_IDX_OFF:
4071 return true;
4072 default:
4073 return false;
4074 }
4075}
4076
4077 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4078                                        const MachineBasicBlock *MBB,
4079 const MachineFunction &MF) const {
4080   // We skip the base implementation's check for SP writes; that check was
4081   // apparently added only for compile-time reasons.
4082 //
4083 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4084 // but is probably avoidable.
4085
4086 // Copied from base implementation.
4087 // Terminators and labels can't be scheduled around.
4088 if (MI.isTerminator() || MI.isPosition())
4089 return true;
4090
4091 // INLINEASM_BR can jump to another block
4092 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4093 return true;
4094
4095 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4096 return true;
4097
4098 // Target-independent instructions do not have an implicit-use of EXEC, even
4099 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4100 // boundaries prevents incorrect movements of such instructions.
4101 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4102 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4103 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4104 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4105          changesVGPRIndexingMode(MI);
4106 }
4107
4108 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4109   return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4110}
4111
4112 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4113   // Skip the full operand and register alias search that modifiesRegister
4114 // does. There's only a handful of instructions that touch this, it's only an
4115 // implicit def, and doesn't alias any other registers.
4116 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4117}
4118
4119 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4120   unsigned Opcode = MI.getOpcode();
4121
4122 if (MI.mayStore() && isSMRD(MI))
4123 return true; // scalar store or atomic
4124
4125 // This will terminate the function when other lanes may need to continue.
4126 if (MI.isReturn())
4127 return true;
4128
4129 // These instructions cause shader I/O that may cause hardware lockups
4130 // when executed with an empty EXEC mask.
4131 //
4132 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4133 // EXEC = 0, but checking for that case here seems not worth it
4134 // given the typical code patterns.
4135 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4136 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4137 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4138 return true;
4139
4140 if (MI.isCall() || MI.isInlineAsm())
4141 return true; // conservative assumption
4142
4143 // Assume that barrier interactions are only intended with active lanes.
4144 if (isBarrier(Opcode))
4145 return true;
4146
4147 // A mode change is a scalar operation that influences vector instructions.
4148   if (modifiesModeRegister(MI))
4149     return true;
4150
4151 // These are like SALU instructions in terms of effects, so it's questionable
4152 // whether we should return true for those.
4153 //
4154 // However, executing them with EXEC = 0 causes them to operate on undefined
4155 // data, which we avoid by returning true here.
4156 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4157 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4158 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4159 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4160 return true;
4161
4162 return false;
4163}
4164
4165 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4166                               const MachineInstr &MI) const {
4167 if (MI.isMetaInstruction())
4168 return false;
4169
4170 // This won't read exec if this is an SGPR->SGPR copy.
4171 if (MI.isCopyLike()) {
4172 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4173 return true;
4174
4175 // Make sure this isn't copying exec as a normal operand
4176 return MI.readsRegister(AMDGPU::EXEC, &RI);
4177 }
4178
4179 // Make a conservative assumption about the callee.
4180 if (MI.isCall())
4181 return true;
4182
4183 // Be conservative with any unhandled generic opcodes.
4184 if (!isTargetSpecificOpcode(MI.getOpcode()))
4185 return true;
4186
4187 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4188}
4189
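// Note on inline constants (summary, not exhaustive): the hardware encodes a
// small set of operands for free -- integers in [-16, 64] and a handful of FP
// values (0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0, plus 1/(2*pi) where
// hasInv2PiInlineImm() holds). Anything else is a literal that costs an extra
// dword and a constant bus read; the helpers below classify a value per
// operand width and type.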
4190bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4191 switch (Imm.getBitWidth()) {
4192 case 1: // This likely will be a condition code mask.
4193 return true;
4194
4195 case 32:
4196 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4197 ST.hasInv2PiInlineImm());
4198 case 64:
4199 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4200 ST.hasInv2PiInlineImm());
4201 case 16:
4202 return ST.has16BitInsts() &&
4203 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4204 ST.hasInv2PiInlineImm());
4205 default:
4206 llvm_unreachable("invalid bitwidth");
4207 }
4208}
4209
4210 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4211   APInt IntImm = Imm.bitcastToAPInt();
4212 int64_t IntImmVal = IntImm.getSExtValue();
4213 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4214 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4215 default:
4216 llvm_unreachable("invalid fltSemantics");
4217   case APFloatBase::S_IEEEsingle:
4218   case APFloatBase::S_IEEEdouble:
4219     return isInlineConstant(IntImm);
4220   case APFloatBase::S_BFloat:
4221     return ST.has16BitInsts() &&
4222            AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4223   case APFloatBase::S_IEEEhalf:
4224     return ST.has16BitInsts() &&
4225            AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4226 }
4227}
4228
4229 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4230                                    uint8_t OperandType) const {
4231 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4232 if (!MO.isImm())
4233 return false;
4234
4235 // MachineOperand provides no way to tell the true operand size, since it only
4236 // records a 64-bit value. We need to know the size to determine if a 32-bit
4237 // floating point immediate bit pattern is legal for an integer immediate. It
4238 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4239
4240 int64_t Imm = MO.getImm();
4241 switch (OperandType) {
4254 int32_t Trunc = static_cast<int32_t>(Imm);
4255     return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4256   }
4262     return AMDGPU::isInlinableLiteral64(MO.getImm(),
4263                                         ST.hasInv2PiInlineImm());
4267 // We would expect inline immediates to not be concerned with an integer/fp
4268 // distinction. However, in the case of 16-bit integer operations, the
4269   // "floating point" values appear not to work. The hardware seems to read the
4270   // low 16 bits of 32-bit immediates, which happens to always work for integer
4271 // values.
4272 //
4273 // See llvm bugzilla 46302.
4274 //
4275 // TODO: Theoretically we could use op-sel to use the high bits of the
4276 // 32-bit FP values.
4294 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4295 // A few special case instructions have 16-bit operands on subtargets
4296 // where 16-bit instructions are not legal.
4297 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4298 // constants in these cases
4299 int16_t Trunc = static_cast<int16_t>(Imm);
4300 return ST.has16BitInsts() &&
4302 }
4303
4304 return false;
4305 }
4310 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4311 int16_t Trunc = static_cast<int16_t>(Imm);
4312 return ST.has16BitInsts() &&
4314 }
4315 return false;
4316 }
4319 return false;
4322 // Always embedded in the instruction for free.
4323 return true;
4333 // Just ignore anything else.
4334 return true;
4335 default:
4336 llvm_unreachable("invalid operand type");
4337 }
4338}
4339
4340static bool compareMachineOp(const MachineOperand &Op0,
4341 const MachineOperand &Op1) {
4342 if (Op0.getType() != Op1.getType())
4343 return false;
4344
4345 switch (Op0.getType()) {
4346   case MachineOperand::MO_Register:
4347     return Op0.getReg() == Op1.getReg();
4348   case MachineOperand::MO_Immediate:
4349     return Op0.getImm() == Op1.getImm();
4350 default:
4351 llvm_unreachable("Didn't expect to be comparing these operand types");
4352 }
4353}
4354
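// An immediate is legal for operand OpNo if that operand accepts immediates at
// all and either the value is an inline constant for the operand's type or the
// encoding can carry a literal (subject to the VOP3 literal restriction below).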
4355 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4356                                     const MachineOperand &MO) const {
4357 const MCInstrDesc &InstDesc = MI.getDesc();
4358 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4359
4360 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4361
4363 return true;
4364
4365 if (OpInfo.RegClass < 0)
4366 return false;
4367
4368 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4369 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4370 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4371 AMDGPU::OpName::src2))
4372 return false;
4373 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4374 }
4375
4376 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4377 return false;
4378
4379 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4380 return true;
4381
4382 return ST.hasVOP3Literal();
4383}
4384
4385bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4386 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4387 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4388 return false;
4389
4390 int Op32 = AMDGPU::getVOPe32(Opcode);
4391 if (Op32 == -1)
4392 return false;
4393
4394 return pseudoToMCOpcode(Op32) != -1;
4395}
4396
4397bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4398 // The src0_modifier operand is present on all instructions
4399 // that have modifiers.
4400
4401 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4402}
4403
4404 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4405                                   unsigned OpName) const {
4406 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4407 return Mods && Mods->getImm();
4408}
4409
4410 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4411   return any_of(ModifierOpNames,
4412 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4413}
4414
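// A 64-bit encoded VALU instruction can be shrunk to its 32-bit encoding only
// if it has a 32-bit counterpart, uses no source or output modifiers, and its
// operands satisfy the VOP2 constraints checked below (e.g. src1/src2 must be
// plain VGPRs).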
4415 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4416                             const MachineRegisterInfo &MRI) const {
4417 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4418 // Can't shrink instruction with three operands.
4419 if (Src2) {
4420 switch (MI.getOpcode()) {
4421 default: return false;
4422
4423 case AMDGPU::V_ADDC_U32_e64:
4424 case AMDGPU::V_SUBB_U32_e64:
4425 case AMDGPU::V_SUBBREV_U32_e64: {
4426 const MachineOperand *Src1
4427 = getNamedOperand(MI, AMDGPU::OpName::src1);
4428 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4429 return false;
4430 // Additional verification is needed for sdst/src2.
4431 return true;
4432 }
4433 case AMDGPU::V_MAC_F16_e64:
4434 case AMDGPU::V_MAC_F32_e64:
4435 case AMDGPU::V_MAC_LEGACY_F32_e64:
4436 case AMDGPU::V_FMAC_F16_e64:
4437 case AMDGPU::V_FMAC_F16_t16_e64:
4438 case AMDGPU::V_FMAC_F32_e64:
4439 case AMDGPU::V_FMAC_F64_e64:
4440 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4441 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4442 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4443 return false;
4444 break;
4445
4446 case AMDGPU::V_CNDMASK_B32_e64:
4447 break;
4448 }
4449 }
4450
4451 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4452 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4453 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4454 return false;
4455
4456 // We don't need to check src0, all input types are legal, so just make sure
4457 // src0 isn't using any modifiers.
4458 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4459 return false;
4460
4461 // Can it be shrunk to a valid 32 bit opcode?
4462 if (!hasVALU32BitEncoding(MI.getOpcode()))
4463 return false;
4464
4465 // Check output modifiers
4466 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4467 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4468 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
4469}
4470
4471// Set VCC operand with all flags from \p Orig, except for setting it as
4472// implicit.
4473 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4474                                    const MachineOperand &Orig) {
4475
4476 for (MachineOperand &Use : MI.implicit_operands()) {
4477 if (Use.isUse() &&
4478 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4479 Use.setIsUndef(Orig.isUndef());
4480 Use.setIsKill(Orig.isKill());
4481 return;
4482 }
4483 }
4484}
4485
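// Build the 32-bit encoded form of MI using opcode Op32, keeping only the
// operands that exist in the short encoding and letting vcc be re-added
// implicitly (the VOPC def, or V_CNDMASK_B32_e32's carry-in) where required.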
4486 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4487                                            unsigned Op32) const {
4488 MachineBasicBlock *MBB = MI.getParent();
4489
4490 const MCInstrDesc &Op32Desc = get(Op32);
4491 MachineInstrBuilder Inst32 =
4492 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4493 .setMIFlags(MI.getFlags());
4494
4495 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4496 // For VOPC instructions, this is replaced by an implicit def of vcc.
4497
4498 // We assume the defs of the shrunk opcode are in the same order, and the
4499 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4500 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4501 Inst32.add(MI.getOperand(I));
4502
4503 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4504
4505 int Idx = MI.getNumExplicitDefs();
4506 for (const MachineOperand &Use : MI.explicit_uses()) {
4507 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4509 continue;
4510
4511 if (&Use == Src2) {
4512 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4513 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4514 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4515 // of vcc was already added during the initial BuildMI, but we
4516 // 1) may need to change vcc to vcc_lo to preserve the original register
4517 // 2) have to preserve the original flags.
4518 fixImplicitOperands(*Inst32);
4519 copyFlagsToImplicitVCC(*Inst32, *Src2);
4520 continue;
4521 }
4522 }
4523
4524 Inst32.add(Use);
4525 }
4526
4527 // FIXME: Losing implicit operands
4528
4529 return Inst32;
4530}
4531
4532 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4533                                   const MachineOperand &MO,
4534 const MCOperandInfo &OpInfo) const {
4535 // Literal constants use the constant bus.
4536 if (!MO.isReg())
4537 return !isInlineConstant(MO, OpInfo);
4538
4539 if (!MO.isUse())
4540 return false;
4541
4542 if (MO.getReg().isVirtual())
4543 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4544
4545 // Null is free
4546 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4547 return false;
4548
4549 // SGPRs use the constant bus
4550 if (MO.isImplicit()) {
4551 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4552 MO.getReg() == AMDGPU::VCC_LO;
4553 }
4554 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4555 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4556}
4557
4558 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4559   for (const MachineOperand &MO : MI.implicit_operands()) {
4560 // We only care about reads.
4561 if (MO.isDef())
4562 continue;
4563
4564 switch (MO.getReg()) {
4565 case AMDGPU::VCC:
4566 case AMDGPU::VCC_LO:
4567 case AMDGPU::VCC_HI:
4568 case AMDGPU::M0:
4569 case AMDGPU::FLAT_SCR:
4570 return MO.getReg();
4571
4572 default:
4573 break;
4574 }
4575 }
4576
4577 return Register();
4578}
4579
4580static bool shouldReadExec(const MachineInstr &MI) {
4581 if (SIInstrInfo::isVALU(MI)) {
4582 switch (MI.getOpcode()) {
4583 case AMDGPU::V_READLANE_B32:
4584 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4585 case AMDGPU::V_WRITELANE_B32:
4586 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4587 return false;
4588 }
4589
4590 return true;
4591 }
4592
4593 if (MI.isPreISelOpcode() ||
4594 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4595       SIInstrInfo::isSALU(MI) ||
4596       SIInstrInfo::isSMRD(MI))
4597     return false;
4598
4599 return true;
4600}
4601
4602static bool isSubRegOf(const SIRegisterInfo &TRI,
4603 const MachineOperand &SuperVec,
4604 const MachineOperand &SubReg) {
4605 if (SubReg.getReg().isPhysical())
4606 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4607
4608 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4609 SubReg.getReg() == SuperVec.getReg();
4610}
4611
4612 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4613                                     StringRef &ErrInfo) const {
4614 uint16_t Opcode = MI.getOpcode();
4615 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4616 return true;
4617
4618 const MachineFunction *MF = MI.getParent()->getParent();
4619 const MachineRegisterInfo &MRI = MF->getRegInfo();
4620
4621 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4622 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4623 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4624 int Src3Idx = -1;
4625 if (Src0Idx == -1) {
4626 // VOPD V_DUAL_* instructions use different operand names.
4627 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4628 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4629 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4630 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4631 }
4632
4633 // Make sure the number of operands is correct.
4634 const MCInstrDesc &Desc = get(Opcode);
4635 if (!Desc.isVariadic() &&
4636 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4637 ErrInfo = "Instruction has wrong number of operands.";
4638 return false;
4639 }
4640
4641 if (MI.isInlineAsm()) {
4642 // Verify register classes for inlineasm constraints.
4643 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4644 I != E; ++I) {
4645 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4646 if (!RC)
4647 continue;
4648
4649 const MachineOperand &Op = MI.getOperand(I);
4650 if (!Op.isReg())
4651 continue;
4652
4653 Register Reg = Op.getReg();
4654 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4655 ErrInfo = "inlineasm operand has incorrect register class.";
4656 return false;
4657 }
4658 }
4659
4660 return true;
4661 }
4662
4663 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4664 ErrInfo = "missing memory operand from image instruction.";
4665 return false;
4666 }
4667
4668 // Make sure the register classes are correct.
4669 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4670 const MachineOperand &MO = MI.getOperand(i);
4671 if (MO.isFPImm()) {
4672 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4673 "all fp values to integers.";
4674 return false;
4675 }
4676
4677 int RegClass = Desc.operands()[i].RegClass;
4678
4679 switch (Desc.operands()[i].OperandType) {
4680     case MCOI::OPERAND_REGISTER:
4681       if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4682 ErrInfo = "Illegal immediate value for operand.";
4683 return false;
4684 }
4685 break;
4690 break;
4702 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4703 ErrInfo = "Illegal immediate value for operand.";
4704 return false;
4705 }
4706 break;
4707 }
4709 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4710 ErrInfo = "Expected inline constant for operand.";
4711 return false;
4712 }
4713 break;
4716 // Check if this operand is an immediate.
4717 // FrameIndex operands will be replaced by immediates, so they are
4718 // allowed.
4719 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4720 ErrInfo = "Expected immediate, but got non-immediate";
4721 return false;
4722 }
4723 [[fallthrough]];
4724 default:
4725 continue;
4726 }
4727
4728 if (!MO.isReg())
4729 continue;
4730 Register Reg = MO.getReg();
4731 if (!Reg)
4732 continue;
4733
4734 // FIXME: Ideally we would have separate instruction definitions with the
4735 // aligned register constraint.
4736 // FIXME: We do not verify inline asm operands, but custom inline asm
4737 // verification is broken anyway
4738 if (ST.needsAlignedVGPRs()) {
4739 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4740 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4741 const TargetRegisterClass *SubRC =
4742 RI.getSubRegisterClass(RC, MO.getSubReg());
4743 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4744 if (RC)
4745 RC = SubRC;
4746 }
4747
4748 // Check that this is the aligned version of the class.
4749 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4750 ErrInfo = "Subtarget requires even aligned vector registers";
4751 return false;
4752 }
4753 }
4754
4755 if (RegClass != -1) {
4756 if (Reg.isVirtual())
4757 continue;
4758
4759 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4760 if (!RC->contains(Reg)) {
4761 ErrInfo = "Operand has incorrect register class.";
4762 return false;
4763 }
4764 }
4765 }
4766
4767 // Verify SDWA
4768 if (isSDWA(MI)) {
4769 if (!ST.hasSDWA()) {
4770 ErrInfo = "SDWA is not supported on this target";
4771 return false;
4772 }
4773
4774 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4775
4776 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4777 if (OpIdx == -1)
4778 continue;
4779 const MachineOperand &MO = MI.getOperand(OpIdx);
4780
4781 if (!ST.hasSDWAScalar()) {
4782         // Only VGPRs on VI
4783 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4784 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4785 return false;
4786 }
4787 } else {
4788 // No immediates on GFX9
4789 if (!MO.isReg()) {
4790 ErrInfo =
4791 "Only reg allowed as operands in SDWA instructions on GFX9+";
4792 return false;
4793 }
4794 }
4795 }
4796
4797 if (!ST.hasSDWAOmod()) {
4798 // No omod allowed on VI
4799 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4800 if (OMod != nullptr &&
4801 (!OMod->isImm() || OMod->getImm() != 0)) {
4802 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4803 return false;
4804 }
4805 }
4806
4807 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4808 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4809 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4810 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4811 const MachineOperand *Src0ModsMO =
4812 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4813 unsigned Mods = Src0ModsMO->getImm();
4814 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4815 Mods & SISrcMods::SEXT) {
4816 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4817 return false;
4818 }
4819 }
4820
4821 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4822 if (isVOPC(BasicOpcode)) {
4823 if (!ST.hasSDWASdst() && DstIdx != -1) {
4824 // Only vcc allowed as dst on VI for VOPC
4825 const MachineOperand &Dst = MI.getOperand(DstIdx);
4826 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4827 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4828 return false;
4829 }
4830 } else if (!ST.hasSDWAOutModsVOPC()) {
4831 // No clamp allowed on GFX9 for VOPC
4832 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4833 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4834 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4835 return false;
4836 }
4837
4838 // No omod allowed on GFX9 for VOPC
4839 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4840 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4841 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4842 return false;
4843 }
4844 }
4845 }
4846
4847 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4848 if (DstUnused && DstUnused->isImm() &&
4849 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4850 const MachineOperand &Dst = MI.getOperand(DstIdx);
4851 if (!Dst.isReg() || !Dst.isTied()) {
4852 ErrInfo = "Dst register should have tied register";
4853 return false;
4854 }
4855
4856 const MachineOperand &TiedMO =
4857 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4858 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4859 ErrInfo =
4860 "Dst register should be tied to implicit use of preserved register";
4861 return false;
4862 }
4863 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
4864 ErrInfo = "Dst register should use same physical register as preserved";
4865 return false;
4866 }
4867 }
4868 }
4869
4870 // Verify MIMG / VIMAGE / VSAMPLE
4871 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4872     // Ensure that the return type used is large enough for all the options
4873     // being used; TFE/LWE require an extra result register.
4874 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4875 if (DMask) {
4876 uint64_t DMaskImm = DMask->getImm();
4877 uint32_t RegCount =
4878 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4879 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4880 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4881 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4882
4883 // Adjust for packed 16 bit values
4884 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4885 RegCount = divideCeil(RegCount, 2);
4886
4887 // Adjust if using LWE or TFE
4888 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4889 RegCount += 1;
4890
4891 const uint32_t DstIdx =
4892 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4893 const MachineOperand &Dst = MI.getOperand(DstIdx);
4894 if (Dst.isReg()) {
4895 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4896 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4897 if (RegCount > DstSize) {
4898 ErrInfo = "Image instruction returns too many registers for dst "
4899 "register class";
4900 return false;
4901 }
4902 }
4903 }
4904 }
4905
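  // Note on the constant bus accounting below: each distinct SGPR, any literal
  // value, and implicit reads of m0/vcc/flat_scratch each take one slot per
  // instruction. For example, v_add_f32_e64 v0, s0, s1 needs two SGPR reads
  // and is rejected on subtargets whose constant bus limit is 1 (pre-GFX10).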
4906 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4907 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4908 unsigned ConstantBusCount = 0;
4909 bool UsesLiteral = false;
4910 const MachineOperand *LiteralVal = nullptr;
4911
4912 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4913 if (ImmIdx != -1) {
4914 ++ConstantBusCount;
4915 UsesLiteral = true;
4916 LiteralVal = &MI.getOperand(ImmIdx);
4917 }
4918
4919 SmallVector<Register, 2> SGPRsUsed;
4920 Register SGPRUsed;
4921
4922 // Only look at the true operands. Only a real operand can use the constant
4923 // bus, and we don't want to check pseudo-operands like the source modifier
4924 // flags.
4925 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4926 if (OpIdx == -1)
4927 continue;
4928 const MachineOperand &MO = MI.getOperand(OpIdx);
4929 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4930 if (MO.isReg()) {
4931 SGPRUsed = MO.getReg();
4932 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4933 ++ConstantBusCount;
4934 SGPRsUsed.push_back(SGPRUsed);
4935 }
4936 } else {
4937 if (!UsesLiteral) {
4938 ++ConstantBusCount;
4939 UsesLiteral = true;
4940 LiteralVal = &MO;
4941 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4942 assert(isVOP2(MI) || isVOP3(MI));
4943 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4944 return false;
4945 }
4946 }
4947 }
4948 }
4949
4950 SGPRUsed = findImplicitSGPRRead(MI);
4951 if (SGPRUsed) {
4952 // Implicit uses may safely overlap true operands
4953 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4954 return !RI.regsOverlap(SGPRUsed, SGPR);
4955 })) {
4956 ++ConstantBusCount;
4957 SGPRsUsed.push_back(SGPRUsed);
4958 }
4959 }
4960
4961     // v_writelane_b32 is an exception to the constant bus restriction: vsrc0 can
4962     // be an SGPR, a constant, or m0; the lane select an SGPR, m0, or an inline constant.
4963 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4964 Opcode != AMDGPU::V_WRITELANE_B32) {
4965 ErrInfo = "VOP* instruction violates constant bus restriction";
4966 return false;
4967 }
4968
4969 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4970 ErrInfo = "VOP3 instruction uses literal";
4971 return false;
4972 }
4973 }
4974
4975   // Special case for writelane: it is exempt from the constant bus limit checked
4976   // above, but it still must not read more than one distinct SGPR.
4977 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4978 unsigned SGPRCount = 0;
4979 Register SGPRUsed;
4980
4981 for (int OpIdx : {Src0Idx, Src1Idx}) {
4982 if (OpIdx == -1)
4983 break;
4984
4985 const MachineOperand &MO = MI.getOperand(OpIdx);
4986
4987 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4988 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4989 if (MO.getReg() != SGPRUsed)
4990 ++SGPRCount;
4991 SGPRUsed = MO.getReg();
4992 }
4993 }
4994 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4995 ErrInfo = "WRITELANE instruction violates constant bus restriction";
4996 return false;
4997 }
4998 }
4999 }
5000
5001 // Verify misc. restrictions on specific instructions.
5002 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5003 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5004 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5005 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5006 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5007 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5008 if (!compareMachineOp(Src0, Src1) &&
5009 !compareMachineOp(Src0, Src2)) {
5010 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5011 return false;
5012 }
5013 }
5014 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5015 SISrcMods::ABS) ||
5016 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5017 SISrcMods::ABS) ||
5018 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5019 SISrcMods::ABS)) {
5020 ErrInfo = "ABS not allowed in VOP3B instructions";
5021 return false;
5022 }
5023 }
5024
5025 if (isSOP2(MI) || isSOPC(MI)) {
5026 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5027 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5028
5029 if (!Src0.isReg() && !Src1.isReg() &&
5030 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5031 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5032 !Src0.isIdenticalTo(Src1)) {
5033 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5034 return false;
5035 }
5036 }
5037
5038 if (isSOPK(MI)) {
5039 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5040 if (Desc.isBranch()) {
5041 if (!Op->isMBB()) {
5042 ErrInfo = "invalid branch target for SOPK instruction";
5043 return false;
5044 }
5045 } else {
5046 uint64_t Imm = Op->getImm();
5047 if (sopkIsZext(Opcode)) {
5048 if (!isUInt<16>(Imm)) {
5049 ErrInfo = "invalid immediate for SOPK instruction";
5050 return false;
5051 }
5052 } else {
5053 if (!isInt<16>(Imm)) {
5054 ErrInfo = "invalid immediate for SOPK instruction";
5055 return false;
5056 }
5057 }
5058 }
5059 }
5060
5061 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5062 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5063 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5064 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5065 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5066 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5067
5068 const unsigned StaticNumOps =
5069 Desc.getNumOperands() + Desc.implicit_uses().size();
5070 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5071
5072 // Allow additional implicit operands. This allows a fixup done by the post
5073 // RA scheduler where the main implicit operand is killed and implicit-defs
5074 // are added for sub-registers that remain live after this instruction.
5075 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5076 ErrInfo = "missing implicit register operands";
5077 return false;
5078 }
5079
5080 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5081 if (IsDst) {
5082 if (!Dst->isUse()) {
5083 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5084 return false;
5085 }
5086
5087 unsigned UseOpIdx;
5088 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5089 UseOpIdx != StaticNumOps + 1) {
5090 ErrInfo = "movrel implicit operands should be tied";
5091 return false;
5092 }
5093 }
5094
5095 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5096 const MachineOperand &ImpUse
5097 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5098 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5099 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5100 ErrInfo = "src0 should be subreg of implicit vector use";
5101 return false;
5102 }
5103 }
5104
5105 // Make sure we aren't losing exec uses in the td files. This mostly requires
5106 // being careful when using let Uses to try to add other use registers.
5107 if (shouldReadExec(MI)) {
5108 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5109 ErrInfo = "VALU instruction does not implicitly read exec mask";
5110 return false;
5111 }
5112 }
5113
5114 if (isSMRD(MI)) {
5115 if (MI.mayStore() &&
5116         ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5117       // The register offset form of scalar stores may only use m0 as the
5118 // soffset register.
5119 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5120 if (Soff && Soff->getReg() != AMDGPU::M0) {
5121 ErrInfo = "scalar stores must use m0 as offset register";
5122 return false;
5123 }
5124 }
5125 }
5126
5127 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5128 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5129 if (Offset->getImm() != 0) {
5130 ErrInfo = "subtarget does not support offsets in flat instructions";
5131 return false;
5132 }
5133 }
5134
5135 if (isDS(MI) && !ST.hasGDS()) {
5136 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5137 if (GDSOp && GDSOp->getImm() != 0) {
5138 ErrInfo = "GDS is not supported on this subtarget";
5139 return false;
5140 }
5141 }
5142
5143 if (isImage(MI)) {
5144 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5145 if (DimOp) {
5146 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5147 AMDGPU::OpName::vaddr0);
5148 int RSrcOpName =
5149 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5150 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5151 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5152 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5153           AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5154       const AMDGPU::MIMGDimInfo *Dim =
5155           AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5156
5157 if (!Dim) {
5158 ErrInfo = "dim is out of range";
5159 return false;
5160 }
5161
5162 bool IsA16 = false;
5163 if (ST.hasR128A16()) {
5164 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5165 IsA16 = R128A16->getImm() != 0;
5166 } else if (ST.hasA16()) {
5167 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5168 IsA16 = A16->getImm() != 0;
5169 }
5170
5171 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5172
5173 unsigned AddrWords =
5174 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5175
5176 unsigned VAddrWords;
5177 if (IsNSA) {
5178 VAddrWords = RsrcIdx - VAddr0Idx;
5179 if (ST.hasPartialNSAEncoding() &&
5180 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5181 unsigned LastVAddrIdx = RsrcIdx - 1;
5182 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5183 }
5184 } else {
5185 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5186 if (AddrWords > 12)
5187 AddrWords = 16;
5188 }
5189
5190 if (VAddrWords != AddrWords) {
5191 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5192 << " but got " << VAddrWords << "\n");
5193 ErrInfo = "bad vaddr size";
5194 return false;
5195 }
5196 }
5197 }
5198
5199 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5200 if (DppCt) {
5201 using namespace AMDGPU::DPP;
5202
5203 unsigned DC = DppCt->getImm();
5204 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5205 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5206 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5207 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5208 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5209 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5210 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5211 ErrInfo = "Invalid dpp_ctrl value";
5212 return false;
5213 }
5214 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5216 ErrInfo = "Invalid dpp_ctrl value: "
5217 "wavefront shifts are not supported on GFX10+";
5218 return false;
5219 }
5220 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5222 ErrInfo = "Invalid dpp_ctrl value: "
5223 "broadcasts are not supported on GFX10+";
5224 return false;
5225 }
5226 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5228 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5229 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5230 !ST.hasGFX90AInsts()) {
5231 ErrInfo = "Invalid dpp_ctrl value: "
5232 "row_newbroadcast/row_share is not supported before "
5233 "GFX90A/GFX10";
5234 return false;
5235 }
5236 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5237 ErrInfo = "Invalid dpp_ctrl value: "
5238 "row_share and row_xmask are not supported before GFX10";
5239 return false;
5240 }
5241 }
5242
5243 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5245 ErrInfo = "Invalid dpp_ctrl value: "
5246 "DP ALU dpp only support row_newbcast";
5247 return false;
5248 }
5249 }
5250
5251 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5252 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5253 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5254 : AMDGPU::OpName::vdata;
5255 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5256 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5257 if (Data && !Data->isReg())
5258 Data = nullptr;
5259
5260 if (ST.hasGFX90AInsts()) {
5261 if (Dst && Data &&
5262 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5263 ErrInfo = "Invalid register class: "
5264 "vdata and vdst should be both VGPR or AGPR";
5265 return false;
5266 }
5267 if (Data && Data2 &&
5268 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5269 ErrInfo = "Invalid register class: "
5270 "both data operands should be VGPR or AGPR";
5271 return false;
5272 }
5273 } else {
5274 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5275 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5276 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5277 ErrInfo = "Invalid register class: "
5278 "agpr loads and stores not supported on this GPU";
5279 return false;
5280 }
5281 }
5282 }
5283
5284 if (ST.needsAlignedVGPRs()) {
5285 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5286       const MachineOperand *Op = getNamedOperand(MI, OpName);
5287       if (!Op)
5288 return true;
5289 Register Reg = Op->getReg();
5290 if (Reg.isPhysical())
5291 return !(RI.getHWRegIndex(Reg) & 1);
5292 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5293 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5294 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5295 };
5296
5297 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5298 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5299 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5300
5301 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5302 ErrInfo = "Subtarget requires even aligned vector registers "
5303 "for DS_GWS instructions";
5304 return false;
5305 }
5306 }
5307
5308 if (isMIMG(MI)) {
5309 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5310 ErrInfo = "Subtarget requires even aligned vector registers "
5311 "for vaddr operand of image instructions";
5312 return false;
5313 }
5314 }
5315 }
5316
5317 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5318 !ST.hasGFX90AInsts()) {
5319 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5320 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5321 ErrInfo = "Invalid register class: "
5322 "v_accvgpr_write with an SGPR is not supported on this GPU";
5323 return false;
5324 }
5325 }
5326
5327 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5328 const MachineOperand &SrcOp = MI.getOperand(1);
5329 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5330 ErrInfo = "pseudo expects only physical SGPRs";
5331 return false;
5332 }
5333 }
5334
5335 return true;
5336}
5337
5338// It is more readable to list mapped opcodes on the same line.
5339// clang-format off
5340
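// Map a scalar (SALU) opcode to the VALU opcode that computes the same result
// in vector registers, or INSTRUCTION_LIST_END if there is no direct
// equivalent and the caller must expand the operation some other way.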
5341 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5342   switch (MI.getOpcode()) {
5343 default: return AMDGPU::INSTRUCTION_LIST_END;
5344 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5345 case AMDGPU::COPY: return AMDGPU::COPY;
5346 case AMDGPU::PHI: return AMDGPU::PHI;
5347 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5348 case AMDGPU::WQM: return AMDGPU::WQM;
5349 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5350 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5351 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5352 case AMDGPU::S_MOV_B32: {
5353 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5354 return MI.getOperand(1).isReg() ||
5355 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5356 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5357 }
5358 case AMDGPU::S_ADD_I32:
5359 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5360 case AMDGPU::S_ADDC_U32:
5361 return AMDGPU::V_ADDC_U32_e32;
5362 case AMDGPU::S_SUB_I32:
5363 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5364 // FIXME: These are not consistently handled, and selected when the carry is
5365 // used.
5366 case AMDGPU::S_ADD_U32:
5367 return AMDGPU::V_ADD_CO_U32_e32;
5368 case AMDGPU::S_SUB_U32:
5369 return AMDGPU::V_SUB_CO_U32_e32;
5370 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5371 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5372 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5373 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5374 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5375 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5376 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5377 case AMDGPU::S_XNOR_B32:
5378 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5379 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5380 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5381 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5382 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5383 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5384 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5385 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5386 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5387 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5388 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5389 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5390 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5391 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5392 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5393 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5394 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5395 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5396 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5397 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5398 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5399 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5400 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5401 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5402 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5403 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5404 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5405 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5406 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5407 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5408 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5409 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5410 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5411 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5412 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5413 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5414 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5415 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5416 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5417 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5418 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5419 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5420 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5421 case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5422 case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5423 case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
5424 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5425 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5426 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5427 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5428 case AMDGPU::S_CEIL_F16:
5429 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5430 : AMDGPU::V_CEIL_F16_fake16_e64;
5431 case AMDGPU::S_FLOOR_F16:
5432 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5433 : AMDGPU::V_FLOOR_F16_fake16_e64;
5434 case AMDGPU::S_TRUNC_F16:
5435 return AMDGPU::V_TRUNC_F16_fake16_e64;
5436 case AMDGPU::S_RNDNE_F16:
5437 return AMDGPU::V_RNDNE_F16_fake16_e64;
5438 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5439 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5440 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5441 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5442 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5443 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5444 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5445 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5446 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5447 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5448 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5449 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5450 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5451 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5452 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5453 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5454 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
5455 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5456 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5457 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5458 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5459 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5460 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5461 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5462 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5463 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5464 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5465 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5466 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5467 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5468 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5469 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5470 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5471 case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
5472 case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
5473 case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
5474 case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
5475 case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
5476 case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
5477 case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
5478 case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
5479 case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
5480 case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
5481 case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
5482 case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
5483 case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
5484 case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
5485 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5486 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5487 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5488 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5489 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5490 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5491 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5492 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5493 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5494 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5495 }
5497 "Unexpected scalar opcode without corresponding vector one!");
5498}
5499
5500// clang-format on
5501
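// Save the current value of EXEC into Reg and then turn on all bits of EXEC.
// When SCC is live a plain S_MOV pair is used, since S_OR_SAVEEXEC would
// clobber SCC; otherwise a single S_OR_SAVEEXEC is emitted with its SCC def
// marked dead.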
5502void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5503 MachineBasicBlock &MBB,
5504 MachineBasicBlock::iterator MBBI,
5505 const DebugLoc &DL, Register Reg,
5506 bool IsSCCLive,
5507 SlotIndexes *Indexes) const {
5508 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5509 const SIInstrInfo *TII = ST.getInstrInfo();
5510 bool IsWave32 = ST.isWave32();
5511 if (IsSCCLive) {
5512 // Insert two move instructions, one to save the original value of EXEC and
5513 // the other to turn on all bits in EXEC. This is required as we can't use
5514 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5515 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5516 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5517 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5518 .addReg(Exec, RegState::Kill);
5519 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5520 if (Indexes) {
5521 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5522 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5523 }
5524 } else {
5525 const unsigned OrSaveExec =
5526 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5527 auto SaveExec =
5528 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5529 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5530 if (Indexes)
5531 Indexes->insertMachineInstrInMaps(*SaveExec);
5532 }
5533}
5534
5535void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5536 MachineBasicBlock::iterator MBBI,
5537 const DebugLoc &DL, Register Reg,
5538 SlotIndexes *Indexes) const {
5539 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5540 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5541 auto ExecRestoreMI =
5542 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5543 if (Indexes)
5544 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5545}
5546
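// If AGPRs must not (or cannot) be used for this operand, narrow an AV_*
// superclass down to the corresponding VGPR-only class; otherwise keep the
// requested class. The result is additionally constrained to the properly
// aligned variant when the subtarget requires aligned VGPRs.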
5547static const TargetRegisterClass *
5548adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5549 const MachineRegisterInfo &MRI,
5550 const MCInstrDesc &TID, unsigned RCID,
5551 bool IsAllocatable) {
5552 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5553 (((TID.mayLoad() || TID.mayStore()) &&
5554 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5555 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5556 switch (RCID) {
5557 case AMDGPU::AV_32RegClassID:
5558 RCID = AMDGPU::VGPR_32RegClassID;
5559 break;
5560 case AMDGPU::AV_64RegClassID:
5561 RCID = AMDGPU::VReg_64RegClassID;
5562 break;
5563 case AMDGPU::AV_96RegClassID:
5564 RCID = AMDGPU::VReg_96RegClassID;
5565 break;
5566 case AMDGPU::AV_128RegClassID:
5567 RCID = AMDGPU::VReg_128RegClassID;
5568 break;
5569 case AMDGPU::AV_160RegClassID:
5570 RCID = AMDGPU::VReg_160RegClassID;
5571 break;
5572 case AMDGPU::AV_512RegClassID:
5573 RCID = AMDGPU::VReg_512RegClassID;
5574 break;
5575 default:
5576 break;
5577 }
5578 }
5579
5580 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5581}
5582
5583const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5584 unsigned OpNum, const TargetRegisterInfo *TRI,
5585 const MachineFunction &MF)
5586 const {
5587 if (OpNum >= TID.getNumOperands())
5588 return nullptr;
5589 auto RegClass = TID.operands()[OpNum].RegClass;
5590 bool IsAllocatable = false;
5591 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5592 // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
5593 // with two data operands. Request a register class constrained to VGPR only
5594 // if both operands are present, as Machine Copy Propagation cannot check
5595 // this constraint (and possibly other passes cannot either).
5596 //
5597 // The check is limited to FLAT and DS because atomics in non-flat encoding
5598 // have their vdst and vdata tied to be the same register.
5599 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5600 AMDGPU::OpName::vdst);
5601 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5602 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5603 : AMDGPU::OpName::vdata);
5604 if (DataIdx != -1) {
5605 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5606 TID.Opcode, AMDGPU::OpName::data1);
5607 }
5608 }
5609 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5610 IsAllocatable);
5611}
5612
5613const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5614 unsigned OpNo) const {
5615 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5616 const MCInstrDesc &Desc = get(MI.getOpcode());
5617 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5618 Desc.operands()[OpNo].RegClass == -1) {
5619 Register Reg = MI.getOperand(OpNo).getReg();
5620
5621 if (Reg.isVirtual())
5622 return MRI.getRegClass(Reg);
5623 return RI.getPhysRegBaseClass(Reg);
5624 }
5625
5626 unsigned RCID = Desc.operands()[OpNo].RegClass;
5627 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5628}
5629
5630void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5631 MachineBasicBlock::iterator I = MI;
5632 MachineBasicBlock *MBB = MI.getParent();
5633 MachineOperand &MO = MI.getOperand(OpIdx);
5634 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5635 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5636 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5637 unsigned Size = RI.getRegSizeInBits(*RC);
5638 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
5639 if (MO.isReg())
5640 Opcode = AMDGPU::COPY;
5641 else if (RI.isSGPRClass(RC))
5642 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5643
5644 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5645 Register Reg = MRI.createVirtualRegister(VRC);
5646 DebugLoc DL = MBB->findDebugLoc(I);
5647 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5648 MO.ChangeToRegister(Reg, false);
5649}
5650
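// Emit a COPY of the requested subregister of SuperReg into a fresh virtual
// register of class SubRC and return that register.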
5653 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5654 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5655 MachineBasicBlock *MBB = MI->getParent();
5656 DebugLoc DL = MI->getDebugLoc();
5657 Register SubReg = MRI.createVirtualRegister(SubRC);
5658
5659 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5660 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5661 .addReg(SuperReg.getReg(), 0, NewSubIdx);
5662 return SubReg;
5663}
5664
5667 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5668 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5669 if (Op.isImm()) {
5670 if (SubIdx == AMDGPU::sub0)
5671 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5672 if (SubIdx == AMDGPU::sub1)
5673 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5674
5675 llvm_unreachable("Unhandled register index for immediate");
5676 }
5677
5678 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5679 SubIdx, SubRC);
5680 return MachineOperand::CreateReg(SubReg, false);
5681}
5682
5683// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5684void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5685 assert(Inst.getNumExplicitOperands() == 3);
5686 MachineOperand Op1 = Inst.getOperand(1);
5687 Inst.removeOperand(1);
5688 Inst.addOperand(Op1);
5689}
5690
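// Check whether MO is a register operand whose register (including any
// subregister) is compatible with the register class required by OpInfo.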
5691bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5692 const MCOperandInfo &OpInfo,
5693 const MachineOperand &MO) const {
5694 if (!MO.isReg())
5695 return false;
5696
5697 Register Reg = MO.getReg();
5698
5699 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5700 if (Reg.isPhysical())
5701 return DRC->contains(Reg);
5702
5703 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5704
5705 if (MO.getSubReg()) {
5706 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5707 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5708 if (!SuperRC)
5709 return false;
5710
5711 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5712 if (!DRC)
5713 return false;
5714 }
5715 return RC->hasSuperClassEq(DRC);
5716}
5717
5718bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
5719 const MCOperandInfo &OpInfo,
5720 const MachineOperand &MO) const {
5721 if (MO.isReg())
5722 return isLegalRegOperand(MRI, OpInfo, MO);
5723
5724 // Handle non-register types that are treated like immediates.
5725 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5726 return true;
5727}
5728
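// Check whether MO (or the operand currently at OpIdx if MO is null) would be
// a legal operand at OpIdx of MI, enforcing the constant-bus and literal
// limits for VALU instructions as well as AGPR and 64-bit literal
// restrictions.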
5729bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5730 const MachineOperand *MO) const {
5731 const MachineFunction &MF = *MI.getParent()->getParent();
5732 const MachineRegisterInfo &MRI = MF.getRegInfo();
5733 const MCInstrDesc &InstDesc = MI.getDesc();
5734 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5735 const TargetRegisterClass *DefinedRC =
5736 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5737 if (!MO)
5738 MO = &MI.getOperand(OpIdx);
5739
5740 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5741 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5742 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5743 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5744 return false;
5745
5747 if (MO->isReg())
5748 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5749
5750 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5751 if (i == OpIdx)
5752 continue;
5753 const MachineOperand &Op = MI.getOperand(i);
5754 if (Op.isReg()) {
5755 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5756 if (!SGPRsUsed.count(SGPR) &&
5757 // FIXME: This can access off the end of the operands() array.
5758 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5759 if (--ConstantBusLimit <= 0)
5760 return false;
5761 SGPRsUsed.insert(SGPR);
5762 }
5763 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5764 !isInlineConstant(Op, InstDesc.operands()[i])) {
5765 if (!LiteralLimit--)
5766 return false;
5767 if (--ConstantBusLimit <= 0)
5768 return false;
5769 }
5770 }
5771 }
5772
5773 if (MO->isReg()) {
5774 if (!DefinedRC)
5775 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5776 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5777 return false;
5778 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5779 if (IsAGPR && !ST.hasMAIInsts())
5780 return false;
5781 unsigned Opc = MI.getOpcode();
5782 if (IsAGPR &&
5783 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5784 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5785 return false;
5786 // Atomics should have both vdst and vdata either vgpr or agpr.
5787 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5788 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5789 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5790 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5791 MI.getOperand(DataIdx).isReg() &&
5792 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5793 return false;
5794 if ((int)OpIdx == DataIdx) {
5795 if (VDstIdx != -1 &&
5796 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5797 return false;
5798 // DS instructions with 2 src operands also must have tied RC.
5799 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5800 AMDGPU::OpName::data1);
5801 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5802 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5803 return false;
5804 }
5805 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5806 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5807 RI.isSGPRReg(MRI, MO->getReg()))
5808 return false;
5809 return true;
5810 }
5811
5812 if (MO->isImm()) {
5813 uint64_t Imm = MO->getImm();
5814 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5815 bool Is64BitOp = Is64BitFPOp ||
5816 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
5817 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
5818 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
5819 if (Is64BitOp &&
5820 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
5821 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5822 return false;
5823
5824 // FIXME: We can use sign extended 64-bit literals, but only for signed
5825 // operands. At the moment we do not know if an operand is signed.
5826 // Such operand will be encoded as its low 32 bits and then either
5827 // correctly sign extended or incorrectly zero extended by HW.
5828 if (!Is64BitFPOp && (int32_t)Imm < 0)
5829 return false;
5830 }
5831 }
5832
5833 // Handle non-register types that are treated like immediates.
5834 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5835
5836 if (!DefinedRC) {
5837 // This operand expects an immediate.
5838 return true;
5839 }
5840
5841 return isImmOperandLegal(MI, OpIdx, *MO);
5842}
5843
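// Legalize the operands of a VOP2/VOPC instruction: fold implicit SGPR reads
// into the constant-bus budget, fix up V_WRITELANE/V_READLANE lane selects
// with V_READFIRSTLANE, move AGPR sources, and try commuting src0/src1 before
// falling back to inserting a VGPR move for src1.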
5844void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
5845 MachineInstr &MI) const {
5846 unsigned Opc = MI.getOpcode();
5847 const MCInstrDesc &InstrDesc = get(Opc);
5848
5849 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5850 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5851
5852 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5853 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5854
5855 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
5856 // we need to only have one constant bus use before GFX10.
5857 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5858 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5859 RI.isSGPRReg(MRI, Src0.getReg()))
5860 legalizeOpWithMove(MI, Src0Idx);
5861
5862 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5863 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5864 // src0/src1 with V_READFIRSTLANE.
5865 if (Opc == AMDGPU::V_WRITELANE_B32) {
5866 const DebugLoc &DL = MI.getDebugLoc();
5867 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5868 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5869 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5870 .add(Src0);
5871 Src0.ChangeToRegister(Reg, false);
5872 }
5873 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5874 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5875 const DebugLoc &DL = MI.getDebugLoc();
5876 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5877 .add(Src1);
5878 Src1.ChangeToRegister(Reg, false);
5879 }
5880 return;
5881 }
5882
5883 // No VOP2 instructions support AGPRs.
5884 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5885 legalizeOpWithMove(MI, Src0Idx);
5886
5887 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5888 legalizeOpWithMove(MI, Src1Idx);
5889
5890 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5891 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5892 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5893 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5894 legalizeOpWithMove(MI, Src2Idx);
5895 }
5896
5897 // VOP2 instructions accept any operand type for src0, so we don't need to
5898 // check its legality. If src1 is already legal, we don't need to do anything.
5899 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5900 return;
5901
5902 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5903 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5904 // select is uniform.
5905 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5906 RI.isVGPR(MRI, Src1.getReg())) {
5907 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5908 const DebugLoc &DL = MI.getDebugLoc();
5909 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5910 .add(Src1);
5911 Src1.ChangeToRegister(Reg, false);
5912 return;
5913 }
5914
5915 // We do not use commuteInstruction here because it is too aggressive and will
5916 // commute if it is possible. We only want to commute here if it improves
5917 // legality. This can be called a fairly large number of times so don't waste
5918 // compile time pointlessly swapping and checking legality again.
5919 if (HasImplicitSGPR || !MI.isCommutable()) {
5920 legalizeOpWithMove(MI, Src1Idx);
5921 return;
5922 }
5923
5924 // If src0 can be used as src1, commuting will make the operands legal.
5925 // Otherwise we have to give up and insert a move.
5926 //
5927 // TODO: Other immediate-like operand kinds could be commuted if there was a
5928 // MachineOperand::ChangeTo* for them.
5929 if ((!Src1.isImm() && !Src1.isReg()) ||
5930 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
5931 legalizeOpWithMove(MI, Src1Idx);
5932 return;
5933 }
5934
5935 int CommutedOpc = commuteOpcode(MI);
5936 if (CommutedOpc == -1) {
5937 legalizeOpWithMove(MI, Src1Idx);
5938 return;
5939 }
5940
5941 MI.setDesc(get(CommutedOpc));
5942
5943 Register Src0Reg = Src0.getReg();
5944 unsigned Src0SubReg = Src0.getSubReg();
5945 bool Src0Kill = Src0.isKill();
5946
5947 if (Src1.isImm())
5948 Src0.ChangeToImmediate(Src1.getImm());
5949 else if (Src1.isReg()) {
5950 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5951 Src0.setSubReg(Src1.getSubReg());
5952 } else
5953 llvm_unreachable("Should only have register or immediate operands");
5954
5955 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5956 Src1.setSubReg(Src0SubReg);
5957 fixImplicitOperands(MI);
5958}
5959
5960// Legalize VOP3 operands. Any operand type is allowed for any source operand,
5961// but only one literal constant may be used, and only on GFX10 and later.
5962void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
5963 MachineInstr &MI) const {
5964 unsigned Opc = MI.getOpcode();
5965
5966 int VOP3Idx[3] = {
5967 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5968 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5969 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5970 };
5971
5972 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5973 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5974 // src1 and src2 must be scalar
5975 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5976 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5977 const DebugLoc &DL = MI.getDebugLoc();
5978 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5979 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5980 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5981 .add(Src1);
5982 Src1.ChangeToRegister(Reg, false);
5983 }
5984 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5985 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5986 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5987 .add(Src2);
5988 Src2.ChangeToRegister(Reg, false);
5989 }
5990 }
5991
5992 // Find the one SGPR operand we are allowed to use.
5993 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
5994 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
5995 SmallDenseSet<unsigned> SGPRsUsed;
5996 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
5997 if (SGPRReg) {
5998 SGPRsUsed.insert(SGPRReg);
5999 --ConstantBusLimit;
6000 }
6001
6002 for (int Idx : VOP3Idx) {
6003 if (Idx == -1)
6004 break;
6005 MachineOperand &MO = MI.getOperand(Idx);
6006
6007 if (!MO.isReg()) {
6008 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6009 continue;
6010
6011 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6012 --LiteralLimit;
6013 --ConstantBusLimit;
6014 continue;
6015 }
6016
6017 --LiteralLimit;
6018 --ConstantBusLimit;
6019 legalizeOpWithMove(MI, Idx);
6020 continue;
6021 }
6022
6023 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6024 !isOperandLegal(MI, Idx, &MO)) {
6025 legalizeOpWithMove(MI, Idx);
6026 continue;
6027 }
6028
6029 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6030 continue; // VGPRs are legal
6031
6032 // We can use one SGPR in each VOP3 instruction prior to GFX10
6033 // and two starting from GFX10.
6034 if (SGPRsUsed.count(MO.getReg()))
6035 continue;
6036 if (ConstantBusLimit > 0) {
6037 SGPRsUsed.insert(MO.getReg());
6038 --ConstantBusLimit;
6039 continue;
6040 }
6041
6042 // If we make it this far, then the operand is not legal and we must
6043 // legalize it.
6044 legalizeOpWithMove(MI, Idx);
6045 }
6046
6047 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6048 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6049 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6050 legalizeOpWithMove(MI, VOP3Idx[2]);
6051}
6052
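// Produce an SGPR copy of the VGPR value in SrcReg by emitting one
// V_READFIRSTLANE_B32 per 32-bit piece and combining the results with a
// REG_SEQUENCE; returns the new SGPR register.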
6053Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
6054 MachineRegisterInfo &MRI) const {
6055 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6056 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6057 Register DstReg = MRI.createVirtualRegister(SRC);
6058 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6059
6060 if (RI.hasAGPRs(VRC)) {
6061 VRC = RI.getEquivalentVGPRClass(VRC);
6062 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6063 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6064 get(TargetOpcode::COPY), NewSrcReg)
6065 .addReg(SrcReg);
6066 SrcReg = NewSrcReg;
6067 }
6068
6069 if (SubRegs == 1) {
6070 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6071 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6072 .addReg(SrcReg);
6073 return DstReg;
6074 }
6075
6076 SmallVector<Register, 8> SRegs;
6077 for (unsigned i = 0; i < SubRegs; ++i) {
6078 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6079 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6080 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6081 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6082 SRegs.push_back(SGPR);
6083 }
6084
6085 MachineInstrBuilder MIB =
6086 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6087 get(AMDGPU::REG_SEQUENCE), DstReg);
6088 for (unsigned i = 0; i < SubRegs; ++i) {
6089 MIB.addReg(SRegs[i]);
6090 MIB.addImm(RI.getSubRegFromChannel(i));
6091 }
6092 return DstReg;
6093}
6094
6095void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6096 MachineInstr &MI) const {
6097
6098 // If the pointer is stored in VGPRs, then we need to move it to
6099 // SGPRs using v_readfirstlane. This is safe because we only select
6100 // loads with uniform pointers to SMRD instructions, so we know the
6101 // pointer value is uniform.
6102 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6103 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6104 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6105 SBase->setReg(SGPR);
6106 }
6107 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6108 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6109 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6110 SOff->setReg(SGPR);
6111 }
6112}
6113
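// Try to rewrite a FLAT instruction whose saddr operand ended up in a VGPR
// into the corresponding vaddr form: move the pointer into the vaddr slot,
// drop a zero vaddr definition if present, and return true on success.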
6114bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6115 unsigned Opc = Inst.getOpcode();
6116 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6117 if (OldSAddrIdx < 0)
6118 return false;
6119
6120 assert(isSegmentSpecificFLAT(Inst));
6121
6122 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6123 if (NewOpc < 0)
6124 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6125 if (NewOpc < 0)
6126 return false;
6127
6128 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6129 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6130 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6131 return false;
6132
6133 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6134 if (NewVAddrIdx < 0)
6135 return false;
6136
6137 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6138
6139 // Check vaddr, it shall be zero or absent.
6140 MachineInstr *VAddrDef = nullptr;
6141 if (OldVAddrIdx >= 0) {
6142 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6143 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6144 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6145 !VAddrDef->getOperand(1).isImm() ||
6146 VAddrDef->getOperand(1).getImm() != 0)
6147 return false;
6148 }
6149
6150 const MCInstrDesc &NewDesc = get(NewOpc);
6151 Inst.setDesc(NewDesc);
6152
6153 // Callers expect iterator to be valid after this call, so modify the
6154 // instruction in place.
6155 if (OldVAddrIdx == NewVAddrIdx) {
6156 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6157 // Clear use list from the old vaddr holding a zero register.
6158 MRI.removeRegOperandFromUseList(&NewVAddr);
6159 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6160 Inst.removeOperand(OldSAddrIdx);
6161 // Update the use list with the pointer we have just moved from the saddr to
6162 // the vaddr position. Otherwise the new vaddr will be missing from the use list.
6163 MRI.removeRegOperandFromUseList(&NewVAddr);
6164 MRI.addRegOperandToUseList(&NewVAddr);
6165 } else {
6166 assert(OldSAddrIdx == NewVAddrIdx);
6167
6168 if (OldVAddrIdx >= 0) {
6169 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6170 AMDGPU::OpName::vdst_in);
6171
6172 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6173 // it asserts. Untie the operands for now and retie them afterwards.
6174 if (NewVDstIn != -1) {
6175 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6176 Inst.untieRegOperand(OldVDstIn);
6177 }
6178
6179 Inst.removeOperand(OldVAddrIdx);
6180
6181 if (NewVDstIn != -1) {
6182 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6183 Inst.tieOperands(NewVDst, NewVDstIn);
6184 }
6185 }
6186 }
6187
6188 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6189 VAddrDef->eraseFromParent();
6190
6191 return true;
6192}
6193
6194// FIXME: Remove this when SelectionDAG is obsoleted.
6195void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6196 MachineInstr &MI) const {
6197 if (!isSegmentSpecificFLAT(MI))
6198 return;
6199
6200 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6201 // thinks they are uniform, so a readfirstlane should be valid.
6202 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6203 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6204 return;
6205
6206 if (moveFlatAddrToVGPR(MI))
6207 return;
6208
6209 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6210 SAddr->setReg(ToSGPR);
6211}
6212
6213void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6214 MachineBasicBlock::iterator I,
6215 const TargetRegisterClass *DstRC,
6216 MachineOperand &Op,
6217 MachineRegisterInfo &MRI,
6218 const DebugLoc &DL) const {
6219 Register OpReg = Op.getReg();
6220 unsigned OpSubReg = Op.getSubReg();
6221
6222 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6223 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6224
6225 // Check if operand is already the correct register class.
6226 if (DstRC == OpRC)
6227 return;
6228
6229 Register DstReg = MRI.createVirtualRegister(DstRC);
6230 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
6231
6232 Op.setReg(DstReg);
6233 Op.setSubReg(0);
6234
6235 MachineInstr *Def = MRI.getVRegDef(OpReg);
6236 if (!Def)
6237 return;
6238
6239 // Try to eliminate the copy if it is copying an immediate value.
6240 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6241 foldImmediate(*Copy, *Def, OpReg, &MRI);
6242
6243 bool ImpDef = Def->isImplicitDef();
6244 while (!ImpDef && Def && Def->isCopy()) {
6245 if (Def->getOperand(1).getReg().isPhysical())
6246 break;
6247 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6248 ImpDef = Def && Def->isImplicitDef();
6249 }
6250 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6251 !ImpDef)
6252 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6253}
6254
6255// Emit the actual waterfall loop, executing the wrapped instruction for each
6256// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6257// iteration, in the worst case we execute 64 (once per lane).
6260 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
6261 ArrayRef<MachineOperand *> ScalarOps) {
6262 MachineFunction &MF = *OrigBB.getParent();
6263 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6264 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6265 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6266 unsigned SaveExecOpc =
6267 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6268 unsigned XorTermOpc =
6269 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6270 unsigned AndOpc =
6271 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6272 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6273
6274 MachineBasicBlock::iterator I = LoopBB.begin();
6275
6276 SmallVector<Register, 8> ReadlanePieces;
6277 Register CondReg;
6278
6279 for (MachineOperand *ScalarOp : ScalarOps) {
6280 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6281 unsigned NumSubRegs = RegSize / 32;
6282 Register VScalarOp = ScalarOp->getReg();
6283
6284 if (NumSubRegs == 1) {
6285 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6286
6287 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6288 .addReg(VScalarOp);
6289
6290 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6291
6292 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6293 .addReg(CurReg)
6294 .addReg(VScalarOp);
6295
6296 // Combine the comparison results with AND.
6297 if (!CondReg) // First.
6298 CondReg = NewCondReg;
6299 else { // If not the first, we create an AND.
6300 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6301 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6302 .addReg(CondReg)
6303 .addReg(NewCondReg);
6304 CondReg = AndReg;
6305 }
6306
6307 // Update ScalarOp operand to use the SGPR ScalarOp.
6308 ScalarOp->setReg(CurReg);
6309 ScalarOp->setIsKill();
6310 } else {
6311 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6312 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6313 "Unhandled register size");
6314
6315 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6316 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6317 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6318
6319 // Read the next variant <- also loop target.
6320 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6321 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6322
6323 // Read the next variant <- also loop target.
6324 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6325 .addReg(VScalarOp, VScalarOpUndef,
6326 TRI->getSubRegFromChannel(Idx + 1));
6327
6328 ReadlanePieces.push_back(CurRegLo);
6329 ReadlanePieces.push_back(CurRegHi);
6330
6331 // Comparison is to be done as 64-bit.
6332 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6333 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6334 .addReg(CurRegLo)
6335 .addImm(AMDGPU::sub0)
6336 .addReg(CurRegHi)
6337 .addImm(AMDGPU::sub1);
6338
6339 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6340 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6341 NewCondReg)
6342 .addReg(CurReg);
6343 if (NumSubRegs <= 2)
6344 Cmp.addReg(VScalarOp);
6345 else
6346 Cmp.addReg(VScalarOp, VScalarOpUndef,
6347 TRI->getSubRegFromChannel(Idx, 2));
6348
6349 // Combine the comparison results with AND.
6350 if (!CondReg) // First.
6351 CondReg = NewCondReg;
6352 else { // If not the first, we create an AND.
6353 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6354 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6355 .addReg(CondReg)
6356 .addReg(NewCondReg);
6357 CondReg = AndReg;
6358 }
6359 } // End for loop.
6360
6361 auto SScalarOpRC =
6362 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6363 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6364
6365 // Build scalar ScalarOp.
6366 auto Merge =
6367 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6368 unsigned Channel = 0;
6369 for (Register Piece : ReadlanePieces) {
6370 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6371 }
6372
6373 // Update ScalarOp operand to use the SGPR ScalarOp.
6374 ScalarOp->setReg(SScalarOp);
6375 ScalarOp->setIsKill();
6376 }
6377 }
6378
6379 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6380 MRI.setSimpleHint(SaveExec, CondReg);
6381
6382 // Update EXEC to matching lanes, saving original to SaveExec.
6383 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6384 .addReg(CondReg, RegState::Kill);
6385
6386 // The original instruction is here; we insert the terminators after it.
6387 I = BodyBB.end();
6388
6389 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6390 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6391 .addReg(Exec)
6392 .addReg(SaveExec);
6393
6394 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6395}
6396
6397// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6398// with SGPRs by iterating over all unique values across all lanes.
6399// Returns the loop basic block that now contains \p MI.
6400static MachineBasicBlock *
6401loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6402 ArrayRef<MachineOperand *> ScalarOps,
6403 MachineDominatorTree *MDT,
6404 MachineBasicBlock::iterator Begin = nullptr,
6405 MachineBasicBlock::iterator End = nullptr) {
6406 MachineBasicBlock &MBB = *MI.getParent();
6407 MachineFunction &MF = *MBB.getParent();
6408 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6409 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6410 MachineRegisterInfo &MRI = MF.getRegInfo();
6411 if (!Begin.isValid())
6412 Begin = &MI;
6413 if (!End.isValid()) {
6414 End = &MI;
6415 ++End;
6416 }
6417 const DebugLoc &DL = MI.getDebugLoc();
6418 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6419 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6420 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6421
6422 // Save SCC. Waterfall Loop may overwrite SCC.
6423 Register SaveSCCReg;
6424
6425 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6426 // rather than doing an unbounded liveness scan here.
6427 bool SCCNotDead =
6428 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6429 std::numeric_limits<unsigned>::max()) !=
6430 MachineBasicBlock::LQR_Dead;
6431 if (SCCNotDead) {
6432 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6433 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6434 .addImm(1)
6435 .addImm(0);
6436 }
6437
6438 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6439
6440 // Save the EXEC mask
6441 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6442
6443 // Killed uses in the instruction we are waterfalling around will be
6444 // incorrect due to the added control-flow.
6445 MachineBasicBlock::iterator AfterMI = MI;
6446 ++AfterMI;
6447 for (auto I = Begin; I != AfterMI; I++) {
6448 for (auto &MO : I->all_uses())
6449 MRI.clearKillFlags(MO.getReg());
6450 }
6451
6452 // To insert the loop we need to split the block. Move everything after this
6453 // point to a new block, and insert a new empty block between the two.
6454 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6455 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6456 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6457 MachineFunction::iterator MBBI(MBB);
6458 ++MBBI;
6459
6460 MF.insert(MBBI, LoopBB);
6461 MF.insert(MBBI, BodyBB);
6462 MF.insert(MBBI, RemainderBB);
6463
6464 LoopBB->addSuccessor(BodyBB);
6465 BodyBB->addSuccessor(LoopBB);
6466 BodyBB->addSuccessor(RemainderBB);
6467
6468 // Move the instructions from Begin through MI into BodyBB, and the
6469 // remainder of the block into RemainderBB.
6470 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6471 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6472 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6473
6474 MBB.addSuccessor(LoopBB);
6475
6476 // Update dominators. We know that MBB immediately dominates LoopBB, that
6477 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6478 // RemainderBB. RemainderBB immediately dominates all of the successors
6479 // transferred to it from MBB that MBB used to properly dominate.
6480 if (MDT) {
6481 MDT->addNewBlock(LoopBB, &MBB);
6482 MDT->addNewBlock(BodyBB, LoopBB);
6483 MDT->addNewBlock(RemainderBB, BodyBB);
6484 for (auto &Succ : RemainderBB->successors()) {
6485 if (MDT->properlyDominates(&MBB, Succ)) {
6486 MDT->changeImmediateDominator(Succ, RemainderBB);
6487 }
6488 }
6489 }
6490
6491 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
6492
6493 MachineBasicBlock::iterator First = RemainderBB->begin();
6494 // Restore SCC
6495 if (SCCNotDead) {
6496 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6497 .addReg(SaveSCCReg, RegState::Kill)
6498 .addImm(0);
6499 }
6500
6501 // Restore the EXEC mask
6502 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6503 return BodyBB;
6504}
6505
6506// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6507static std::tuple<unsigned, unsigned>
6509 MachineBasicBlock &MBB = *MI.getParent();
6510 MachineFunction &MF = *MBB.getParent();
6511 MachineRegisterInfo &MRI = MF.getRegInfo();
6512
6513 // Extract the ptr from the resource descriptor.
6514 unsigned RsrcPtr =
6515 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6516 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6517
6518 // Create an empty resource descriptor
6519 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6520 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6521 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6522 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6523 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6524
6525 // Zero64 = 0
6526 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6527 .addImm(0);
6528
6529 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6530 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6531 .addImm(RsrcDataFormat & 0xFFFFFFFF);
6532
6533 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6534 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6535 .addImm(RsrcDataFormat >> 32);
6536
6537 // NewSRsrc = {Zero64, SRsrcFormat}
6538 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6539 .addReg(Zero64)
6540 .addImm(AMDGPU::sub0_sub1)
6541 .addReg(SRsrcFormatLo)
6542 .addImm(AMDGPU::sub2)
6543 .addReg(SRsrcFormatHi)
6544 .addImm(AMDGPU::sub3);
6545
6546 return std::tuple(RsrcPtr, NewSRsrc);
6547}
6548
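// Legalize all operands of MI so that it can be encoded: dispatch to the
// per-format legalizers (VOP2/VOPC, VOP3, SMRD, FLAT), fix up the register
// classes of PHI, REG_SEQUENCE and INSERT_SUBREG, and fall back to a
// waterfall loop for MUBUF/MTBUF/image instructions with VGPR rsrc or soffset
// operands. Returns the new basic block containing MI if one was created.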
6549MachineBasicBlock *
6550SIInstrInfo::legalizeOperands(MachineInstr &MI,
6551 MachineDominatorTree *MDT) const {
6552 MachineFunction &MF = *MI.getParent()->getParent();
6553 MachineRegisterInfo &MRI = MF.getRegInfo();
6554 MachineBasicBlock *CreatedBB = nullptr;
6555
6556 // Legalize VOP2
6557 if (isVOP2(MI) || isVOPC(MI)) {
6558 legalizeOperandsVOP2(MRI, MI);
6559 return CreatedBB;
6560 }
6561
6562 // Legalize VOP3
6563 if (isVOP3(MI)) {
6564 legalizeOperandsVOP3(MRI, MI);
6565 return CreatedBB;
6566 }
6567
6568 // Legalize SMRD
6569 if (isSMRD(MI)) {
6570 legalizeOperandsSMRD(MRI, MI);
6571 return CreatedBB;
6572 }
6573
6574 // Legalize FLAT
6575 if (isFLAT(MI)) {
6576 legalizeOperandsFLAT(MRI, MI);
6577 return CreatedBB;
6578 }
6579
6580 // Legalize REG_SEQUENCE and PHI
6581 // The register class of the operands must be the same type as the register
6582 // class of the output.
6583 if (MI.getOpcode() == AMDGPU::PHI) {
6584 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6585 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6586 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6587 continue;
6588 const TargetRegisterClass *OpRC =
6589 MRI.getRegClass(MI.getOperand(i).getReg());
6590 if (RI.hasVectorRegisters(OpRC)) {
6591 VRC = OpRC;
6592 } else {
6593 SRC = OpRC;
6594 }
6595 }
6596
6597 // If any of the operands are VGPR registers, then they all must be VGPRs;
6598 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6599 // them.
6600 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6601 if (!VRC) {
6602 assert(SRC);
6603 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6604 VRC = &AMDGPU::VReg_1RegClass;
6605 } else
6606 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6607 ? RI.getEquivalentAGPRClass(SRC)
6608 : RI.getEquivalentVGPRClass(SRC);
6609 } else {
6610 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6611 ? RI.getEquivalentAGPRClass(VRC)
6612 : RI.getEquivalentVGPRClass(VRC);
6613 }
6614 RC = VRC;
6615 } else {
6616 RC = SRC;
6617 }
6618
6619 // Update all the operands so they have the same type.
6620 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6621 MachineOperand &Op = MI.getOperand(I);
6622 if (!Op.isReg() || !Op.getReg().isVirtual())
6623 continue;
6624
6625 // MI is a PHI instruction.
6626 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6627 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6628
6629 // Avoid creating no-op copies with the same src and dst reg class. These
6630 // confuse some of the machine passes.
6631 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6632 }
6633 }
6634
6635 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6636 // VGPR dest type and SGPR sources, insert copies so all operands are
6637 // VGPRs. This seems to help operand folding / the register coalescer.
6638 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6639 MachineBasicBlock *MBB = MI.getParent();
6640 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6641 if (RI.hasVGPRs(DstRC)) {
6642 // Update all the operands so they are VGPR register classes. These may
6643 // not be the same register class because REG_SEQUENCE supports mixing
6644 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6645 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6646 MachineOperand &Op = MI.getOperand(I);
6647 if (!Op.isReg() || !Op.getReg().isVirtual())
6648 continue;
6649
6650 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6651 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6652 if (VRC == OpRC)
6653 continue;
6654
6655 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6656 Op.setIsKill();
6657 }
6658 }
6659
6660 return CreatedBB;
6661 }
6662
6663 // Legalize INSERT_SUBREG
6664 // src0 must have the same register class as dst
6665 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6666 Register Dst = MI.getOperand(0).getReg();
6667 Register Src0 = MI.getOperand(1).getReg();
6668 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6669 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6670 if (DstRC != Src0RC) {
6671 MachineBasicBlock *MBB = MI.getParent();
6672 MachineOperand &Op = MI.getOperand(1);
6673 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6674 }
6675 return CreatedBB;
6676 }
6677
6678 // Legalize SI_INIT_M0
6679 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6680 MachineOperand &Src = MI.getOperand(0);
6681 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6682 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6683 return CreatedBB;
6684 }
6685
6686 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6687 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6688 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6689 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6690 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6691 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
6692 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
6693 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
6694 MachineOperand &Src = MI.getOperand(1);
6695 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6696 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6697 return CreatedBB;
6698 }
6699
6700 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6701 //
6702 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6703 // scratch memory access. In both cases, the legalization never involves
6704 // conversion to the addr64 form.
6705 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
6706 (isMUBUF(MI) || isMTBUF(MI)))) {
6707 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6708 : AMDGPU::OpName::srsrc;
6709 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6710 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6711 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6712
6713 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6714 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6715 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6716 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6717
6718 return CreatedBB;
6719 }
6720
6721 // Legalize SI_CALL
6722 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6723 MachineOperand *Dest = &MI.getOperand(0);
6724 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6725 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well as
6726 // the following copies; we also need to move copies from and to physical
6727 // registers into the loop block.
6728 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6729 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6730
6731 // Also move the copies to physical registers into the loop block
6732 MachineBasicBlock &MBB = *MI.getParent();
6733 MachineBasicBlock::iterator Start(&MI);
6734 while (Start->getOpcode() != FrameSetupOpcode)
6735 --Start;
6736 MachineBasicBlock::iterator End(&MI);
6737 while (End->getOpcode() != FrameDestroyOpcode)
6738 ++End;
6739 // Also include following copies of the return value
6740 ++End;
6741 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6742 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6743 ++End;
6744 CreatedBB =
6745 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6746 }
6747 }
6748
6749 // Legalize s_sleep_var.
6750 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6751 const DebugLoc &DL = MI.getDebugLoc();
6752 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6753 int Src0Idx =
6754 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6755 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6756 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6757 .add(Src0);
6758 Src0.ChangeToRegister(Reg, false);
6759 return nullptr;
6760 }
6761
6762 // Legalize MUBUF instructions.
6763 bool isSoffsetLegal = true;
6764 int SoffsetIdx =
6765 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6766 if (SoffsetIdx != -1) {
6767 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6768 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6769 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6770 isSoffsetLegal = false;
6771 }
6772 }
6773
6774 bool isRsrcLegal = true;
6775 int RsrcIdx =
6776 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6777 if (RsrcIdx != -1) {
6778 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6779 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6780 isRsrcLegal = false;
6781 }
6782 }
6783
6784 // The operands are legal.
6785 if (isRsrcLegal && isSoffsetLegal)
6786 return CreatedBB;
6787
6788 if (!isRsrcLegal) {
6789 // Legalize a VGPR Rsrc
6790 //
6791 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6792 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6793 // a zero-value SRsrc.
6794 //
6795 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6796 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6797 // above.
6798 //
6799 // Otherwise we are on non-ADDR64 hardware, and/or we have
6800 // idxen/offen/bothen and we fall back to a waterfall loop.
6801
6802 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6803 MachineBasicBlock &MBB = *MI.getParent();
6804
6805 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6806 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6807 // This is already an ADDR64 instruction so we need to add the pointer
6808 // extracted from the resource descriptor to the current value of VAddr.
6809 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6810 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6811 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6812
6813 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6814 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6815 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6816
6817 unsigned RsrcPtr, NewSRsrc;
6818 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6819
6820 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6821 const DebugLoc &DL = MI.getDebugLoc();
6822 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6823 .addDef(CondReg0)
6824 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6825 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6826 .addImm(0);
6827
6828 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6829 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6830 .addDef(CondReg1, RegState::Dead)
6831 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6832 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6833 .addReg(CondReg0, RegState::Kill)
6834 .addImm(0);
6835
6836 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6837 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6838 .addReg(NewVAddrLo)
6839 .addImm(AMDGPU::sub0)
6840 .addReg(NewVAddrHi)
6841 .addImm(AMDGPU::sub1);
6842
6843 VAddr->setReg(NewVAddr);
6844 Rsrc->setReg(NewSRsrc);
6845 } else if (!VAddr && ST.hasAddr64()) {
6846 // This instruction is the _OFFSET variant, so we need to convert it to
6847 // ADDR64.
6849 "FIXME: Need to emit flat atomics here");
6850
6851 unsigned RsrcPtr, NewSRsrc;
6852 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6853
6854 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6855 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6856 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6857 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6858 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6859
6860 // Atomics with return have an additional tied operand and are
6861 // missing some of the special bits.
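// (Concretely, the _RTN atomic path below carries the tied vdata_in operand
// and only re-adds cpol, while the plain load/store path also re-adds the
// tfe and swz operands.)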
6862 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6863 MachineInstr *Addr64;
6864
6865 if (!VDataIn) {
6866 // Regular buffer load / store.
6867 MachineInstrBuilder MIB =
6868 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6869 .add(*VData)
6870 .addReg(NewVAddr)
6871 .addReg(NewSRsrc)
6872 .add(*SOffset)
6873 .add(*Offset);
6874
6875 if (const MachineOperand *CPol =
6876 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6877 MIB.addImm(CPol->getImm());
6878 }
6879
6880 if (const MachineOperand *TFE =
6881 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6882 MIB.addImm(TFE->getImm());
6883 }
6884
6885 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6886
6887 MIB.cloneMemRefs(MI);
6888 Addr64 = MIB;
6889 } else {
6890 // Atomics with return.
6891 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6892 .add(*VData)
6893 .add(*VDataIn)
6894 .addReg(NewVAddr)
6895 .addReg(NewSRsrc)
6896 .add(*SOffset)
6897 .add(*Offset)
6898 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6899 .cloneMemRefs(MI);
6900 }
6901
6902 MI.removeFromParent();
6903
6904 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6905 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6906 NewVAddr)
6907 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6908 .addImm(AMDGPU::sub0)
6909 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6910 .addImm(AMDGPU::sub1);
6911 } else {
6912 // Legalize a VGPR Rsrc and soffset together.
6913 if (!isSoffsetLegal) {
6914 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6915 CreatedBB =
6916 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6917 return CreatedBB;
6918 }
6919 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
6920 return CreatedBB;
6921 }
6922 }
6923
6924 // Legalize a VGPR soffset.
6925 if (!isSoffsetLegal) {
6926 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6927 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
6928 return CreatedBB;
6929 }
6930 return CreatedBB;
6931}
6932
6933 void SIInstrWorklist::insert(MachineInstr *MI) {
6934 InstrList.insert(MI);
6935 // Add MBUF instructions to the deferred list.
6936 int RsrcIdx =
6937 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6938 if (RsrcIdx != -1) {
6939 DeferredList.insert(MI);
6940 }
6941}
6942
6943 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
6944 return DeferredList.contains(MI);
6945}
6946
6947 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
6948 MachineDominatorTree *MDT) const {
6949
6950 while (!Worklist.empty()) {
6951 MachineInstr &Inst = *Worklist.top();
6952 Worklist.erase_top();
6953 // Skip MachineInstr in the deferred list.
6954 if (Worklist.isDeferred(&Inst))
6955 continue;
6956 moveToVALUImpl(Worklist, MDT, Inst);
6957 }
6958
6959 // The deferred list of instructions is processed once all the
6960 // MachineInstrs in the worklist are done.
6961 for (MachineInstr *Inst : Worklist.getDeferredList()) {
6962 moveToVALUImpl(Worklist, MDT, *Inst);
6963 assert(Worklist.empty() &&
6964 "Deferred MachineInstr are not supposed to re-populate worklist");
6965 }
6966}
6967
6968 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
6969 MachineDominatorTree *MDT,
6970 MachineInstr &Inst) const {
6971
6972 MachineBasicBlock *MBB = Inst.getParent();
6973 if (!MBB)
6974 return;
6975 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6976 unsigned Opcode = Inst.getOpcode();
6977 unsigned NewOpcode = getVALUOp(Inst);
6978 // Handle some special cases
6979 switch (Opcode) {
6980 default:
6981 break;
6982 case AMDGPU::S_ADD_U64_PSEUDO:
6983 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
6984 break;
6985 case AMDGPU::S_SUB_U64_PSEUDO:
6986 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
6987 break;
6988 case AMDGPU::S_ADD_I32:
6989 case AMDGPU::S_SUB_I32: {
6990 // FIXME: The u32 versions currently selected use the carry.
6991 bool Changed;
6992 MachineBasicBlock *CreatedBBTmp = nullptr;
6993 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
6994 if (Changed)
6995 return;
6996
6997 // Default handling
6998 break;
6999 }
7000
7001 case AMDGPU::S_MUL_U64:
7002 // Split s_mul_u64 into 32-bit vector multiplications.
7003 splitScalarSMulU64(Worklist, Inst, MDT);
7004 Inst.eraseFromParent();
7005 return;
7006
7007 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7008 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7009 // This is a special case of s_mul_u64 where all the operands are either
7010 // zero extended or sign extended.
7011 splitScalarSMulPseudo(Worklist, Inst, MDT);
7012 Inst.eraseFromParent();
7013 return;
7014
7015 case AMDGPU::S_AND_B64:
7016 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7017 Inst.eraseFromParent();
7018 return;
7019
7020 case AMDGPU::S_OR_B64:
7021 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7022 Inst.eraseFromParent();
7023 return;
7024
7025 case AMDGPU::S_XOR_B64:
7026 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7027 Inst.eraseFromParent();
7028 return;
7029
7030 case AMDGPU::S_NAND_B64:
7031 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7032 Inst.eraseFromParent();
7033 return;
7034
7035 case AMDGPU::S_NOR_B64:
7036 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7037 Inst.eraseFromParent();
7038 return;
7039
7040 case AMDGPU::S_XNOR_B64:
7041 if (ST.hasDLInsts())
7042 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7043 else
7044 splitScalar64BitXnor(Worklist, Inst, MDT);
7045 Inst.eraseFromParent();
7046 return;
7047
7048 case AMDGPU::S_ANDN2_B64:
7049 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7050 Inst.eraseFromParent();
7051 return;
7052
7053 case AMDGPU::S_ORN2_B64:
7054 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7055 Inst.eraseFromParent();
7056 return;
7057
7058 case AMDGPU::S_BREV_B64:
7059 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7060 Inst.eraseFromParent();
7061 return;
7062
7063 case AMDGPU::S_NOT_B64:
7064 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7065 Inst.eraseFromParent();
7066 return;
7067
7068 case AMDGPU::S_BCNT1_I32_B64:
7069 splitScalar64BitBCNT(Worklist, Inst);
7070 Inst.eraseFromParent();
7071 return;
7072
7073 case AMDGPU::S_BFE_I64:
7074 splitScalar64BitBFE(Worklist, Inst);
7075 Inst.eraseFromParent();
7076 return;
7077
7078 case AMDGPU::S_FLBIT_I32_B64:
7079 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7080 Inst.eraseFromParent();
7081 return;
7082 case AMDGPU::S_FF1_I32_B64:
7083 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7084 Inst.eraseFromParent();
7085 return;
7086
7087 case AMDGPU::S_LSHL_B32:
7088 if (ST.hasOnlyRevVALUShifts()) {
7089 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7090 swapOperands(Inst);
7091 }
7092 break;
7093 case AMDGPU::S_ASHR_I32:
7094 if (ST.hasOnlyRevVALUShifts()) {
7095 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7096 swapOperands(Inst);
7097 }
7098 break;
7099 case AMDGPU::S_LSHR_B32:
7100 if (ST.hasOnlyRevVALUShifts()) {
7101 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7102 swapOperands(Inst);
7103 }
7104 break;
7105 case AMDGPU::S_LSHL_B64:
7106 if (ST.hasOnlyRevVALUShifts()) {
7107 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7108 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7109 : AMDGPU::V_LSHLREV_B64_e64;
7110 swapOperands(Inst);
7111 }
7112 break;
7113 case AMDGPU::S_ASHR_I64:
7114 if (ST.hasOnlyRevVALUShifts()) {
7115 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7116 swapOperands(Inst);
7117 }
7118 break;
7119 case AMDGPU::S_LSHR_B64:
7120 if (ST.hasOnlyRevVALUShifts()) {
7121 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7122 swapOperands(Inst);
7123 }
7124 break;
7125
7126 case AMDGPU::S_ABS_I32:
7127 lowerScalarAbs(Worklist, Inst);
7128 Inst.eraseFromParent();
7129 return;
7130
7131 case AMDGPU::S_CBRANCH_SCC0:
7132 case AMDGPU::S_CBRANCH_SCC1: {
7133 // Clear unused bits of vcc
7134 Register CondReg = Inst.getOperand(1).getReg();
7135 bool IsSCC = CondReg == AMDGPU::SCC;
7136 Register VCC = RI.getVCC();
7137 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7138 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7139 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7140 .addReg(EXEC)
7141 .addReg(IsSCC ? VCC : CondReg);
7142 Inst.removeOperand(1);
7143 } break;
7144
7145 case AMDGPU::S_BFE_U64:
7146 case AMDGPU::S_BFM_B64:
7147 llvm_unreachable("Moving this op to VALU not implemented");
7148
7149 case AMDGPU::S_PACK_LL_B32_B16:
7150 case AMDGPU::S_PACK_LH_B32_B16:
7151 case AMDGPU::S_PACK_HL_B32_B16:
7152 case AMDGPU::S_PACK_HH_B32_B16:
7153 movePackToVALU(Worklist, MRI, Inst);
7154 Inst.eraseFromParent();
7155 return;
7156
7157 case AMDGPU::S_XNOR_B32:
7158 lowerScalarXnor(Worklist, Inst);
7159 Inst.eraseFromParent();
7160 return;
7161
7162 case AMDGPU::S_NAND_B32:
7163 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7164 Inst.eraseFromParent();
7165 return;
7166
7167 case AMDGPU::S_NOR_B32:
7168 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7169 Inst.eraseFromParent();
7170 return;
7171
7172 case AMDGPU::S_ANDN2_B32:
7173 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7174 Inst.eraseFromParent();
7175 return;
7176
7177 case AMDGPU::S_ORN2_B32:
7178 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7179 Inst.eraseFromParent();
7180 return;
7181
7182 // TODO: remove as soon as everything is ready
7183 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7184 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7185 // can only be selected from the uniform SDNode.
7186 case AMDGPU::S_ADD_CO_PSEUDO:
7187 case AMDGPU::S_SUB_CO_PSEUDO: {
7188 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7189 ? AMDGPU::V_ADDC_U32_e64
7190 : AMDGPU::V_SUBB_U32_e64;
7191 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7192
7193 Register CarryInReg = Inst.getOperand(4).getReg();
7194 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7195 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7196 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7197 .addReg(CarryInReg);
7198 }
7199
7200 Register CarryOutReg = Inst.getOperand(1).getReg();
7201
7202 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7203 MRI.getRegClass(Inst.getOperand(0).getReg())));
7204 MachineInstr *CarryOp =
7205 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7206 .addReg(CarryOutReg, RegState::Define)
7207 .add(Inst.getOperand(2))
7208 .add(Inst.getOperand(3))
7209 .addReg(CarryInReg)
7210 .addImm(0);
7211 legalizeOperands(*CarryOp);
7212 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7213 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7214 Inst.eraseFromParent();
7215 }
7216 return;
7217 case AMDGPU::S_UADDO_PSEUDO:
7218 case AMDGPU::S_USUBO_PSEUDO: {
7219 const DebugLoc &DL = Inst.getDebugLoc();
7220 MachineOperand &Dest0 = Inst.getOperand(0);
7221 MachineOperand &Dest1 = Inst.getOperand(1);
7222 MachineOperand &Src0 = Inst.getOperand(2);
7223 MachineOperand &Src1 = Inst.getOperand(3);
7224
7225 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7226 ? AMDGPU::V_ADD_CO_U32_e64
7227 : AMDGPU::V_SUB_CO_U32_e64;
7228 const TargetRegisterClass *NewRC =
7229 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7230 Register DestReg = MRI.createVirtualRegister(NewRC);
7231 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7232 .addReg(Dest1.getReg(), RegState::Define)
7233 .add(Src0)
7234 .add(Src1)
7235 .addImm(0); // clamp bit
7236
7237 legalizeOperands(*NewInstr, MDT);
7238 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7239 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7240 Worklist);
7241 Inst.eraseFromParent();
7242 }
7243 return;
7244
7245 case AMDGPU::S_CSELECT_B32:
7246 case AMDGPU::S_CSELECT_B64:
7247 lowerSelect(Worklist, Inst, MDT);
7248 Inst.eraseFromParent();
7249 return;
7250 case AMDGPU::S_CMP_EQ_I32:
7251 case AMDGPU::S_CMP_LG_I32:
7252 case AMDGPU::S_CMP_GT_I32:
7253 case AMDGPU::S_CMP_GE_I32:
7254 case AMDGPU::S_CMP_LT_I32:
7255 case AMDGPU::S_CMP_LE_I32:
7256 case AMDGPU::S_CMP_EQ_U32:
7257 case AMDGPU::S_CMP_LG_U32:
7258 case AMDGPU::S_CMP_GT_U32:
7259 case AMDGPU::S_CMP_GE_U32:
7260 case AMDGPU::S_CMP_LT_U32:
7261 case AMDGPU::S_CMP_LE_U32:
7262 case AMDGPU::S_CMP_EQ_U64:
7263 case AMDGPU::S_CMP_LG_U64:
7264 case AMDGPU::S_CMP_LT_F32:
7265 case AMDGPU::S_CMP_EQ_F32:
7266 case AMDGPU::S_CMP_LE_F32:
7267 case AMDGPU::S_CMP_GT_F32:
7268 case AMDGPU::S_CMP_LG_F32:
7269 case AMDGPU::S_CMP_GE_F32:
7270 case AMDGPU::S_CMP_O_F32:
7271 case AMDGPU::S_CMP_U_F32:
7272 case AMDGPU::S_CMP_NGE_F32:
7273 case AMDGPU::S_CMP_NLG_F32:
7274 case AMDGPU::S_CMP_NGT_F32:
7275 case AMDGPU::S_CMP_NLE_F32:
7276 case AMDGPU::S_CMP_NEQ_F32:
7277 case AMDGPU::S_CMP_NLT_F32:
7278 case AMDGPU::S_CMP_LT_F16:
7279 case AMDGPU::S_CMP_EQ_F16:
7280 case AMDGPU::S_CMP_LE_F16:
7281 case AMDGPU::S_CMP_GT_F16:
7282 case AMDGPU::S_CMP_LG_F16:
7283 case AMDGPU::S_CMP_GE_F16:
7284 case AMDGPU::S_CMP_O_F16:
7285 case AMDGPU::S_CMP_U_F16:
7286 case AMDGPU::S_CMP_NGE_F16:
7287 case AMDGPU::S_CMP_NLG_F16:
7288 case AMDGPU::S_CMP_NGT_F16:
7289 case AMDGPU::S_CMP_NLE_F16:
7290 case AMDGPU::S_CMP_NEQ_F16:
7291 case AMDGPU::S_CMP_NLT_F16: {
7292 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7293 auto NewInstr =
7294 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7295 .setMIFlags(Inst.getFlags());
7296 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7297 AMDGPU::OpName::src0_modifiers) >= 0) {
7298 NewInstr
7299 .addImm(0) // src0_modifiers
7300 .add(Inst.getOperand(0)) // src0
7301 .addImm(0) // src1_modifiers
7302 .add(Inst.getOperand(1)) // src1
7303 .addImm(0); // clamp
7304 } else {
7305 NewInstr
7306 .add(Inst.getOperand(0))
7307 .add(Inst.getOperand(1));
7308 }
7309 legalizeOperands(*NewInstr, MDT);
7310 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7311 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7312 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7313 Inst.eraseFromParent();
7314 return;
7315 }
7316 case AMDGPU::S_CVT_HI_F32_F16: {
7317 const DebugLoc &DL = Inst.getDebugLoc();
7318 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7319 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7320 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7321 .addImm(16)
7322 .add(Inst.getOperand(1));
7323 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7324 .addImm(0) // src0_modifiers
7325 .addReg(TmpReg)
7326 .addImm(0) // clamp
7327 .addImm(0); // omod
7328
7329 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7330 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7331 Inst.eraseFromParent();
7332 return;
7333 }
7334 case AMDGPU::S_MINIMUM_F32:
7335 case AMDGPU::S_MAXIMUM_F32:
7336 case AMDGPU::S_MINIMUM_F16:
7337 case AMDGPU::S_MAXIMUM_F16: {
7338 const DebugLoc &DL = Inst.getDebugLoc();
7339 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7340 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7341 .addImm(0) // src0_modifiers
7342 .add(Inst.getOperand(1))
7343 .addImm(0) // src1_modifiers
7344 .add(Inst.getOperand(2))
7345 .addImm(0) // clamp
7346 .addImm(0); // omod
7347 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7348
7349 legalizeOperands(*NewInstr, MDT);
7350 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7351 Inst.eraseFromParent();
7352 return;
7353 }
7354 }
7355
7356 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7357 // We cannot move this instruction to the VALU, so we should try to
7358 // legalize its operands instead.
7359 legalizeOperands(Inst, MDT);
7360 return;
7361 }
7362 // Handle converting generic instructions like COPY-to-SGPR into
7363 // COPY-to-VGPR.
7364 if (NewOpcode == Opcode) {
7365 Register DstReg = Inst.getOperand(0).getReg();
7366 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7367
7368 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7369 // hope for the best.
7370 if (Inst.isCopy() && DstReg.isPhysical() &&
7371 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7372 // TODO: Only works for 32 bit registers.
7373 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7374 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7375 .add(Inst.getOperand(1));
7376 Inst.eraseFromParent();
7377 return;
7378 }
7379
7380 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7381 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7382 // Instead of creating a copy where src and dst are the same register
7383 // class, we just replace all uses of dst with src. These kinds of
7384 // copies interfere with the heuristics MachineSink uses to decide
7385 // whether or not to split a critical edge, since the pass assumes
7386 // that copies will end up as machine instructions and not be
7387 // eliminated.
7388 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7389 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7390 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7391 Inst.getOperand(0).setReg(DstReg);
7392 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7393 // these are deleted later, but at -O0 it would leave a suspicious
7394 // looking illegal copy of an undef register.
7395 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7396 Inst.removeOperand(I);
7397 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7398 return;
7399 }
7400 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7401 MRI.replaceRegWith(DstReg, NewDstReg);
7402 legalizeOperands(Inst, MDT);
7403 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7404 return;
7405 }
7406
7407 // Use the new VALU Opcode.
7408 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7409 .setMIFlags(Inst.getFlags());
7410 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7411 // Intersperse VOP3 modifiers among the SALU operands.
7412 NewInstr->addOperand(Inst.getOperand(0));
7413 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7414 AMDGPU::OpName::src0_modifiers) >= 0)
7415 NewInstr.addImm(0);
7416 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7417 MachineOperand Src = Inst.getOperand(1);
7418 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7419 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7420 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7421 else
7422 NewInstr->addOperand(Src);
7423 }
7424
7425 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7426 // We are converting these to a BFE, so we need to add the missing
7427 // operands for the size and offset.
7428 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7429 NewInstr.addImm(0);
7430 NewInstr.addImm(Size);
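// (In effect, S_SEXT_I32_I8 becomes a V_BFE_I32 of width 8 at offset 0, and
// S_SEXT_I32_I16 one of width 16.)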
7431 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7432 // The VALU version adds the second operand to the result, so insert an
7433 // extra 0 operand.
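// (V_BCNT_U32_B32 computes bitcount(src0) + src1, so a zero src1 reproduces
// the plain S_BCNT1_I32_B32 result.)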
7434 NewInstr.addImm(0);
7435 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7436 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7437 // If we need to move this to VGPRs, we need to unpack the second
7438 // operand back into the 2 separate ones for bit offset and width.
7439 assert(OffsetWidthOp.isImm() &&
7440 "Scalar BFE is only implemented for constant width and offset");
7441 uint32_t Imm = OffsetWidthOp.getImm();
7442
7443 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7444 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
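// Illustrative example with a hypothetical immediate: a packed operand of
// 0x100008 yields Offset = 8 and BitWidth = 16, which become the two
// separate immediates of the VALU BFE.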
7445 NewInstr.addImm(Offset);
7446 NewInstr.addImm(BitWidth);
7447 } else {
7448 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7449 AMDGPU::OpName::src1_modifiers) >= 0)
7450 NewInstr.addImm(0);
7451 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7452 NewInstr->addOperand(Inst.getOperand(2));
7453 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7454 AMDGPU::OpName::src2_modifiers) >= 0)
7455 NewInstr.addImm(0);
7456 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7457 NewInstr->addOperand(Inst.getOperand(3));
7458 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7459 NewInstr.addImm(0);
7460 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7461 NewInstr.addImm(0);
7462 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7463 NewInstr.addImm(0);
7464 }
7465 } else {
7466 // Just copy the SALU operands.
7467 for (const MachineOperand &Op : Inst.explicit_operands())
7468 NewInstr->addOperand(Op);
7469 }
7470
7471 // Remove any references to SCC. Vector instructions can't read from it, and
7472 // we're just about to add the implicit use / defs of VCC; we don't want
7473 // both.
7474 for (MachineOperand &Op : Inst.implicit_operands()) {
7475 if (Op.getReg() == AMDGPU::SCC) {
7476 // Only propagate through live-def of SCC.
7477 if (Op.isDef() && !Op.isDead())
7478 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7479 if (Op.isUse())
7480 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7481 }
7482 }
7483 Inst.eraseFromParent();
7484 Register NewDstReg;
7485 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7486 Register DstReg = NewInstr->getOperand(0).getReg();
7487 assert(DstReg.isVirtual());
7488 // Update the destination register class.
7489 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7490 assert(NewDstRC);
7491 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7492 MRI.replaceRegWith(DstReg, NewDstReg);
7493 }
7494 fixImplicitOperands(*NewInstr);
7495 // Legalize the operands
7496 legalizeOperands(*NewInstr, MDT);
7497 if (NewDstReg)
7498 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7499}
7500
7501// Add/sub require special handling to deal with carry outs.
7502std::pair<bool, MachineBasicBlock *>
7503SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7504 MachineDominatorTree *MDT) const {
7505 if (ST.hasAddNoCarry()) {
7506 // Assume there is no user of scc since we don't select this in that case.
7507 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7508 // is used.
7509
7510 MachineBasicBlock &MBB = *Inst.getParent();
7511 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7512
7513 Register OldDstReg = Inst.getOperand(0).getReg();
7514 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7515
7516 unsigned Opc = Inst.getOpcode();
7517 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7518
7519 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7520 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7521
7522 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7523 Inst.removeOperand(3);
7524
7525 Inst.setDesc(get(NewOpc));
7526 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7527 Inst.addImplicitDefUseOperands(*MBB.getParent());
7528 MRI.replaceRegWith(OldDstReg, ResultReg);
7529 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7530
7531 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7532 return std::pair(true, NewBB);
7533 }
7534
7535 return std::pair(false, nullptr);
7536}
7537
7538void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7539 MachineDominatorTree *MDT) const {
7540
7541 MachineBasicBlock &MBB = *Inst.getParent();
7542 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7543 MachineBasicBlock::iterator MII = Inst;
7544 DebugLoc DL = Inst.getDebugLoc();
7545
7546 MachineOperand &Dest = Inst.getOperand(0);
7547 MachineOperand &Src0 = Inst.getOperand(1);
7548 MachineOperand &Src1 = Inst.getOperand(2);
7549 MachineOperand &Cond = Inst.getOperand(3);
7550
7551 Register CondReg = Cond.getReg();
7552 bool IsSCC = (CondReg == AMDGPU::SCC);
7553
7554 // If this is a trivial select where the condition is effectively not SCC
7555 // (CondReg is a source of copy to SCC), then the select is semantically
7556 // equivalent to copying CondReg. Hence, there is no need to create a
7557 // V_CNDMASK; we can just use CondReg and bail out.
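// (Illustrative sketch with made-up names: "%dst = S_CSELECT_B64 -1, 0" whose
// condition comes from a lane-mask register %cond evaluates to %cond itself,
// so %dst is simply replaced by %cond.)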
7558 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7559 (Src1.getImm() == 0)) {
7560 MRI.replaceRegWith(Dest.getReg(), CondReg);
7561 return;
7562 }
7563
7564 Register NewCondReg = CondReg;
7565 if (IsSCC) {
7566 const TargetRegisterClass *TC =
7567 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7568 NewCondReg = MRI.createVirtualRegister(TC);
7569
7570 // Now look for the closest SCC def; if it is a copy, replace CondReg
7571 // with the COPY's source register.
7572 bool CopyFound = false;
7573 for (MachineInstr &CandI :
7574 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7575 Inst.getParent()->rend())) {
7576 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7577 -1) {
7578 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7579 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7580 .addReg(CandI.getOperand(1).getReg());
7581 CopyFound = true;
7582 }
7583 break;
7584 }
7585 }
7586 if (!CopyFound) {
7587 // SCC def is not a copy
7588 // Insert a trivial select instead of creating a copy, because a copy from
7589 // SCC would semantically mean just copying a single bit, but we may need
7590 // the result to be a vector condition mask that needs preserving.
7591 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
7592 : AMDGPU::S_CSELECT_B32;
7593 auto NewSelect =
7594 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7595 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7596 }
7597 }
7598
7599 Register NewDestReg = MRI.createVirtualRegister(
7600 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7601 MachineInstr *NewInst;
7602 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7603 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7604 .addImm(0)
7605 .add(Src1) // False
7606 .addImm(0)
7607 .add(Src0) // True
7608 .addReg(NewCondReg);
7609 } else {
7610 NewInst =
7611 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7612 .add(Src1) // False
7613 .add(Src0) // True
7614 .addReg(NewCondReg);
7615 }
7616 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7617 legalizeOperands(*NewInst, MDT);
7618 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7619}
7620
7621void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7622 MachineInstr &Inst) const {
7623 MachineBasicBlock &MBB = *Inst.getParent();
7624 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7625 MachineBasicBlock::iterator MII = Inst;
7626 DebugLoc DL = Inst.getDebugLoc();
7627
7628 MachineOperand &Dest = Inst.getOperand(0);
7629 MachineOperand &Src = Inst.getOperand(1);
7630 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7631 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7632
7633 unsigned SubOp = ST.hasAddNoCarry() ?
7634 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7635
7636 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7637 .addImm(0)
7638 .addReg(Src.getReg());
7639
7640 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7641 .addReg(Src.getReg())
7642 .addReg(TmpReg);
7643
7644 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7645 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7646}
7647
7648void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7649 MachineInstr &Inst) const {
7650 MachineBasicBlock &MBB = *Inst.getParent();
7651 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7652 MachineBasicBlock::iterator MII = Inst;
7653 const DebugLoc &DL = Inst.getDebugLoc();
7654
7655 MachineOperand &Dest = Inst.getOperand(0);
7656 MachineOperand &Src0 = Inst.getOperand(1);
7657 MachineOperand &Src1 = Inst.getOperand(2);
7658
7659 if (ST.hasDLInsts()) {
7660 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7661 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7662 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7663
7664 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7665 .add(Src0)
7666 .add(Src1);
7667
7668 MRI.replaceRegWith(Dest.getReg(), NewDest);
7669 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7670 } else {
7671 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7672 // invert either source and then perform the XOR. If either source is a
7673 // scalar register, then we can leave the inversion on the scalar unit to
7674 // achieve a better distribution of scalar and vector instructions.
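// (One-bit sanity check of the identity: x = 1, y = 0 gives !(1 ^ 0) = 0,
// (!1) ^ 0 = 0 and 1 ^ (!0) = 0.)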
7675 bool Src0IsSGPR = Src0.isReg() &&
7676 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7677 bool Src1IsSGPR = Src1.isReg() &&
7678 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7679 MachineInstr *Xor;
7680 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7681 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7682
7683 // Build a pair of scalar instructions and add them to the work list.
7684 // The next iteration over the work list will lower these to the vector
7685 // unit as necessary.
7686 if (Src0IsSGPR) {
7687 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7688 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7689 .addReg(Temp)
7690 .add(Src1);
7691 } else if (Src1IsSGPR) {
7692 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7693 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7694 .add(Src0)
7695 .addReg(Temp);
7696 } else {
7697 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7698 .add(Src0)
7699 .add(Src1);
7700 MachineInstr *Not =
7701 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7702 Worklist.insert(Not);
7703 }
7704
7705 MRI.replaceRegWith(Dest.getReg(), NewDest);
7706
7707 Worklist.insert(Xor);
7708
7709 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7710 }
7711}
7712
7713void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7714 MachineInstr &Inst,
7715 unsigned Opcode) const {
7716 MachineBasicBlock &MBB = *Inst.getParent();
7717 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7718 MachineBasicBlock::iterator MII = Inst;
7719 const DebugLoc &DL = Inst.getDebugLoc();
7720
7721 MachineOperand &Dest = Inst.getOperand(0);
7722 MachineOperand &Src0 = Inst.getOperand(1);
7723 MachineOperand &Src1 = Inst.getOperand(2);
7724
7725 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7726 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7727
7728 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7729 .add(Src0)
7730 .add(Src1);
7731
7732 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7733 .addReg(Interm);
7734
7735 Worklist.insert(&Op);
7736 Worklist.insert(&Not);
7737
7738 MRI.replaceRegWith(Dest.getReg(), NewDest);
7739 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7740}
7741
7742void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7743 MachineInstr &Inst,
7744 unsigned Opcode) const {
7745 MachineBasicBlock &MBB = *Inst.getParent();
7746 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7747 MachineBasicBlock::iterator MII = Inst;
7748 const DebugLoc &DL = Inst.getDebugLoc();
7749
7750 MachineOperand &Dest = Inst.getOperand(0);
7751 MachineOperand &Src0 = Inst.getOperand(1);
7752 MachineOperand &Src1 = Inst.getOperand(2);
7753
7754 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7755 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7756
7757 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7758 .add(Src1);
7759
7760 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7761 .add(Src0)
7762 .addReg(Interm);
7763
7764 Worklist.insert(&Not);
7765 Worklist.insert(&Op);
7766
7767 MRI.replaceRegWith(Dest.getReg(), NewDest);
7768 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7769}
7770
7771void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7772 MachineInstr &Inst, unsigned Opcode,
7773 bool Swap) const {
7774 MachineBasicBlock &MBB = *Inst.getParent();
7775 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7776
7777 MachineOperand &Dest = Inst.getOperand(0);
7778 MachineOperand &Src0 = Inst.getOperand(1);
7779 DebugLoc DL = Inst.getDebugLoc();
7780
7781 MachineBasicBlock::iterator MII = Inst;
7782
7783 const MCInstrDesc &InstDesc = get(Opcode);
7784 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7785 MRI.getRegClass(Src0.getReg()) :
7786 &AMDGPU::SGPR_32RegClass;
7787
7788 const TargetRegisterClass *Src0SubRC =
7789 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7790
7791 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7792 AMDGPU::sub0, Src0SubRC);
7793
7794 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7795 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7796 const TargetRegisterClass *NewDestSubRC =
7797 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7798
7799 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7800 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7801
7802 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7803 AMDGPU::sub1, Src0SubRC);
7804
7805 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7806 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7807
7808 if (Swap)
7809 std::swap(DestSub0, DestSub1);
7810
7811 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7812 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7813 .addReg(DestSub0)
7814 .addImm(AMDGPU::sub0)
7815 .addReg(DestSub1)
7816 .addImm(AMDGPU::sub1);
7817
7818 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7819
7820 Worklist.insert(&LoHalf);
7821 Worklist.insert(&HiHalf);
7822
7823 // We don't need to legalizeOperands here because for a single operand, src0
7824 // will support any kind of input.
7825
7826 // Move all users of this moved value.
7827 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7828}
7829
7830 // There is no vector equivalent of s_mul_u64. For this reason, we need to
7831 // split the s_mul_u64 into 32-bit vector multiplications.
7832void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7833 MachineInstr &Inst,
7834 MachineDominatorTree *MDT) const {
7835 MachineBasicBlock &MBB = *Inst.getParent();
7836 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7837
7838 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7839 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7840 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7841
7842 MachineOperand &Dest = Inst.getOperand(0);
7843 MachineOperand &Src0 = Inst.getOperand(1);
7844 MachineOperand &Src1 = Inst.getOperand(2);
7845 const DebugLoc &DL = Inst.getDebugLoc();
7846 MachineBasicBlock::iterator MII = Inst;
7847
7848 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7849 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7850 const TargetRegisterClass *Src0SubRC =
7851 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7852 if (RI.isSGPRClass(Src0SubRC))
7853 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7854 const TargetRegisterClass *Src1SubRC =
7855 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7856 if (RI.isSGPRClass(Src1SubRC))
7857 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7858
7859 // First, we extract the low 32-bit and high 32-bit values from each of the
7860 // operands.
7861 MachineOperand Op0L =
7862 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7863 MachineOperand Op1L =
7864 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7865 MachineOperand Op0H =
7866 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7867 MachineOperand Op1H =
7868 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7869
7870 // The multiplication is done as follows:
7871 //
7872 // Op1H Op1L
7873 // * Op0H Op0L
7874 // --------------------
7875 // Op1H*Op0L Op1L*Op0L
7876 // + Op1H*Op0H Op1L*Op0H
7877 // -----------------------------------------
7878 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7879 //
7880 // We drop Op1H*Op0H because it only contributes to bits 64 and above,
7881 // which do not exist in the 64-bit result.
7882 // The low 32-bit value is Op1L*Op0L.
7883 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
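// As an illustrative sanity check with made-up values: for
// Src0 = 0x00000002'00000003 and Src1 = 0x00000004'00000005 we get
// Op1L*Op0L = 15 for the low half and
// Op1H*Op0L + Op1L*Op0H + mulhi(Op1L, Op0L) = 12 + 10 + 0 = 22 for the high
// half, i.e. the low 64 bits of the full product 0x8'00000016'0000000F.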
7884
7885 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7886 MachineInstr *Op1L_Op0H =
7887 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
7888 .add(Op1L)
7889 .add(Op0H);
7890
7891 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7892 MachineInstr *Op1H_Op0L =
7893 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
7894 .add(Op1H)
7895 .add(Op0L);
7896
7897 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7898 MachineInstr *Carry =
7899 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
7900 .add(Op1L)
7901 .add(Op0L);
7902
7903 MachineInstr *LoHalf =
7904 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7905 .add(Op1L)
7906 .add(Op0L);
7907
7908 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7909 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
7910 .addReg(Op1L_Op0H_Reg)
7911 .addReg(Op1H_Op0L_Reg);
7912
7913 MachineInstr *HiHalf =
7914 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
7915 .addReg(AddReg)
7916 .addReg(CarryReg);
7917
7918 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7919 .addReg(DestSub0)
7920 .addImm(AMDGPU::sub0)
7921 .addReg(DestSub1)
7922 .addImm(AMDGPU::sub1);
7923
7924 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7925
7926 // Try to legalize the operands in case we need to swap the order to keep it
7927 // valid.
7928 legalizeOperands(*Op1L_Op0H, MDT);
7929 legalizeOperands(*Op1H_Op0L, MDT);
7930 legalizeOperands(*Carry, MDT);
7931 legalizeOperands(*LoHalf, MDT);
7932 legalizeOperands(*Add, MDT);
7933 legalizeOperands(*HiHalf, MDT);
7934
7935 // Move all users of this moved value.
7936 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7937}
7938
7939 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
7940 // multiplications.
7941void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
7942 MachineInstr &Inst,
7943 MachineDominatorTree *MDT) const {
7944 MachineBasicBlock &MBB = *Inst.getParent();
7945 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7946
7947 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7948 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7949 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7950
7951 MachineOperand &Dest = Inst.getOperand(0);
7952 MachineOperand &Src0 = Inst.getOperand(1);
7953 MachineOperand &Src1 = Inst.getOperand(2);
7954 const DebugLoc &DL = Inst.getDebugLoc();
7955 MachineBasicBlock::iterator MII = Inst;
7956
7957 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7958 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7959 const TargetRegisterClass *Src0SubRC =
7960 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7961 if (RI.isSGPRClass(Src0SubRC))
7962 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7963 const TargetRegisterClass *Src1SubRC =
7964 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7965 if (RI.isSGPRClass(Src1SubRC))
7966 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7967
7968 // First, we extract the low 32-bit and high 32-bit values from each of the
7969 // operands.
7970 MachineOperand Op0L =
7971 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7972 MachineOperand Op1L =
7973 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7974
7975 unsigned Opc = Inst.getOpcode();
7976 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
7977 ? AMDGPU::V_MUL_HI_U32_e64
7978 : AMDGPU::V_MUL_HI_I32_e64;
7979 MachineInstr *HiHalf =
7980 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
7981
7982 MachineInstr *LoHalf =
7983 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7984 .add(Op1L)
7985 .add(Op0L);
7986
7987 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7988 .addReg(DestSub0)
7989 .addImm(AMDGPU::sub0)
7990 .addReg(DestSub1)
7991 .addImm(AMDGPU::sub1);
7992
7993 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7994
7995 // Try to legalize the operands in case we need to swap the order to keep it
7996 // valid.
7997 legalizeOperands(*HiHalf, MDT);
7998 legalizeOperands(*LoHalf, MDT);
7999
8000 // Move all users of this moved value.
8001 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8002}
8003
8004void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8005 MachineInstr &Inst, unsigned Opcode,
8006 MachineDominatorTree *MDT) const {
8007 MachineBasicBlock &MBB = *Inst.getParent();
8008 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8009
8010 MachineOperand &Dest = Inst.getOperand(0);
8011 MachineOperand &Src0 = Inst.getOperand(1);
8012 MachineOperand &Src1 = Inst.getOperand(2);
8013 DebugLoc DL = Inst.getDebugLoc();
8014
8015 MachineBasicBlock::iterator MII = Inst;
8016
8017 const MCInstrDesc &InstDesc = get(Opcode);
8018 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8019 MRI.getRegClass(Src0.getReg()) :
8020 &AMDGPU::SGPR_32RegClass;
8021
8022 const TargetRegisterClass *Src0SubRC =
8023 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8024 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8025 MRI.getRegClass(Src1.getReg()) :
8026 &AMDGPU::SGPR_32RegClass;
8027
8028 const TargetRegisterClass *Src1SubRC =
8029 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8030
8031 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8032 AMDGPU::sub0, Src0SubRC);
8033 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8034 AMDGPU::sub0, Src1SubRC);
8035 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8036 AMDGPU::sub1, Src0SubRC);
8037 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8038 AMDGPU::sub1, Src1SubRC);
8039
8040 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8041 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8042 const TargetRegisterClass *NewDestSubRC =
8043 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8044
8045 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8046 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8047 .add(SrcReg0Sub0)
8048 .add(SrcReg1Sub0);
8049
8050 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8051 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8052 .add(SrcReg0Sub1)
8053 .add(SrcReg1Sub1);
8054
8055 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8056 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8057 .addReg(DestSub0)
8058 .addImm(AMDGPU::sub0)
8059 .addReg(DestSub1)
8060 .addImm(AMDGPU::sub1);
8061
8062 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8063
8064 Worklist.insert(&LoHalf);
8065 Worklist.insert(&HiHalf);
8066
8067 // Move all users of this moved value.
8068 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8069}
8070
8071void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8072 MachineInstr &Inst,
8073 MachineDominatorTree *MDT) const {
8074 MachineBasicBlock &MBB = *Inst.getParent();
8075 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8076
8077 MachineOperand &Dest = Inst.getOperand(0);
8078 MachineOperand &Src0 = Inst.getOperand(1);
8079 MachineOperand &Src1 = Inst.getOperand(2);
8080 const DebugLoc &DL = Inst.getDebugLoc();
8081
8082 MachineBasicBlock::iterator MII = Inst;
8083
8084 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8085
8086 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8087
8088 MachineOperand* Op0;
8089 MachineOperand* Op1;
8090
8091 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8092 Op0 = &Src0;
8093 Op1 = &Src1;
8094 } else {
8095 Op0 = &Src1;
8096 Op1 = &Src0;
8097 }
8098
8099 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8100 .add(*Op0);
8101
8102 Register NewDest = MRI.createVirtualRegister(DestRC);
8103
8104 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8105 .addReg(Interm)
8106 .add(*Op1);
8107
8108 MRI.replaceRegWith(Dest.getReg(), NewDest);
8109
8110 Worklist.insert(&Xor);
8111}
8112
8113void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8114 MachineInstr &Inst) const {
8115 MachineBasicBlock &MBB = *Inst.getParent();
8116 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8117
8118 MachineBasicBlock::iterator MII = Inst;
8119 const DebugLoc &DL = Inst.getDebugLoc();
8120
8121 MachineOperand &Dest = Inst.getOperand(0);
8122 MachineOperand &Src = Inst.getOperand(1);
8123
8124 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8125 const TargetRegisterClass *SrcRC = Src.isReg() ?
8126 MRI.getRegClass(Src.getReg()) :
8127 &AMDGPU::SGPR_32RegClass;
8128
8129 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8130 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8131
8132 const TargetRegisterClass *SrcSubRC =
8133 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8134
8135 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8136 AMDGPU::sub0, SrcSubRC);
8137 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8138 AMDGPU::sub1, SrcSubRC);
8139
8140 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8141
8142 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8143
8144 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8145
8146 // We don't need to legalize operands here. src0 for either instruction can be
8147 // an SGPR, and the second input is unused or determined here.
8148 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8149}
8150
8151void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8152 MachineInstr &Inst) const {
8153 MachineBasicBlock &MBB = *Inst.getParent();
8154 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8155 MachineBasicBlock::iterator MII = Inst;
8156 const DebugLoc &DL = Inst.getDebugLoc();
8157
8158 MachineOperand &Dest = Inst.getOperand(0);
8159 uint32_t Imm = Inst.getOperand(2).getImm();
8160 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8161 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8162
8163 (void) Offset;
8164
8165 // Only sext_inreg cases handled.
8166 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8167 Offset == 0 && "Not implemented");
8168
8169 if (BitWidth < 32) {
8170 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8171 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8172 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8173
8174 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8175 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8176 .addImm(0)
8177 .addImm(BitWidth);
8178
8179 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8180 .addImm(31)
8181 .addReg(MidRegLo);
8182
8183 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8184 .addReg(MidRegLo)
8185 .addImm(AMDGPU::sub0)
8186 .addReg(MidRegHi)
8187 .addImm(AMDGPU::sub1);
8188
8189 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8190 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8191 return;
8192 }
8193
8194 MachineOperand &Src = Inst.getOperand(1);
8195 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8196 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8197
8198 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8199 .addImm(31)
8200 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8201
8202 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8203 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8204 .addImm(AMDGPU::sub0)
8205 .addReg(TmpReg)
8206 .addImm(AMDGPU::sub1);
8207
8208 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8209 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8210}
8211
8212void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8213 MachineInstr &Inst, unsigned Opcode,
8214 MachineDominatorTree *MDT) const {
8215 // (S_FLBIT_I32_B64 hi:lo) ->
8216 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8217 // (S_FF1_I32_B64 hi:lo) ->
8218 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
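// Note: this decomposition relies on V_FFBH_U32/V_FFBL_B32 returning
// 0xffffffff for a zero input. The half with no set bit then contributes
// 0xffffffff (directly, or via the clamped add saturating), so the V_MIN_U32
// below selects the count from the other half, offset by 32 where needed;
// an all-zero input still yields 0xffffffff, matching the scalar ops.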
8219
8220 MachineBasicBlock &MBB = *Inst.getParent();
8221 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8222 MachineBasicBlock::iterator MII = Inst;
8223 const DebugLoc &DL = Inst.getDebugLoc();
8224
8225 MachineOperand &Dest = Inst.getOperand(0);
8226 MachineOperand &Src = Inst.getOperand(1);
8227
8228 const MCInstrDesc &InstDesc = get(Opcode);
8229
8230 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8231 unsigned OpcodeAdd =
8232 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8233
8234 const TargetRegisterClass *SrcRC =
8235 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8236 const TargetRegisterClass *SrcSubRC =
8237 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8238
8239 MachineOperand SrcRegSub0 =
8240 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8241 MachineOperand SrcRegSub1 =
8242 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8243
8244 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8245 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8246 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8247 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8248
8249 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8250
8251 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8252
8253 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8254 .addReg(IsCtlz ? MidReg1 : MidReg2)
8255 .addImm(32)
8256 .addImm(1); // enable clamp
8257
8258 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8259 .addReg(MidReg3)
8260 .addReg(IsCtlz ? MidReg2 : MidReg1);
8261
8262 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8263
8264 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8265}
8266
8267void SIInstrInfo::addUsersToMoveToVALUWorklist(
8268 Register DstReg, MachineRegisterInfo &MRI,
8269 SIInstrWorklist &Worklist) const {
8270 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8271 E = MRI.use_end(); I != E;) {
8272 MachineInstr &UseMI = *I->getParent();
8273
8274 unsigned OpNo = 0;
8275
8276 switch (UseMI.getOpcode()) {
8277 case AMDGPU::COPY:
8278 case AMDGPU::WQM:
8279 case AMDGPU::SOFT_WQM:
8280 case AMDGPU::STRICT_WWM:
8281 case AMDGPU::STRICT_WQM:
8282 case AMDGPU::REG_SEQUENCE:
8283 case AMDGPU::PHI:
8284 case AMDGPU::INSERT_SUBREG:
8285 break;
8286 default:
8287 OpNo = I.getOperandNo();
8288 break;
8289 }
8290
8291 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8292 Worklist.insert(&UseMI);
8293
8294 do {
8295 ++I;
8296 } while (I != E && I->getParent() == &UseMI);
8297 } else {
8298 ++I;
8299 }
8300 }
8301}
8302
8303void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8304 MachineRegisterInfo &MRI,
8305 MachineInstr &Inst) const {
8306 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8307 MachineBasicBlock *MBB = Inst.getParent();
8308 MachineOperand &Src0 = Inst.getOperand(1);
8309 MachineOperand &Src1 = Inst.getOperand(2);
8310 const DebugLoc &DL = Inst.getDebugLoc();
8311
8312 switch (Inst.getOpcode()) {
8313 case AMDGPU::S_PACK_LL_B32_B16: {
8314 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8315 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8316
8317 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8318 // 0.
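// The expansion below computes (Src1 << 16) | (Src0 & 0xffff): the low 16
// bits of Src0 land in the low half and the low 16 bits of Src1 in the high
// half. Purely illustrative example: Src0 = 0x0000AAAA, Src1 = 0x0000BBBB
// gives 0xBBBBAAAA.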
8319 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8320 .addImm(0xffff);
8321
8322 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8323 .addReg(ImmReg, RegState::Kill)
8324 .add(Src0);
8325
8326 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8327 .add(Src1)
8328 .addImm(16)
8329 .addReg(TmpReg, RegState::Kill);
8330 break;
8331 }
8332 case AMDGPU::S_PACK_LH_B32_B16: {
8333 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8334 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8335 .addImm(0xffff);
8336 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8337 .addReg(ImmReg, RegState::Kill)
8338 .add(Src0)
8339 .add(Src1);
8340 break;
8341 }
8342 case AMDGPU::S_PACK_HL_B32_B16: {
8343 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8344 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8345 .addImm(16)
8346 .add(Src0);
8347 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8348 .add(Src1)
8349 .addImm(16)
8350 .addReg(TmpReg, RegState::Kill);
8351 break;
8352 }
8353 case AMDGPU::S_PACK_HH_B32_B16: {
8354 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8355 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8356 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8357 .addImm(16)
8358 .add(Src0);
8359 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8360 .addImm(0xffff0000);
8361 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8362 .add(Src1)
8363 .addReg(ImmReg, RegState::Kill)
8364 .addReg(TmpReg, RegState::Kill);
8365 break;
8366 }
8367 default:
8368 llvm_unreachable("unhandled s_pack_* instruction");
8369 }
8370
8371 MachineOperand &Dest = Inst.getOperand(0);
8372 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8373 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8374}
8375
8376void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8377 MachineInstr &SCCDefInst,
8378 SIInstrWorklist &Worklist,
8379 Register NewCond) const {
8380
8381 // Ensure that def inst defines SCC, which is still live.
8382 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8383 !Op.isDead() && Op.getParent() == &SCCDefInst);
8384 SmallVector<MachineInstr *, 4> CopyToDelete;
8385 // This assumes that all the users of SCC are in the same block
8386 // as the SCC def.
8387 for (MachineInstr &MI : // Skip the def inst itself.
8388 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8389 SCCDefInst.getParent()->end())) {
8390 // Check if SCC is used first.
8391 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8392 if (SCCIdx != -1) {
8393 if (MI.isCopy()) {
8394 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8395 Register DestReg = MI.getOperand(0).getReg();
8396
8397 MRI.replaceRegWith(DestReg, NewCond);
8398 CopyToDelete.push_back(&MI);
8399 } else {
8400
8401 if (NewCond.isValid())
8402 MI.getOperand(SCCIdx).setReg(NewCond);
8403
8404 Worklist.insert(&MI);
8405 }
8406 }
8407 // Exit if we find another SCC def.
8408 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8409 break;
8410 }
8411 for (auto &Copy : CopyToDelete)
8412 Copy->eraseFromParent();
8413}
8414
8415// Instructions that use SCC may be converted to VALU instructions. When that
8416// happens, the SCC register is changed to VCC_LO. The instruction that defines
8417// SCC must be changed to an instruction that defines VCC. This function makes
8418// sure that the instruction that defines SCC is added to the moveToVALU
8419// worklist.
8420void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8421 SIInstrWorklist &Worklist) const {
8422 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8423 // then there is nothing to do because the defining instruction has been
8424 // converted to a VALU already. If SCC then that instruction needs to be
8425 // converted to a VALU.
8426 for (MachineInstr &MI :
8427 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8428 SCCUseInst->getParent()->rend())) {
8429 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8430 break;
8431 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8432 Worklist.insert(&MI);
8433 break;
8434 }
8435 }
8436}
8437
8438const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8439 const MachineInstr &Inst) const {
8440 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8441
8442 switch (Inst.getOpcode()) {
8443 // For target instructions, getOpRegClass just returns the virtual register
8444 // class associated with the operand, so we need to find an equivalent VGPR
8445 // register class in order to move the instruction to the VALU.
8446 case AMDGPU::COPY:
8447 case AMDGPU::PHI:
8448 case AMDGPU::REG_SEQUENCE:
8449 case AMDGPU::INSERT_SUBREG:
8450 case AMDGPU::WQM:
8451 case AMDGPU::SOFT_WQM:
8452 case AMDGPU::STRICT_WWM:
8453 case AMDGPU::STRICT_WQM: {
8454 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8455 if (RI.isAGPRClass(SrcRC)) {
8456 if (RI.isAGPRClass(NewDstRC))
8457 return nullptr;
8458
8459 switch (Inst.getOpcode()) {
8460 case AMDGPU::PHI:
8461 case AMDGPU::REG_SEQUENCE:
8462 case AMDGPU::INSERT_SUBREG:
8463 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8464 break;
8465 default:
8466 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8467 }
8468
8469 if (!NewDstRC)
8470 return nullptr;
8471 } else {
8472 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8473 return nullptr;
8474
8475 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8476 if (!NewDstRC)
8477 return nullptr;
8478 }
8479
8480 return NewDstRC;
8481 }
8482 default:
8483 return NewDstRC;
8484 }
8485}
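// Illustrative example (not part of the original file): a PHI or REG_SEQUENCE
// whose destination class is currently SReg_64 is retargeted to VReg_64 by the
// switch above, or to AReg_64 when its source operand already lives in an AGPR
// class; opcodes outside the listed set simply keep the class reported by
// getOpRegClass.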
8486
8487// Find the one SGPR operand we are allowed to use.
8488Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8489 int OpIndices[3]) const {
8490 const MCInstrDesc &Desc = MI.getDesc();
8491
8492 // Find the one SGPR operand we are allowed to use.
8493 //
8494 // First we need to consider the instruction's operand requirements before
8495 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8496 // of VCC, but we are still bound by the constant bus requirement to only use
8497 // one.
8498 //
8499 // If the operand's class is an SGPR, we can never move it.
8500
8501 Register SGPRReg = findImplicitSGPRRead(MI);
8502 if (SGPRReg)
8503 return SGPRReg;
8504
8505 Register UsedSGPRs[3] = {Register()};
8506 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8507
8508 for (unsigned i = 0; i < 3; ++i) {
8509 int Idx = OpIndices[i];
8510 if (Idx == -1)
8511 break;
8512
8513 const MachineOperand &MO = MI.getOperand(Idx);
8514 if (!MO.isReg())
8515 continue;
8516
8517 // Is this operand statically required to be an SGPR based on the operand
8518 // constraints?
8519 const TargetRegisterClass *OpRC =
8520 RI.getRegClass(Desc.operands()[Idx].RegClass);
8521 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8522 if (IsRequiredSGPR)
8523 return MO.getReg();
8524
8525 // If this could be a VGPR or an SGPR, check the dynamic register class.
8526 Register Reg = MO.getReg();
8527 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8528 if (RI.isSGPRClass(RegRC))
8529 UsedSGPRs[i] = Reg;
8530 }
8531
8532 // We don't have a required SGPR operand, so we have a bit more freedom in
8533 // selecting operands to move.
8534
8535 // Try to select the most used SGPR. If an SGPR is equal to one of the
8536 // others, we choose that.
8537 //
8538 // e.g.
8539 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8540 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8541
8542 // TODO: If some of the operands are 64-bit SGPRs and some are 32-bit, we
8543 // should prefer those.
8544
8545 if (UsedSGPRs[0]) {
8546 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8547 SGPRReg = UsedSGPRs[0];
8548 }
8549
8550 if (!SGPRReg && UsedSGPRs[1]) {
8551 if (UsedSGPRs[1] == UsedSGPRs[2])
8552 SGPRReg = UsedSGPRs[1];
8553 }
8554
8555 return SGPRReg;
8556}
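// Minimal standalone sketch (illustrative, not LLVM API) of the reuse rule
// implemented above: among up to three candidate SGPRs, one may be kept only
// if it occurs at least twice, matching the V_FMA_F32 examples in the comment.
//
//   static unsigned pickReusedSGPR(const unsigned Used[3]) {
//     if (Used[0] && (Used[0] == Used[1] || Used[0] == Used[2]))
//       return Used[0]; // e.g. V_FMA_F32 v0, s0, s1, s0 -> keep s0, move s1
//     if (Used[1] && Used[1] == Used[2])
//       return Used[1];
//     return 0; // no repeated candidate; nothing is forced to stay
//   }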
8557
8559 unsigned OperandName) const {
8560 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8561 if (Idx == -1)
8562 return nullptr;
8563
8564 return &MI.getOperand(Idx);
8565}
8566
8572 return (Format << 44) |
8573 (1ULL << 56) | // RESOURCE_LEVEL = 1
8574 (3ULL << 60); // OOB_SELECT = 3
8575 }
8576
8577 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8578 if (ST.isAmdHsaOS()) {
8579 // Set ATC = 1. GFX9 doesn't have this bit.
8581 RsrcDataFormat |= (1ULL << 56);
8582
8583 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8584 // BTW, it disables TC L2 and therefore decreases performance.
8586 RsrcDataFormat |= (2ULL << 59);
8587 }
8588
8589 return RsrcDataFormat;
8590}
8591
8595 0xffffffff; // Size;
8596
8597 // GFX9 doesn't have ELEMENT_SIZE.
8599 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8600 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8601 }
8602
8603 // IndexStride = 64 / 32.
8604 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
8605 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8606
8607 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8608 // Clear them unless we want a huge stride.
8611 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8612
8613 return Rsrc23;
8614}
8615
8617 unsigned Opc = MI.getOpcode();
8618
8619 return isSMRD(Opc);
8620}
8621
8623 return get(Opc).mayLoad() &&
8624 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8625}
8626
8628 int &FrameIndex) const {
8629 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8630 if (!Addr || !Addr->isFI())
8631 return Register();
8632
8633 assert(!MI.memoperands_empty() &&
8634 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8635
8636 FrameIndex = Addr->getIndex();
8637 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8638}
8639
8641 int &FrameIndex) const {
8642 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8643 assert(Addr && Addr->isFI());
8644 FrameIndex = Addr->getIndex();
8645 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8646}
8647
8649 int &FrameIndex) const {
8650 if (!MI.mayLoad())
8651 return Register();
8652
8653 if (isMUBUF(MI) || isVGPRSpill(MI))
8654 return isStackAccess(MI, FrameIndex);
8655
8656 if (isSGPRSpill(MI))
8657 return isSGPRStackAccess(MI, FrameIndex);
8658
8659 return Register();
8660}
8661
8663 int &FrameIndex) const {
8664 if (!MI.mayStore())
8665 return Register();
8666
8667 if (isMUBUF(MI) || isVGPRSpill(MI))
8668 return isStackAccess(MI, FrameIndex);
8669
8670 if (isSGPRSpill(MI))
8671 return isSGPRStackAccess(MI, FrameIndex);
8672
8673 return Register();
8674}
8675
8677 unsigned Size = 0;
8679 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8680 while (++I != E && I->isInsideBundle()) {
8681 assert(!I->isBundle() && "No nested bundle!");
8683 }
8684
8685 return Size;
8686}
8687
8689 unsigned Opc = MI.getOpcode();
8691 unsigned DescSize = Desc.getSize();
8692
8693 // If we have a definitive size, we can use it. Otherwise we need to inspect
8694 // the operands to know the size.
8695 if (isFixedSize(MI)) {
8696 unsigned Size = DescSize;
8697
8698 // If we hit the buggy offset, an extra nop will be inserted in MC so
8699 // estimate the worst case.
8700 if (MI.isBranch() && ST.hasOffset3fBug())
8701 Size += 4;
8702
8703 return Size;
8704 }
8705
8706 // Instructions may have a 32-bit literal encoded after them. Check
8707 // operands that could ever be literals.
8708 if (isVALU(MI) || isSALU(MI)) {
8709 if (isDPP(MI))
8710 return DescSize;
8711 bool HasLiteral = false;
8712 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8713 const MachineOperand &Op = MI.getOperand(I);
8714 const MCOperandInfo &OpInfo = Desc.operands()[I];
8715 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8716 HasLiteral = true;
8717 break;
8718 }
8719 }
8720 return HasLiteral ? DescSize + 4 : DescSize;
8721 }
8722
8723 // Check whether we have extra NSA words.
8724 if (isMIMG(MI)) {
8725 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8726 if (VAddr0Idx < 0)
8727 return 8;
8728
8729 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8730 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8731 }
8732
8733 switch (Opc) {
8734 case TargetOpcode::BUNDLE:
8735 return getInstBundleSize(MI);
8736 case TargetOpcode::INLINEASM:
8737 case TargetOpcode::INLINEASM_BR: {
8738 const MachineFunction *MF = MI.getParent()->getParent();
8739 const char *AsmStr = MI.getOperand(0).getSymbolName();
8740 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8741 }
8742 default:
8743 if (MI.isMetaInstruction())
8744 return 0;
8745 return DescSize;
8746 }
8747}
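// The two size adjustments above as plain arithmetic (illustrative sketch, not
// LLVM API): a trailing 32-bit literal adds 4 bytes, and each group of up to
// four NSA addresses beyond the first adds one extra dword to the 8-byte MIMG
// encoding.
//
//   static unsigned sizeWithLiteral(unsigned DescSize, bool HasLiteral) {
//     return HasLiteral ? DescSize + 4 : DescSize;
//   }
//   static unsigned mimgNSASize(int RSrcIdx, int VAddr0Idx) {
//     return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
//   }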
8748
8750 if (!isFLAT(MI))
8751 return false;
8752
8753 if (MI.memoperands_empty())
8754 return true;
8755
8756 for (const MachineMemOperand *MMO : MI.memoperands()) {
8757 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8758 return true;
8759 }
8760 return false;
8761}
8762
8764 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
8765}
8766
8768 MachineBasicBlock *IfEnd) const {
8770 assert(TI != IfEntry->end());
8771
8772 MachineInstr *Branch = &(*TI);
8773 MachineFunction *MF = IfEntry->getParent();
8775
8776 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8777 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8778 MachineInstr *SIIF =
8779 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8780 .add(Branch->getOperand(0))
8781 .add(Branch->getOperand(1));
8782 MachineInstr *SIEND =
8783 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8784 .addReg(DstReg);
8785
8786 IfEntry->erase(TI);
8787 IfEntry->insert(IfEntry->end(), SIIF);
8788 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8789 }
8790}
8791
8793 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
8795 // We expect 2 terminators, one conditional and one unconditional.
8796 assert(TI != LoopEnd->end());
8797
8798 MachineInstr *Branch = &(*TI);
8799 MachineFunction *MF = LoopEnd->getParent();
8801
8802 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8803
8804 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8805 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
8806 MachineInstrBuilder HeaderPHIBuilder =
8807 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8808 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8809 if (PMBB == LoopEnd) {
8810 HeaderPHIBuilder.addReg(BackEdgeReg);
8811 } else {
8812 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
8813 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8814 ZeroReg, 0);
8815 HeaderPHIBuilder.addReg(ZeroReg);
8816 }
8817 HeaderPHIBuilder.addMBB(PMBB);
8818 }
8819 MachineInstr *HeaderPhi = HeaderPHIBuilder;
8820 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8821 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
8822 .addReg(DstReg)
8823 .add(Branch->getOperand(0));
8824 MachineInstr *SILOOP =
8825 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8826 .addReg(BackEdgeReg)
8827 .addMBB(LoopEntry);
8828
8829 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8830 LoopEnd->erase(TI);
8831 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8832 LoopEnd->insert(LoopEnd->end(), SILOOP);
8833 }
8834}
8835
8838 static const std::pair<int, const char *> TargetIndices[] = {
8839 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8840 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8841 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8842 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8843 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8844 return ArrayRef(TargetIndices);
8845}
8846
8847/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
8848/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8851 const ScheduleDAG *DAG) const {
8852 return new GCNHazardRecognizer(DAG->MF);
8853}
8854
8855/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8856/// pass.
8859 return new GCNHazardRecognizer(MF);
8860}
8861
8862// Called during:
8863// - pre-RA scheduling and post-RA scheduling
8866 const ScheduleDAGMI *DAG) const {
8867 // Borrowed from Arm Target
8868 // We would like to restrict this hazard recognizer to only
8869 // post-RA scheduling; we can tell that we're post-RA because we don't
8870 // track VRegLiveness.
8871 if (!DAG->hasVRegLiveness())
8872 return new GCNHazardRecognizer(DAG->MF);
8874}
8875
8876std::pair<unsigned, unsigned>
8878 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8879}
8880
8883 static const std::pair<unsigned, const char *> TargetFlags[] = {
8884 { MO_GOTPCREL, "amdgpu-gotprel" },
8885 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8886 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8887 { MO_REL32_LO, "amdgpu-rel32-lo" },
8888 { MO_REL32_HI, "amdgpu-rel32-hi" },
8889 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8890 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8891 };
8892
8893 return ArrayRef(TargetFlags);
8894}
8895
8898 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8899 {
8900 {MONoClobber, "amdgpu-noclobber"},
8901 {MOLastUse, "amdgpu-last-use"},
8902 };
8903
8904 return ArrayRef(TargetFlags);
8905}
8906
8908 const MachineFunction &MF) const {
8910 assert(SrcReg.isVirtual());
8911 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8912 return AMDGPU::WWM_COPY;
8913
8914 return AMDGPU::COPY;
8915}
8916
8918 Register Reg) const {
8919 // We need to handle instructions which may be inserted during register
8920 // allocation to handle the prolog. The initial prolog instruction may have
8921 // been separated from the start of the block by spills and copies that were
8922 // inserted for the prolog. However, the insertions for scalar registers can
8923 // always be placed at the BB top as they are independent of the exec mask
8924 // value.
8925 bool IsNullOrVectorRegister = true;
8926 if (Reg) {
8927 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8928 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8929 }
8930
8931 uint16_t Opcode = MI.getOpcode();
8932 // FIXME: Copies inserted in the block prolog for live-range split should also
8933 // be included.
8934 return IsNullOrVectorRegister &&
8935 (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8936 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8937}
8938
8942 const DebugLoc &DL,
8943 Register DestReg) const {
8944 if (ST.hasAddNoCarry())
8945 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8946
8948 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8949 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8950
8951 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8952 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8953}
8954
8957 const DebugLoc &DL,
8958 Register DestReg,
8959 RegScavenger &RS) const {
8960 if (ST.hasAddNoCarry())
8961 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
8962
8963 // If available, prefer to use vcc.
8964 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
8965 ? Register(RI.getVCC())
8967 *RI.getBoolRC(), I, /* RestoreAfter */ false,
8968 0, /* AllowSpill */ false);
8969
8970 // TODO: Users need to deal with this.
8971 if (!UnusedCarry.isValid())
8972 return MachineInstrBuilder();
8973
8974 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8975 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8976}
8977
8978bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
8979 switch (Opcode) {
8980 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
8981 case AMDGPU::SI_KILL_I1_TERMINATOR:
8982 return true;
8983 default:
8984 return false;
8985 }
8986}
8987
8989 switch (Opcode) {
8990 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
8991 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
8992 case AMDGPU::SI_KILL_I1_PSEUDO:
8993 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
8994 default:
8995 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
8996 }
8997}
8998
8999bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9000 return Imm <= getMaxMUBUFImmOffset(ST);
9001}
9002
9004 // The GFX12 field is a 24-bit signed byte offset; only its non-negative range is used here.
9005 const unsigned OffsetBits =
9006 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9007 return (1 << OffsetBits) - 1;
9008}
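// Worked numbers for the two cases above: (1 << 12) - 1 = 4095 before GFX12,
// and (1 << 23) - 1 = 8388607 on GFX12 and later.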
9009
9011 if (!ST.isWave32())
9012 return;
9013
9014 if (MI.isInlineAsm())
9015 return;
9016
9017 for (auto &Op : MI.implicit_operands()) {
9018 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9019 Op.setReg(AMDGPU::VCC_LO);
9020 }
9021}
9022
9024 if (!isSMRD(MI))
9025 return false;
9026
9027 // Check that it is using a buffer resource.
9028 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9029 if (Idx == -1) // e.g. s_memtime
9030 return false;
9031
9032 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9033 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9034}
9035
9036// Given Imm, split it into the values to put into the SOffset and ImmOffset
9037// fields in an MUBUF instruction. Return false if it is not possible (due to a
9038// hardware bug needing a workaround).
9039//
9040// The required alignment ensures that individual address components remain
9041// aligned if they are aligned to begin with. It also ensures that additional
9042// offsets within the given alignment can be added to the resulting ImmOffset.
9044 uint32_t &ImmOffset, Align Alignment) const {
9045 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9046 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9047 uint32_t Overflow = 0;
9048
9049 if (Imm > MaxImm) {
9050 if (Imm <= MaxImm + 64) {
9051 // Use an SOffset inline constant for 4..64
9052 Overflow = Imm - MaxImm;
9053 Imm = MaxImm;
9054 } else {
9055 // Try to keep the same value in SOffset for adjacent loads, so that
9056 // the corresponding register contents can be re-used.
9057 //
9058 // Load values with all low-bits (except for alignment bits) set into
9059 // SOffset, so that a larger range of values can be covered using
9060 // s_movk_i32.
9061 //
9062 // Atomic operations fail to work correctly when individual address
9063 // components are unaligned, even if their sum is aligned.
9064 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9065 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9066 Imm = Low;
9067 Overflow = High - Alignment.value();
9068 }
9069 }
9070
9071 if (Overflow > 0) {
9072 // There is a hardware bug in SI and CI which prevents address clamping in
9073 // MUBUF instructions from working correctly with SOffsets. The immediate
9074 // offset is unaffected.
9076 return false;
9077
9078 // On some targets it is not possible to set an immediate in the SOffset field.
9079 if (ST.hasRestrictedSOffset())
9080 return false;
9081 }
9082
9083 ImmOffset = Imm;
9084 SOffset = Overflow;
9085 return true;
9086}
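// Worked example of the split above (pre-GFX12 numbers: MaxOffset = 4095,
// Alignment = 4, hence MaxImm = 4092). Imm = 5000 is too large for the
// SOffset inline-constant path, so
//   High = (5000 + 4) & ~4095 = 4096,   Low = (5000 + 4) & 4095 = 908,
//   ImmOffset = 908,  SOffset = 4096 - 4 = 4092,  and 4092 + 908 == 5000.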
9087
9088// Depending on the used address space and instructions, some immediate offsets
9089// are allowed and some are not.
9090// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9091// scratch instruction offsets can also be negative. On GFX12, offsets can be
9092// negative for all variants.
9093//
9094// There are several bugs related to these offsets:
9095// On gfx10.1, flat instructions that go into the global address space cannot
9096// use an offset.
9097//
9098// For scratch instructions, the address can be either an SGPR or a VGPR.
9099// The following offsets can be used, depending on the architecture (x means
9100// cannot be used):
9101// +----------------------------+------+------+
9102// | Address-Mode | SGPR | VGPR |
9103// +----------------------------+------+------+
9104// | gfx9 | | |
9105// | negative, 4-aligned offset | x | ok |
9106// | negative, unaligned offset | x | ok |
9107// +----------------------------+------+------+
9108// | gfx10 | | |
9109// | negative, 4-aligned offset | ok | ok |
9110// | negative, unaligned offset | ok | x |
9111// +----------------------------+------+------+
9112// | gfx10.3 | | |
9113// | negative, 4-aligned offset | ok | ok |
9114// | negative, unaligned offset | ok | ok |
9115// +----------------------------+------+------+
9116//
9117// This function ignores the addressing mode, so if an offset cannot be used in
9118// one addressing mode, it is considered illegal.
9119bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9120 uint64_t FlatVariant) const {
9121 // TODO: Should 0 be special cased?
9122 if (!ST.hasFlatInstOffsets())
9123 return false;
9124
9125 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9126 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9127 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9128 return false;
9129
9131 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9132 (Offset % 4) != 0) {
9133 return false;
9134 }
9135
9136 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9137 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9138 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9139}
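// The final check above is a signed-range test plus the sign restriction; as a
// standalone sketch (illustrative, with N taken from
// AMDGPU::getNumFlatOffsetBits):
//
//   static bool fitsFlatOffset(int64_t Offset, unsigned N, bool AllowNegative) {
//     int64_t Min = -(int64_t(1) << (N - 1));
//     int64_t Max = (int64_t(1) << (N - 1)) - 1;
//     return Offset >= Min && Offset <= Max && (AllowNegative || Offset >= 0);
//   }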
9140
9141// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9142std::pair<int64_t, int64_t>
9143SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9144 uint64_t FlatVariant) const {
9145 int64_t RemainderOffset = COffsetVal;
9146 int64_t ImmField = 0;
9147
9148 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9149 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9150
9151 if (AllowNegative) {
9152 // Use signed division by a power of two to truncate towards 0.
9153 int64_t D = 1LL << NumBits;
9154 RemainderOffset = (COffsetVal / D) * D;
9155 ImmField = COffsetVal - RemainderOffset;
9156
9158 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9159 (ImmField % 4) != 0) {
9160 // Make ImmField a multiple of 4
9161 RemainderOffset += ImmField % 4;
9162 ImmField -= ImmField % 4;
9163 }
9164 } else if (COffsetVal >= 0) {
9165 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9166 RemainderOffset = COffsetVal - ImmField;
9167 }
9168
9169 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9170 assert(RemainderOffset + ImmField == COffsetVal);
9171 return {ImmField, RemainderOffset};
9172}
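// Worked example for the signed path above (assuming a 13-bit signed immediate,
// i.e. NumBits = 12 and D = 4096): COffsetVal = 5000 splits into
// RemainderOffset = (5000 / 4096) * 4096 = 4096 and ImmField = 904, while
// COffsetVal = -5000 splits into RemainderOffset = -4096 and ImmField = -904,
// keeping the immediate inside [-4096, 4095].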
9173
9175 if (ST.hasNegativeScratchOffsetBug() &&
9176 FlatVariant == SIInstrFlags::FlatScratch)
9177 return false;
9178
9179 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9180}
9181
9182static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9183 switch (ST.getGeneration()) {
9184 default:
9185 break;
9188 return SIEncodingFamily::SI;
9191 return SIEncodingFamily::VI;
9198 }
9199 llvm_unreachable("Unknown subtarget generation!");
9200}
9201
9202bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9203 switch(MCOp) {
9204 // These opcodes use indirect register addressing so
9205 // they need special handling by codegen (currently missing).
9206 // Therefore it is too risky to allow these opcodes
9207 // to be selected by dpp combiner or sdwa peepholer.
9208 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9209 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9210 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9211 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9212 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9213 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9214 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9215 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9216 return true;
9217 default:
9218 return false;
9219 }
9220}
9221
9222int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9223 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9224
9225 unsigned Gen = subtargetEncodingFamily(ST);
9226
9227 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
9230
9231 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9232 // subtarget has UnpackedD16VMem feature.
9233 // TODO: remove this when we discard GFX80 encoding.
9234 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9236
9237 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9238 switch (ST.getGeneration()) {
9239 default:
9241 break;
9244 break;
9247 break;
9248 }
9249 }
9250
9251 if (isMAI(Opcode)) {
9252 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9253 if (MFMAOp != -1)
9254 Opcode = MFMAOp;
9255 }
9256
9257 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9258
9259 // -1 means that Opcode is already a native instruction.
9260 if (MCOp == -1)
9261 return Opcode;
9262
9263 if (ST.hasGFX90AInsts()) {
9264 uint16_t NMCOp = (uint16_t)-1;
9265 if (ST.hasGFX940Insts())
9267 if (NMCOp == (uint16_t)-1)
9269 if (NMCOp == (uint16_t)-1)
9271 if (NMCOp != (uint16_t)-1)
9272 MCOp = NMCOp;
9273 }
9274
9275 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9276 // no encoding in the given subtarget generation.
9277 if (MCOp == (uint16_t)-1)
9278 return -1;
9279
9280 if (isAsmOnlyOpcode(MCOp))
9281 return -1;
9282
9283 return MCOp;
9284}
9285
9286static
9288 assert(RegOpnd.isReg());
9289 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9290 getRegSubRegPair(RegOpnd);
9291}
9292
9295 assert(MI.isRegSequence());
9296 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9297 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9298 auto &RegOp = MI.getOperand(1 + 2 * I);
9299 return getRegOrUndef(RegOp);
9300 }
9302}
9303
9304// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9305// Following a subreg of reg:subreg isn't supported
9308 if (!RSR.SubReg)
9309 return false;
9310 switch (MI.getOpcode()) {
9311 default: break;
9312 case AMDGPU::REG_SEQUENCE:
9313 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9314 return true;
9315 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
9316 case AMDGPU::INSERT_SUBREG:
9317 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9318 // inserted the subreg we're looking for
9319 RSR = getRegOrUndef(MI.getOperand(2));
9320 else { // the subreg in the rest of the reg
9321 auto R1 = getRegOrUndef(MI.getOperand(1));
9322 if (R1.SubReg) // subreg of subreg isn't supported
9323 return false;
9324 RSR.Reg = R1.Reg;
9325 }
9326 return true;
9327 }
9328 return false;
9329}
9330
9333 assert(MRI.isSSA());
9334 if (!P.Reg.isVirtual())
9335 return nullptr;
9336
9337 auto RSR = P;
9338 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9339 while (auto *MI = DefInst) {
9340 DefInst = nullptr;
9341 switch (MI->getOpcode()) {
9342 case AMDGPU::COPY:
9343 case AMDGPU::V_MOV_B32_e32: {
9344 auto &Op1 = MI->getOperand(1);
9345 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9346 if (Op1.isUndef())
9347 return nullptr;
9348 RSR = getRegSubRegPair(Op1);
9349 DefInst = MRI.getVRegDef(RSR.Reg);
9350 }
9351 break;
9352 }
9353 default:
9354 if (followSubRegDef(*MI, RSR)) {
9355 if (!RSR.Reg)
9356 return nullptr;
9357 DefInst = MRI.getVRegDef(RSR.Reg);
9358 }
9359 }
9360 if (!DefInst)
9361 return MI;
9362 }
9363 return nullptr;
9364}
9365
9367 Register VReg,
9368 const MachineInstr &DefMI,
9369 const MachineInstr &UseMI) {
9370 assert(MRI.isSSA() && "Must be run on SSA");
9371
9372 auto *TRI = MRI.getTargetRegisterInfo();
9373 auto *DefBB = DefMI.getParent();
9374
9375 // Don't bother searching between blocks, although it is possible this block
9376 // doesn't modify exec.
9377 if (UseMI.getParent() != DefBB)
9378 return true;
9379
9380 const int MaxInstScan = 20;
9381 int NumInst = 0;
9382
9383 // Stop scan at the use.
9384 auto E = UseMI.getIterator();
9385 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9386 if (I->isDebugInstr())
9387 continue;
9388
9389 if (++NumInst > MaxInstScan)
9390 return true;
9391
9392 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9393 return true;
9394 }
9395
9396 return false;
9397}
9398
9400 Register VReg,
9401 const MachineInstr &DefMI) {
9402 assert(MRI.isSSA() && "Must be run on SSA");
9403
9404 auto *TRI = MRI.getTargetRegisterInfo();
9405 auto *DefBB = DefMI.getParent();
9406
9407 const int MaxUseScan = 10;
9408 int NumUse = 0;
9409
9410 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9411 auto &UseInst = *Use.getParent();
9412 // Don't bother searching between blocks, although it is possible this block
9413 // doesn't modify exec.
9414 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9415 return true;
9416
9417 if (++NumUse > MaxUseScan)
9418 return true;
9419 }
9420
9421 if (NumUse == 0)
9422 return false;
9423
9424 const int MaxInstScan = 20;
9425 int NumInst = 0;
9426
9427 // Stop scan when we have seen all the uses.
9428 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9429 assert(I != DefBB->end());
9430
9431 if (I->isDebugInstr())
9432 continue;
9433
9434 if (++NumInst > MaxInstScan)
9435 return true;
9436
9437 for (const MachineOperand &Op : I->operands()) {
9438 // We don't check reg masks here as they're used only on calls:
9439 // 1. EXEC is only considered const within one BB
9440 // 2. Call should be a terminator instruction if present in a BB
9441
9442 if (!Op.isReg())
9443 continue;
9444
9445 Register Reg = Op.getReg();
9446 if (Op.isUse()) {
9447 if (Reg == VReg && --NumUse == 0)
9448 return false;
9449 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9450 return true;
9451 }
9452 }
9453}
9454
9457 const DebugLoc &DL, Register Src, Register Dst) const {
9458 auto Cur = MBB.begin();
9459 if (Cur != MBB.end())
9460 do {
9461 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9462 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9463 ++Cur;
9464 } while (Cur != MBB.end() && Cur != LastPHIIt);
9465
9466 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9467 Dst);
9468}
9469
9472 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9473 if (InsPt != MBB.end() &&
9474 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9475 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9476 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9477 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9478 InsPt++;
9479 return BuildMI(MBB, InsPt, DL,
9480 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9481 : AMDGPU::S_MOV_B64_term),
9482 Dst)
9483 .addReg(Src, 0, SrcSubReg)
9484 .addReg(AMDGPU::EXEC, RegState::Implicit);
9485 }
9486 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9487 Dst);
9488}
9489
9490bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9491
9494 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9495 VirtRegMap *VRM) const {
9496 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9497 //
9498 // %0:sreg_32 = COPY $m0
9499 //
9500 // We explicitly chose SReg_32 for the virtual register so such a copy might
9501 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9502 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9503 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9504 // TargetInstrInfo::foldMemoryOperand() is going to try.
9505 // A similar issue also exists with spilling and reloading $exec registers.
9506 //
9507 // To prevent that, constrain the %0 register class here.
9508 if (isFullCopyInstr(MI)) {
9509 Register DstReg = MI.getOperand(0).getReg();
9510 Register SrcReg = MI.getOperand(1).getReg();
9511 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9512 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9514 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9515 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9516 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9517 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9518 return nullptr;
9519 }
9520 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9521 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9522 return nullptr;
9523 }
9524 }
9525 }
9526
9527 return nullptr;
9528}
9529
9531 const MachineInstr &MI,
9532 unsigned *PredCost) const {
9533 if (MI.isBundle()) {
9535 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9536 unsigned Lat = 0, Count = 0;
9537 for (++I; I != E && I->isBundledWithPred(); ++I) {
9538 ++Count;
9539 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9540 }
9541 return Lat + Count - 1;
9542 }
9543
9544 return SchedModel.computeInstrLatency(&MI);
9545}
9546
9549 unsigned opcode = MI.getOpcode();
9550 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9551 auto IID = GI->getIntrinsicID();
9556
9557 switch (IID) {
9558 case Intrinsic::amdgcn_if:
9559 case Intrinsic::amdgcn_else:
9560 // FIXME: Uniform if second result
9561 break;
9562 }
9563
9565 }
9566
9567 // Loads from the private and flat address spaces are divergent, because
9568 // threads can execute the load instruction with the same inputs and get
9569 // different results.
9570 //
9571 // All other loads are not divergent, because if threads issue loads with the
9572 // same arguments, they will always get the same result.
9573 if (opcode == AMDGPU::G_LOAD) {
9574 if (MI.memoperands_empty())
9575 return InstructionUniformity::NeverUniform; // conservative assumption
9576
9577 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9578 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9579 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9580 })) {
9581 // At least one MMO in a non-global address space.
9583 }
9585 }
9586
9587 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9588 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9589 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9590 AMDGPU::isGenericAtomic(opcode)) {
9592 }
9594}
9595
9598
9599 if (isNeverUniform(MI))
9601
9602 unsigned opcode = MI.getOpcode();
9603 if (opcode == AMDGPU::V_READLANE_B32 ||
9604 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9605 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9607
9608 if (isCopyInstr(MI)) {
9609 const MachineOperand &srcOp = MI.getOperand(1);
9610 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9611 const TargetRegisterClass *regClass =
9612 RI.getPhysRegBaseClass(srcOp.getReg());
9615 }
9617 }
9618
9619 // GMIR handling
9620 if (MI.isPreISelOpcode())
9622
9623 // Atomics are divergent because they are executed sequentially: when an
9624 // atomic operation refers to the same address in each thread, then each
9625 // thread after the first sees the value written by the previous thread as
9626 // the original value.
9627
9628 if (isAtomic(MI))
9630
9631 // Loads from the private and flat address spaces are divergent, because
9632 // threads can execute the load instruction with the same inputs and get
9633 // different results.
9634 if (isFLAT(MI) && MI.mayLoad()) {
9635 if (MI.memoperands_empty())
9636 return InstructionUniformity::NeverUniform; // conservative assumption
9637
9638 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9639 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9640 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9641 })) {
9642 // At least one MMO in a non-global address space.
9644 }
9645
9647 }
9648
9649 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9650 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9651
9652 // FIXME: It's conceptually broken to report this for an instruction, and not
9653 // a specific def operand. For inline asm in particular, there could be mixed
9654 // uniform and divergent results.
9655 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9656 const MachineOperand &SrcOp = MI.getOperand(I);
9657 if (!SrcOp.isReg())
9658 continue;
9659
9660 Register Reg = SrcOp.getReg();
9661 if (!Reg || !SrcOp.readsReg())
9662 continue;
9663
9664 // If RegBank is null, this is unassigned or an unallocatable special
9665 // register, which are all scalars.
9666 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9667 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9669 }
9670
9671 // TODO: The uniformity check conditions above can be rearranged for more
9672 // readability.
9673
9674 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9675 // currently turned into no-op COPYs by SelectionDAG ISel and are
9676 // therefore no longer recognizable.
9677
9679}
9680
9682 switch (MF.getFunction().getCallingConv()) {
9684 return 1;
9686 return 2;
9688 return 3;
9692 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9695 case CallingConv::C:
9696 case CallingConv::Fast:
9697 default:
9698 // Assume other calling conventions are various compute callable functions
9699 return 0;
9700 }
9701}
9702
9704 Register &SrcReg2, int64_t &CmpMask,
9705 int64_t &CmpValue) const {
9706 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9707 return false;
9708
9709 switch (MI.getOpcode()) {
9710 default:
9711 break;
9712 case AMDGPU::S_CMP_EQ_U32:
9713 case AMDGPU::S_CMP_EQ_I32:
9714 case AMDGPU::S_CMP_LG_U32:
9715 case AMDGPU::S_CMP_LG_I32:
9716 case AMDGPU::S_CMP_LT_U32:
9717 case AMDGPU::S_CMP_LT_I32:
9718 case AMDGPU::S_CMP_GT_U32:
9719 case AMDGPU::S_CMP_GT_I32:
9720 case AMDGPU::S_CMP_LE_U32:
9721 case AMDGPU::S_CMP_LE_I32:
9722 case AMDGPU::S_CMP_GE_U32:
9723 case AMDGPU::S_CMP_GE_I32:
9724 case AMDGPU::S_CMP_EQ_U64:
9725 case AMDGPU::S_CMP_LG_U64:
9726 SrcReg = MI.getOperand(0).getReg();
9727 if (MI.getOperand(1).isReg()) {
9728 if (MI.getOperand(1).getSubReg())
9729 return false;
9730 SrcReg2 = MI.getOperand(1).getReg();
9731 CmpValue = 0;
9732 } else if (MI.getOperand(1).isImm()) {
9733 SrcReg2 = Register();
9734 CmpValue = MI.getOperand(1).getImm();
9735 } else {
9736 return false;
9737 }
9738 CmpMask = ~0;
9739 return true;
9740 case AMDGPU::S_CMPK_EQ_U32:
9741 case AMDGPU::S_CMPK_EQ_I32:
9742 case AMDGPU::S_CMPK_LG_U32:
9743 case AMDGPU::S_CMPK_LG_I32:
9744 case AMDGPU::S_CMPK_LT_U32:
9745 case AMDGPU::S_CMPK_LT_I32:
9746 case AMDGPU::S_CMPK_GT_U32:
9747 case AMDGPU::S_CMPK_GT_I32:
9748 case AMDGPU::S_CMPK_LE_U32:
9749 case AMDGPU::S_CMPK_LE_I32:
9750 case AMDGPU::S_CMPK_GE_U32:
9751 case AMDGPU::S_CMPK_GE_I32:
9752 SrcReg = MI.getOperand(0).getReg();
9753 SrcReg2 = Register();
9754 CmpValue = MI.getOperand(1).getImm();
9755 CmpMask = ~0;
9756 return true;
9757 }
9758
9759 return false;
9760}
9761
9763 Register SrcReg2, int64_t CmpMask,
9764 int64_t CmpValue,
9765 const MachineRegisterInfo *MRI) const {
9766 if (!SrcReg || SrcReg.isPhysical())
9767 return false;
9768
9769 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9770 return false;
9771
9772 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9773 this](int64_t ExpectedValue, unsigned SrcSize,
9774 bool IsReversible, bool IsSigned) -> bool {
9775 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9776 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9777 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9778 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9779 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9780 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9781 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9782 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9783 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9784 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9785 //
9786 // Signed ge/gt are not used for the sign bit.
9787 //
9788 // If result of the AND is unused except in the compare:
9789 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9790 //
9791 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9792 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9793 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9794 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9795 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9796 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9797
9798 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9799 if (!Def || Def->getParent() != CmpInstr.getParent())
9800 return false;
9801
9802 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9803 Def->getOpcode() != AMDGPU::S_AND_B64)
9804 return false;
9805
9806 int64_t Mask;
9807 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9808 if (MO->isImm())
9809 Mask = MO->getImm();
9810 else if (!getFoldableImm(MO, Mask))
9811 return false;
9812 Mask &= maxUIntN(SrcSize);
9813 return isPowerOf2_64(Mask);
9814 };
9815
9816 MachineOperand *SrcOp = &Def->getOperand(1);
9817 if (isMask(SrcOp))
9818 SrcOp = &Def->getOperand(2);
9819 else if (isMask(&Def->getOperand(2)))
9820 SrcOp = &Def->getOperand(1);
9821 else
9822 return false;
9823
9824 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9825 if (IsSigned && BitNo == SrcSize - 1)
9826 return false;
9827
9828 ExpectedValue <<= BitNo;
9829
9830 bool IsReversedCC = false;
9831 if (CmpValue != ExpectedValue) {
9832 if (!IsReversible)
9833 return false;
9834 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9835 if (!IsReversedCC)
9836 return false;
9837 }
9838
9839 Register DefReg = Def->getOperand(0).getReg();
9840 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9841 return false;
9842
9843 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9844 I != E; ++I) {
9845 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9846 I->killsRegister(AMDGPU::SCC, &RI))
9847 return false;
9848 }
9849
9850 MachineOperand *SccDef =
9851 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9852 SccDef->setIsDead(false);
9853 CmpInstr.eraseFromParent();
9854
9855 if (!MRI->use_nodbg_empty(DefReg)) {
9856 assert(!IsReversedCC);
9857 return true;
9858 }
9859
9860 // Replace AND with unused result with a S_BITCMP.
9861 MachineBasicBlock *MBB = Def->getParent();
9862
9863 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9864 : AMDGPU::S_BITCMP1_B32
9865 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9866 : AMDGPU::S_BITCMP1_B64;
9867
9868 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9869 .add(*SrcOp)
9870 .addImm(BitNo);
9871 Def->eraseFromParent();
9872
9873 return true;
9874 };
9875
9876 switch (CmpInstr.getOpcode()) {
9877 default:
9878 break;
9879 case AMDGPU::S_CMP_EQ_U32:
9880 case AMDGPU::S_CMP_EQ_I32:
9881 case AMDGPU::S_CMPK_EQ_U32:
9882 case AMDGPU::S_CMPK_EQ_I32:
9883 return optimizeCmpAnd(1, 32, true, false);
9884 case AMDGPU::S_CMP_GE_U32:
9885 case AMDGPU::S_CMPK_GE_U32:
9886 return optimizeCmpAnd(1, 32, false, false);
9887 case AMDGPU::S_CMP_GE_I32:
9888 case AMDGPU::S_CMPK_GE_I32:
9889 return optimizeCmpAnd(1, 32, false, true);
9890 case AMDGPU::S_CMP_EQ_U64:
9891 return optimizeCmpAnd(1, 64, true, false);
9892 case AMDGPU::S_CMP_LG_U32:
9893 case AMDGPU::S_CMP_LG_I32:
9894 case AMDGPU::S_CMPK_LG_U32:
9895 case AMDGPU::S_CMPK_LG_I32:
9896 return optimizeCmpAnd(0, 32, true, false);
9897 case AMDGPU::S_CMP_GT_U32:
9898 case AMDGPU::S_CMPK_GT_U32:
9899 return optimizeCmpAnd(0, 32, false, false);
9900 case AMDGPU::S_CMP_GT_I32:
9901 case AMDGPU::S_CMPK_GT_I32:
9902 return optimizeCmpAnd(0, 32, false, true);
9903 case AMDGPU::S_CMP_LG_U64:
9904 return optimizeCmpAnd(0, 64, true, false);
9905 }
9906
9907 return false;
9908}
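// Worked example of the peephole above (sketch, virtual-register MIR):
//   %1:sreg_32 = S_AND_B32 %0, 4      ; Mask = 1 << 2
//   S_CMP_EQ_U32 %1, 4                ; CmpValue == ExpectedValue << 2
// The compare is erased, and if %1 has no remaining uses the AND is rewritten to
//   S_BITCMP1_B32 %0, 2
// which sets SCC to bit 2 of %0 directly.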
9909
9911 unsigned OpName) const {
9912 if (!ST.needsAlignedVGPRs())
9913 return;
9914
9915 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
9916 if (OpNo < 0)
9917 return;
9918 MachineOperand &Op = MI.getOperand(OpNo);
9919 if (getOpSize(MI, OpNo) > 4)
9920 return;
9921
9922 // Add implicit aligned super-reg to force alignment on the data operand.
9923 const DebugLoc &DL = MI.getDebugLoc();
9924 MachineBasicBlock *BB = MI.getParent();
9926 Register DataReg = Op.getReg();
9927 bool IsAGPR = RI.isAGPR(MRI, DataReg);
9928 Register Undef = MRI.createVirtualRegister(
9929 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
9930 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
9931 Register NewVR =
9932 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
9933 : &AMDGPU::VReg_64_Align2RegClass);
9934 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
9935 .addReg(DataReg, 0, Op.getSubReg())
9936 .addImm(AMDGPU::sub0)
9937 .addReg(Undef)
9938 .addImm(AMDGPU::sub1);
9939 Op.setReg(NewVR);
9940 Op.setSubReg(AMDGPU::sub0);
9941 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
9942}
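// Illustrative effect of the code above (pseudo-MIR sketch, not from the file):
// a 32-bit AGPR data operand %d is wrapped as
//   %hi   = IMPLICIT_DEF                      ; AGPR_32
//   %pair = REG_SEQUENCE %d:sub0, %hi:sub1    ; AReg_64_Align2
// and the operand is rewritten to %pair.sub0 plus an implicit use of %pair, so
// the allocator must assign the data register to the even half of an aligned
// pair.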
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
amdgpu AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:82
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:73
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:78
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:807
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:745
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:749
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:999
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:391
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:627
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:765
bool hasMAIInsts() const
Definition: GCNSubtarget.h:815
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:278
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:298
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:761
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:680
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:753
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:344
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
Generation getGeneration() const
Definition: GCNSubtarget.h:317
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:924
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:732
bool hasAddr64() const
Definition: GCNSubtarget.h:381
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:724
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:537
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:607
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:617
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:193
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:393
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
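computeRegisterLiveness together with the LQR_* results is the usual way to ask whether a physical register such as SCC may be clobbered at a point. A minimal sketch, assuming the caller already has the insertion point and register info (the choice of SCC is illustrative, not taken from this file):
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
using namespace llvm;
// Sketch: return true if SCC is known dead just before I, so an instruction
// that clobbers SCC could be inserted there without saving it.
static bool isSCCDeadBefore(const MachineBasicBlock &MBB,
                            MachineBasicBlock::const_iterator I,
                            const TargetRegisterInfo *TRI) {
  return MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, I) ==
         MachineBasicBlock::LQR_Dead;
}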
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB)
addNewBlock - Add a new node to the dominator tree information.
bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
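These MachineInstrBuilder helpers are normally chained off BuildMI; a minimal sketch that emits a scalar add in front of an existing instruction (the opcode, registers, and immediate are illustrative assumptions):
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
// Sketch: emit "DestReg = S_ADD_U32 SrcReg, 42" in front of InsertPt.
static void emitExampleAdd(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator InsertPt,
                           const DebugLoc &DL, Register DestReg,
                           Register SrcReg) {
  BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::S_ADD_U32), DestReg)
      .addReg(SrcReg) // first source register
      .addImm(42);    // immediate second source
}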
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:572
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:691
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:815
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:800
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:782
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:498
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:699
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:391
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound=false)
We have determined MI defined a register without a use.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
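ChangeToImmediate and the other ChangeTo* methods rewrite an operand in place; a hedged sketch of the common immediate-folding step (the precondition is illustrative, real callers must also prove the register actually holds the constant):
#include "llvm/CodeGen/MachineOperand.h"
using namespace llvm;
// Sketch: turn a register use into an immediate once the register is known to
// hold Value. Definitions are left untouched.
static void foldKnownConstant(MachineOperand &MO, int64_t Value) {
  if (MO.isReg() && !MO.isDef())
    MO.ChangeToImmediate(Value);
}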
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
bool isNonUniformBranchInstr(MachineInstr &Instr) const
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:924
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
Definition: SIInstrInfo.h:1144
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1272
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
bool isSpill(uint16_t Opcode) const
Definition: SIInstrInfo.h:740
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
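getNamedOperand is the usual way to read encoding-specific operands by name rather than by fixed position. A small sketch, assuming MI is a memory instruction that may or may not carry a named offset operand:
#include "SIInstrInfo.h"
using namespace llvm;
// Sketch: read the byte offset of a memory instruction, treating a missing
// offset operand as zero.
static int64_t getOffsetOrZero(const SIInstrInfo &TII, const MachineInstr &MI) {
  if (const MachineOperand *Off =
          TII.getNamedOperand(MI, AMDGPU::OpName::offset))
    return Off->getImm();
  return 0;
}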
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:957
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:716
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:1003
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineBasicBlock *IfEnd) const
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:681
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:863
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:728
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
bool isBarrier(unsigned Opcode) const
Definition: SIInstrInfo.h:939
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1285
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:880
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:63
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:65
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:237
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
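Passes that create instructions while SlotIndexes is preserved mirror each insertion in the index maps; a minimal sketch of that bookkeeping step (the null check reflects that the analysis may be absent):
#include "llvm/CodeGen/SlotIndexes.h"
using namespace llvm;
// Sketch: after building NewMI, register it with SlotIndexes (when available)
// so later slot-index queries remain valid.
static void registerNewInstr(SlotIndexes *Indexes, MachineInstr &NewMI) {
  if (Indexes)
    Indexes->insertMachineInstrInMaps(NewMI);
}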
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1563
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1564
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1566
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point uses.
bool isHi(unsigned Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:409
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:411
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:408
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:410
@ TI_CONSTDATA_START
Definition: AMDGPU.h:407
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1565
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1454
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
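llvm::all_of takes a range directly rather than a begin/end pair; a tiny self-contained sketch (the offset predicate is purely illustrative):
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
// Sketch: check that every recorded offset is non-negative.
static bool allNonNegative(const SmallVectorImpl<int64_t> &Offsets) {
  return all_of(Offsets, [](int64_t O) { return O >= 0; });
}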
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:547
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
int countr_zero(T Val)
Count the number of zero bits from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
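alignDown, isPowerOf2_64, and countr_zero often appear together when decomposing offsets; a small worked sketch (for example alignDown(13, 4) == 12 and countr_zero(4u) == 2):
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
using namespace llvm;
// Sketch: round Offset down to a power-of-two Alignment and return the
// corresponding shift amount.
static unsigned alignOffset(uint64_t Offset, uint64_t Alignment,
                            uint64_t &Aligned) {
  assert(isPowerOf2_64(Alignment) && "alignment must be a power of two");
  Aligned = alignDown(Offset, Alignment);
  return countr_zero(Alignment);
}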
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
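Hi_32 and Lo_32 are the usual way to split a 64-bit value into the halves consumed by a pair of 32-bit operations; a minimal sketch:
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <utility>
using namespace llvm;
// Sketch: split a 64-bit immediate into its low and high 32-bit halves.
static std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  return {Lo_32(Imm), Hi_32(Imm)};
}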
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:260
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:219
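maxUIntN(N) is 2^N - 1 (so maxUIntN(12) == 4095) and pairs naturally with isIntN for signed fields; a small sketch of an unsigned range check (the 12-bit width is an illustrative assumption, not tied to any particular encoding):
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;
// Sketch: check whether Offset fits in an unsigned 12-bit immediate field.
static bool fitsUnsigned12(uint64_t Offset) {
  return Offset <= maxUIntN(12);
}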
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:235
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks through which this value is completely alive.
Definition: LiveVariables.h:83
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:49
MachineInstr * top() const
Definition: SIInstrInfo.h:54
bool empty() const
Definition: SIInstrInfo.h:64
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:73
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.