1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm::AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48} // namespace llvm::AMDGPU
49
50// Must be at least 4 to be able to branch over minimum unconditional branch
51// code. This is only for making it possible to write reasonably small tests for
52// long branches.
53static cl::opt<unsigned>
54BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
56
58 "amdgpu-fix-16-bit-physreg-copies",
59 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60 cl::init(true),
62
63SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(&ST);
67}
68
69//===----------------------------------------------------------------------===//
70// TargetInstrInfo callbacks
71//===----------------------------------------------------------------------===//
72
73static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
78}
79
80/// Returns true if both nodes have the same value for the given
81/// operand \p Op, or if both nodes do not have this operand.
82static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
85
86 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
87 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
88
89 if (Op0Idx == -1 && Op1Idx == -1)
90 return true;
91
92
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
95 return false;
96
97 // getNamedOperandIdx returns the index for the MachineInstr's operands,
98 // which includes the result as the first operand. We are indexing into the
99 // MachineSDNode's operands, so we need to skip the result operand to get
100 // the real index.
101 --Op0Idx;
102 --Op1Idx;
103
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
105}
106
107static bool canRemat(const MachineInstr &MI) {
108
109 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
110 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
111 SIInstrInfo::isSALU(MI))
112 return true;
113
114 if (SIInstrInfo::isSMRD(MI)) {
115 return !MI.memoperands_empty() &&
116 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
117 return MMO->isLoad() && MMO->isInvariant();
118 });
119 }
120
121 return false;
122}
123
124bool SIInstrInfo::isReallyTriviallyReMaterializable(
125 const MachineInstr &MI) const {
126
127 if (canRemat(MI)) {
128 // Normally VALU use of exec would block the rematerialization, but that
129 // is OK in this case to have an implicit exec read as all VALU do.
130 // We really want all of the generic logic for this except for this.
131
132 // Another potential implicit use is mode register. The core logic of
133 // the RA will not attempt rematerialization if mode is set anywhere
134 // in the function, otherwise it is safe since mode is not changed.
135
136 // There is difference to generic method which does not allow
137 // rematerialization if there are virtual register uses. We allow this,
138 // therefore this method includes SOP instructions as well.
139 if (!MI.hasImplicitDef() &&
140 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
141 !MI.mayRaiseFPException())
142 return true;
143 }
144
145 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
146}
147
148// Returns true if the scalar result of a VALU instruction depends on exec.
149static bool resultDependsOnExec(const MachineInstr &MI) {
150 // Ignore comparisons which are only used masked with exec.
151 // This allows some hoisting/sinking of VALU comparisons.
152 if (MI.isCompare()) {
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
154 Register DstReg = MI.getOperand(0).getReg();
155 if (!DstReg.isVirtual())
156 return true;
157 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
158 switch (Use.getOpcode()) {
159 case AMDGPU::S_AND_SAVEEXEC_B32:
160 case AMDGPU::S_AND_SAVEEXEC_B64:
161 break;
162 case AMDGPU::S_AND_B32:
163 case AMDGPU::S_AND_B64:
164 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
165 return true;
166 break;
167 default:
168 return true;
169 }
170 }
171 return false;
172 }
173
174 switch (MI.getOpcode()) {
175 default:
176 break;
177 case AMDGPU::V_READFIRSTLANE_B32:
178 return true;
179 }
180
181 return false;
182}
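// A worked illustration of the check above, under the assumption of a
// hypothetical V_CMP whose virtual result feeds only S_AND_SAVEEXEC_B64, or
// S_AND_B64 instructions that also read exec: such a compare is reported as
// not depending on exec, so it stays a candidate for hoisting/sinking. A
// V_READFIRSTLANE_B32, by contrast, is always treated as exec-dependent.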
183
184bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185 // Any implicit use of exec by VALU is not a real register read.
186 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
188}
189
190bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
191 MachineBasicBlock *SuccToSinkTo,
192 MachineCycleInfo *CI) const {
193 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
194 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
195 return true;
196
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
198 // Check if sinking of MI would create temporal divergent use.
199 for (auto Op : MI.uses()) {
200 if (Op.isReg() && Op.getReg().isVirtual() &&
201 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
202 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
203
204 // SgprDef defined inside cycle
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
206 if (FromCycle == nullptr)
207 continue;
208
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
210 // Check if there is a FromCycle that contains SgprDef's basic block but
211 // does not contain SuccToSinkTo and also has divergent exit condition.
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
213 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
214 FromCycle->getExitingBlocks(ExitingBlocks);
215
216 // FromCycle has divergent exit condition.
217 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
218 if (hasDivergentBranch(ExitingBlock))
219 return false;
220 }
221
222 FromCycle = FromCycle->getParentCycle();
223 }
224 }
225 }
226
227 return true;
228}
229
230bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
231 int64_t &Offset0,
232 int64_t &Offset1) const {
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
234 return false;
235
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
238
239 // Make sure both are actually loads.
240 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
241 return false;
242
243 // A mayLoad instruction without a def is not a load. Likely a prefetch.
244 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
245 return false;
246
247 if (isDS(Opc0) && isDS(Opc1)) {
248
249 // FIXME: Handle this case:
250 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
251 return false;
252
253 // Check base reg.
254 if (Load0->getOperand(0) != Load1->getOperand(0))
255 return false;
256
257 // Skip read2 / write2 variants for simplicity.
258 // TODO: We should report true if the used offsets are adjacent (excluded
259 // st64 versions).
260 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
261 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
262 if (Offset0Idx == -1 || Offset1Idx == -1)
263 return false;
264
265 // XXX - be careful of dataless loads
266 // getNamedOperandIdx returns the index for MachineInstrs. Since they
267 // include the output in the operand list, but SDNodes don't, we need to
268 // subtract the index by one.
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
273 return true;
274 }
275
276 if (isSMRD(Opc0) && isSMRD(Opc1)) {
277 // Skip time and cache invalidation instructions.
278 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
279 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
280 return false;
281
282 unsigned NumOps = getNumOperandsNoGlue(Load0);
283 if (NumOps != getNumOperandsNoGlue(Load1))
284 return false;
285
286 // Check base reg.
287 if (Load0->getOperand(0) != Load1->getOperand(0))
288 return false;
289
290 // Match register offsets, if both register and immediate offsets present.
291 assert(NumOps == 4 || NumOps == 5);
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
293 return false;
294
295 const ConstantSDNode *Load0Offset =
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
297 const ConstantSDNode *Load1Offset =
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
299
300 if (!Load0Offset || !Load1Offset)
301 return false;
302
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
305 return true;
306 }
307
308 // MUBUF and MTBUF can access the same addresses.
309 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
310
311 // MUBUF and MTBUF have vaddr at different indices.
312 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
313 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
314 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
315 return false;
316
317 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
318 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
319
320 if (OffIdx0 == -1 || OffIdx1 == -1)
321 return false;
322
323 // getNamedOperandIdx returns the index for MachineInstrs. Since they
324 // include the output in the operand list, but SDNodes don't, we need to
325 // subtract the index by one.
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
328
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
331
332 // The offset might be a FrameIndexSDNode.
333 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
334 return false;
335
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
338 return true;
339 }
340
341 return false;
342}
343
344static bool isStride64(unsigned Opc) {
345 switch (Opc) {
346 case AMDGPU::DS_READ2ST64_B32:
347 case AMDGPU::DS_READ2ST64_B64:
348 case AMDGPU::DS_WRITE2ST64_B32:
349 case AMDGPU::DS_WRITE2ST64_B64:
350 return true;
351 default:
352 return false;
353 }
354}
355
356bool SIInstrInfo::getMemOperandsWithOffsetWidth(
357 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
358 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
359 const TargetRegisterInfo *TRI) const {
360 if (!LdSt.mayLoadOrStore())
361 return false;
362
363 unsigned Opc = LdSt.getOpcode();
364 OffsetIsScalable = false;
365 const MachineOperand *BaseOp, *OffsetOp;
366 int DataOpIdx;
367
368 if (isDS(LdSt)) {
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
370 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
371 if (OffsetOp) {
372 // Normal, single offset LDS instruction.
373 if (!BaseOp) {
374 // DS_CONSUME/DS_APPEND use M0 for the base address.
375 // TODO: find the implicit use operand for M0 and use that as BaseOp?
376 return false;
377 }
378 BaseOps.push_back(BaseOp);
379 Offset = OffsetOp->getImm();
380 // Get appropriate operand, and compute width accordingly.
381 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
382 if (DataOpIdx == -1)
383 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
384 Width = getOpSize(LdSt, DataOpIdx);
385 } else {
386 // The 2 offset instructions use offset0 and offset1 instead. We can treat
387 // these as a load with a single offset if the 2 offsets are consecutive.
388 // We will use this for some partially aligned loads.
389 const MachineOperand *Offset0Op =
390 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
391 const MachineOperand *Offset1Op =
392 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
393
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
396 if (Offset0 + 1 != Offset1)
397 return false;
398
399 // Each of these offsets is in element sized units, so we need to convert
400 // to bytes of the individual reads.
401
402 unsigned EltSize;
403 if (LdSt.mayLoad())
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
405 else {
406 assert(LdSt.mayStore());
407 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
409 }
410
411 if (isStride64(Opc))
412 EltSize *= 64;
413
414 BaseOps.push_back(BaseOp);
415 Offset = EltSize * Offset0;
416 // Get appropriate operand(s), and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
418 if (DataOpIdx == -1) {
419 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 Width = getOpSize(LdSt, DataOpIdx);
421 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
422 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
423 } else {
424 Width = getOpSize(LdSt, DataOpIdx);
425 }
426 }
427 return true;
428 }
429
430 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
431 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
432 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
433 return false;
434 BaseOps.push_back(RSrc);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
436 if (BaseOp && !BaseOp->isFI())
437 BaseOps.push_back(BaseOp);
438 const MachineOperand *OffsetImm =
439 getNamedOperand(LdSt, AMDGPU::OpName::offset);
440 Offset = OffsetImm->getImm();
441 const MachineOperand *SOffset =
442 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
443 if (SOffset) {
444 if (SOffset->isReg())
445 BaseOps.push_back(SOffset);
446 else
447 Offset += SOffset->getImm();
448 }
449 // Get appropriate operand, and compute width accordingly.
450 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
451 if (DataOpIdx == -1)
452 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
453 if (DataOpIdx == -1) // LDS DMA
454 return false;
455 Width = getOpSize(LdSt, DataOpIdx);
456 return true;
457 }
458
459 if (isImage(LdSt)) {
460 auto RsrcOpName =
461 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
462 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
463 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
464 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
465 if (VAddr0Idx >= 0) {
466 // GFX10 possible NSA encoding.
467 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
468 BaseOps.push_back(&LdSt.getOperand(I));
469 } else {
470 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
471 }
472 Offset = 0;
473 // Get appropriate operand, and compute width accordingly.
474 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
475 if (DataOpIdx == -1)
476 return false; // no return sampler
477 Width = getOpSize(LdSt, DataOpIdx);
478 return true;
479 }
480
481 if (isSMRD(LdSt)) {
482 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
483 if (!BaseOp) // e.g. S_MEMTIME
484 return false;
485 BaseOps.push_back(BaseOp);
486 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
487 Offset = OffsetOp ? OffsetOp->getImm() : 0;
488 // Get appropriate operand, and compute width accordingly.
489 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
490 if (DataOpIdx == -1)
491 return false;
492 Width = getOpSize(LdSt, DataOpIdx);
493 return true;
494 }
495
496 if (isFLAT(LdSt)) {
497 // Instructions have either vaddr or saddr or both or none.
498 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
499 if (BaseOp)
500 BaseOps.push_back(BaseOp);
501 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
502 if (BaseOp)
503 BaseOps.push_back(BaseOp);
504 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
507 if (DataOpIdx == -1)
508 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
509 if (DataOpIdx == -1) // LDS DMA
510 return false;
511 Width = getOpSize(LdSt, DataOpIdx);
512 return true;
513 }
514
515 return false;
516}
517
518static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
519 ArrayRef<const MachineOperand *> BaseOps1,
520 const MachineInstr &MI2,
521 ArrayRef<const MachineOperand *> BaseOps2) {
522 // Only examine the first "base" operand of each instruction, on the
523 // assumption that it represents the real base address of the memory access.
524 // Other operands are typically offsets or indices from this base address.
525 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
526 return true;
527
528 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
529 return false;
530
531 auto MO1 = *MI1.memoperands_begin();
532 auto MO2 = *MI2.memoperands_begin();
533 if (MO1->getAddrSpace() != MO2->getAddrSpace())
534 return false;
535
536 auto Base1 = MO1->getValue();
537 auto Base2 = MO2->getValue();
538 if (!Base1 || !Base2)
539 return false;
540 Base1 = getUnderlyingObject(Base1);
541 Base2 = getUnderlyingObject(Base2);
542
543 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
544 return false;
545
546 return Base1 == Base2;
547}
548
549bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
550 int64_t Offset1, bool OffsetIsScalable1,
551 ArrayRef<const MachineOperand *> BaseOps2,
552 int64_t Offset2, bool OffsetIsScalable2,
553 unsigned ClusterSize,
554 unsigned NumBytes) const {
555 // If the mem ops (to be clustered) do not have the same base ptr, then they
556 // should not be clustered
557 if (!BaseOps1.empty() && !BaseOps2.empty()) {
558 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
559 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
560 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
561 return false;
562 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
563 // If only one base op is empty, they do not have the same base ptr
564 return false;
565 }
566
567 // In order to avoid register pressure, on an average, the number of DWORDS
568 // loaded together by all clustered mem ops should not exceed 8. This is an
569 // empirical value based on certain observations and performance related
570 // experiments.
571 // The good thing about this heuristic is - it avoids clustering of too many
572 // sub-word loads, and also avoids clustering of wide loads. Below is the
573 // brief summary of how the heuristic behaves for various `LoadSize`.
574 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
575 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
576 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
577 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
578 // (5) LoadSize >= 17: do not cluster
579 const unsigned LoadSize = NumBytes / ClusterSize;
580 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
581 return NumDWORDs <= 8;
582}
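// A worked example of the DWORD heuristic above, assuming a hypothetical
// cluster of four loads: four 8-byte loads give LoadSize = 32 / 4 = 8 and
// NumDWORDs = ((8 + 3) / 4) * 4 = 8, so clustering is allowed; four 12-byte
// loads give NumDWORDs = 12, which exceeds the budget of 8 and is rejected.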
583
584// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
585// the first 16 loads will be interleaved with the stores, and the next 16 will
586// be clustered as expected. It should really split into 2 16 store batches.
587//
588// Loads are clustered until this returns false, rather than trying to schedule
589// groups of stores. This also means we have to deal with saying different
590// address space loads should be clustered, and ones which might cause bank
591// conflicts.
592//
593// This might be deprecated so it might not be worth that much effort to fix.
594bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
595 int64_t Offset0, int64_t Offset1,
596 unsigned NumLoads) const {
597 assert(Offset1 > Offset0 &&
598 "Second offset should be larger than first offset!");
599 // If we have less than 16 loads in a row, and the offsets are within 64
600 // bytes, then schedule together.
601
602 // A cacheline is 64 bytes (for global memory).
603 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
604}
605
606static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
607 MachineBasicBlock::iterator MI,
608 const DebugLoc &DL, MCRegister DestReg,
609 MCRegister SrcReg, bool KillSrc,
610 const char *Msg = "illegal VGPR to SGPR copy") {
611 MachineFunction *MF = MBB.getParent();
612 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
613 LLVMContext &C = MF->getFunction().getContext();
614 C.diagnose(IllegalCopy);
615
616 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
617 .addReg(SrcReg, getKillRegState(KillSrc));
618}
619
620/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
621/// possible to have a direct copy in these cases on GFX908, so an intermediate
622/// VGPR copy is required.
623static void indirectCopyToAGPR(const SIInstrInfo &TII,
624 MachineBasicBlock &MBB,
625 MachineBasicBlock::iterator MI,
626 const DebugLoc &DL, MCRegister DestReg,
627 MCRegister SrcReg, bool KillSrc,
628 RegScavenger &RS, bool RegsOverlap,
629 Register ImpDefSuperReg = Register(),
630 Register ImpUseSuperReg = Register()) {
631 assert((TII.getSubtarget().hasMAIInsts() &&
632 !TII.getSubtarget().hasGFX90AInsts()) &&
633 "Expected GFX908 subtarget.");
634
635 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
636 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
637 "Source register of the copy should be either an SGPR or an AGPR.");
638
639 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
640 "Destination register of the copy should be an AGPR.");
641
642 const SIRegisterInfo &RI = TII.getRegisterInfo();
643
644 // First try to find defining accvgpr_write to avoid temporary registers.
645 // In the case of copies of overlapping AGPRs, we conservatively do not
646 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
647 // an accvgpr_write used for this same copy due to implicit-defs
648 if (!RegsOverlap) {
649 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
650 --Def;
651
652 if (!Def->modifiesRegister(SrcReg, &RI))
653 continue;
654
655 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
656 Def->getOperand(0).getReg() != SrcReg)
657 break;
658
659 MachineOperand &DefOp = Def->getOperand(1);
660 assert(DefOp.isReg() || DefOp.isImm());
661
662 if (DefOp.isReg()) {
663 bool SafeToPropagate = true;
664 // Check that register source operand is not clobbered before MI.
665 // Immediate operands are always safe to propagate.
666 for (auto I = Def; I != MI && SafeToPropagate; ++I)
667 if (I->modifiesRegister(DefOp.getReg(), &RI))
668 SafeToPropagate = false;
669
670 if (!SafeToPropagate)
671 break;
672
673 DefOp.setIsKill(false);
674 }
675
676 MachineInstrBuilder Builder =
677 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
678 .add(DefOp);
679 if (ImpDefSuperReg)
680 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
681
682 if (ImpUseSuperReg) {
683 Builder.addReg(ImpUseSuperReg,
684 getKillRegState(KillSrc) | RegState::Implicit);
685 }
686
687 return;
688 }
689 }
690
691 RS.enterBasicBlockEnd(MBB);
692 RS.backward(std::next(MI));
693
694 // Ideally we want to have three registers for a long reg_sequence copy
695 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
696 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
697 *MBB.getParent());
698
699 // Registers in the sequence are allocated contiguously so we can just
700 // use register number to pick one of three round-robin temps.
701 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
702 Register Tmp =
703 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
704 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
705 "VGPR used for an intermediate copy should have been reserved.");
706
707 // Only loop through if there are any free registers left. We don't want to
708 // spill.
709 while (RegNo--) {
710 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
711 /* RestoreAfter */ false, 0,
712 /* AllowSpill */ false);
713 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
714 break;
715 Tmp = Tmp2;
716 RS.setRegUsed(Tmp);
717 }
718
719 // Insert copy to temporary VGPR.
720 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
721 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
722 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
723 } else {
724 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
725 }
726
727 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
728 .addReg(SrcReg, getKillRegState(KillSrc));
729 if (ImpUseSuperReg) {
730 UseBuilder.addReg(ImpUseSuperReg,
731 getKillRegState(KillSrc) | RegState::Implicit);
732 }
733
734 MachineInstrBuilder DefBuilder
735 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
736 .addReg(Tmp, RegState::Kill);
737
738 if (ImpDefSuperReg)
739 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
740}
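// A minimal sketch of the fallback path above for a hypothetical
// SGPR -> AGPR copy on GFX908, assuming no reusable V_ACCVGPR_WRITE def is
// found and the reserved intermediate VGPR is $vgpr32:
//   $vgpr32 = V_MOV_B32_e32 $sgpr4        ; stage through the temporary VGPR
//   $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr32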
741
742static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
743 MachineBasicBlock::iterator I, const DebugLoc &DL,
744 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
745 const TargetRegisterClass *RC, bool Forward) {
746 const SIRegisterInfo &RI = TII.getRegisterInfo();
747 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
749 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
750
751 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
752 int16_t SubIdx = BaseIndices[Idx];
753 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
754 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
755 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
756 unsigned Opcode = AMDGPU::S_MOV_B32;
757
758 // Is SGPR aligned? If so try to combine with next.
759 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
760 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
761 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
762 // Can use SGPR64 copy
763 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
764 SubIdx = RI.getSubRegFromChannel(Channel, 2);
765 DestSubReg = RI.getSubReg(DestReg, SubIdx);
766 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
767 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
768 Opcode = AMDGPU::S_MOV_B64;
769 Idx++;
770 }
771
772 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
773 .addReg(SrcSubReg)
774 .addReg(SrcReg, RegState::Implicit);
775
776 if (!FirstMI)
777 FirstMI = LastMI;
778
779 if (!Forward)
780 I--;
781 }
782
783 assert(FirstMI && LastMI);
784 if (!Forward)
785 std::swap(FirstMI, LastMI);
786
787 FirstMI->addOperand(
788 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
789
790 if (KillSrc)
791 LastMI->addRegisterKilled(SrcReg, &RI);
792}
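// A worked example for the helper above, assuming a hypothetical aligned copy
// of $sgpr8_sgpr9_sgpr10_sgpr11 into $sgpr4_sgpr5_sgpr6_sgpr7: both tuples are
// even-aligned, so the loop emits two S_MOV_B64 moves instead of four
// S_MOV_B32, with the first move carrying an implicit def of the whole
// destination and the last one carrying the (possibly killed) implicit use of
// the whole source.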
793
794void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
795 MachineBasicBlock::iterator MI,
796 const DebugLoc &DL, MCRegister DestReg,
797 MCRegister SrcReg, bool KillSrc) const {
798 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
799 unsigned Size = RI.getRegSizeInBits(*RC);
800 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
801 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
802
803 // The rest of copyPhysReg assumes Src and Dst size are the same size.
804 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
805 // we remove Fix16BitCopies and this code block?
806 if (Fix16BitCopies) {
807 if (((Size == 16) != (SrcSize == 16))) {
808 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
810 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
811 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
812 RegToFix = SubReg;
813
814 if (DestReg == SrcReg) {
815 // Identity copy. Insert empty bundle since ExpandPostRA expects an
816 // instruction here.
817 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
818 return;
819 }
820 RC = RI.getPhysRegBaseClass(DestReg);
821 Size = RI.getRegSizeInBits(*RC);
822 SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 SrcSize = RI.getRegSizeInBits(*SrcRC);
824 }
825 }
826
827 if (RC == &AMDGPU::VGPR_32RegClass) {
828 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
829 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
830 AMDGPU::AGPR_32RegClass.contains(SrcReg));
831 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
832 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
833 BuildMI(MBB, MI, DL, get(Opc), DestReg)
834 .addReg(SrcReg, getKillRegState(KillSrc));
835 return;
836 }
837
838 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
839 RC == &AMDGPU::SReg_32RegClass) {
840 if (SrcReg == AMDGPU::SCC) {
841 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
842 .addImm(1)
843 .addImm(0);
844 return;
845 }
846
847 if (DestReg == AMDGPU::VCC_LO) {
848 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
849 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
850 .addReg(SrcReg, getKillRegState(KillSrc));
851 } else {
852 // FIXME: Hack until VReg_1 removed.
853 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
854 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
855 .addImm(0)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 }
858
859 return;
860 }
861
862 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
863 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
864 return;
865 }
866
867 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
868 .addReg(SrcReg, getKillRegState(KillSrc));
869 return;
870 }
871
872 if (RC == &AMDGPU::SReg_64RegClass) {
873 if (SrcReg == AMDGPU::SCC) {
874 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
875 .addImm(1)
876 .addImm(0);
877 return;
878 }
879
880 if (DestReg == AMDGPU::VCC) {
881 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
883 .addReg(SrcReg, getKillRegState(KillSrc));
884 } else {
885 // FIXME: Hack until VReg_1 removed.
886 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
887 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
888 .addImm(0)
889 .addReg(SrcReg, getKillRegState(KillSrc));
890 }
891
892 return;
893 }
894
895 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
896 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
897 return;
898 }
899
900 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
901 .addReg(SrcReg, getKillRegState(KillSrc));
902 return;
903 }
904
905 if (DestReg == AMDGPU::SCC) {
906 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
907 // but SelectionDAG emits such copies for i1 sources.
908 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
909 // This copy can only be produced by patterns
910 // with explicit SCC, which are known to be enabled
911 // only for subtargets with S_CMP_LG_U64 present.
912 assert(ST.hasScalarCompareEq64());
913 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
914 .addReg(SrcReg, getKillRegState(KillSrc))
915 .addImm(0);
916 } else {
917 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
918 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
919 .addReg(SrcReg, getKillRegState(KillSrc))
920 .addImm(0);
921 }
922
923 return;
924 }
925
926 if (RC == &AMDGPU::AGPR_32RegClass) {
927 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
928 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
929 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
930 .addReg(SrcReg, getKillRegState(KillSrc));
931 return;
932 }
933
934 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
935 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
936 .addReg(SrcReg, getKillRegState(KillSrc));
937 return;
938 }
939
940 // FIXME: Pass should maintain scavenger to avoid scan through the block on
941 // every AGPR spill.
942 RegScavenger RS;
943 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
944 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
945 return;
946 }
947
948 if (Size == 16) {
949 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
950 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
951 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
952
953 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
954 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
955 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
956 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
957 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
958 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
959 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
960 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
961
962 if (IsSGPRDst) {
963 if (!IsSGPRSrc) {
964 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
965 return;
966 }
967
968 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
969 .addReg(NewSrcReg, getKillRegState(KillSrc));
970 return;
971 }
972
973 if (IsAGPRDst || IsAGPRSrc) {
974 if (!DstLow || !SrcLow) {
975 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
976 "Cannot use hi16 subreg with an AGPR!");
977 }
978
979 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
980 return;
981 }
982
983 if (ST.hasTrue16BitInsts()) {
984 if (IsSGPRSrc) {
985 assert(SrcLow);
986 SrcReg = NewSrcReg;
987 }
988 // Use the smaller instruction encoding if possible.
989 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
990 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
991 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
992 .addReg(SrcReg);
993 } else {
994 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
995 .addImm(0) // src0_modifiers
996 .addReg(SrcReg)
997 .addImm(0); // op_sel
998 }
999 return;
1000 }
1001
1002 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1003 if (!DstLow || !SrcLow) {
1004 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1005 "Cannot use hi16 subreg on VI!");
1006 }
1007
1008 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1009 .addReg(NewSrcReg, getKillRegState(KillSrc));
1010 return;
1011 }
1012
1013 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1014 .addImm(0) // src0_modifiers
1015 .addReg(NewSrcReg)
1016 .addImm(0) // clamp
1017 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1018 : AMDGPU::SDWA::SdwaSel::WORD_1)
1019 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1020 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1021 : AMDGPU::SDWA::SdwaSel::WORD_1)
1022 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1023 // First implicit operand is $exec.
1024 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1025 return;
1026 }
1027
1028 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1029 if (ST.hasMovB64()) {
1030 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1031 .addReg(SrcReg, getKillRegState(KillSrc));
1032 return;
1033 }
1034 if (ST.hasPkMovB32()) {
1035 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1036 .addImm(SISrcMods::OP_SEL_1)
1037 .addReg(SrcReg)
1038 .addImm(SISrcMods::OP_SEL_1)
1039 .addReg(SrcReg)
1040 .addImm(0) // op_sel_lo
1041 .addImm(0) // op_sel_hi
1042 .addImm(0) // neg_lo
1043 .addImm(0) // neg_hi
1044 .addImm(0) // clamp
1045 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1046 return;
1047 }
1048 }
1049
1050 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1051 if (RI.isSGPRClass(RC)) {
1052 if (!RI.isSGPRClass(SrcRC)) {
1053 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1054 return;
1055 }
1056 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1057 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1058 Forward);
1059 return;
1060 }
1061
1062 unsigned EltSize = 4;
1063 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1064 if (RI.isAGPRClass(RC)) {
1065 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1066 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1067 else if (RI.hasVGPRs(SrcRC) ||
1068 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1069 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1070 else
1071 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1072 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1073 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1074 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1075 (RI.isProperlyAlignedRC(*RC) &&
1076 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1077 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1078 if (ST.hasMovB64()) {
1079 Opcode = AMDGPU::V_MOV_B64_e32;
1080 EltSize = 8;
1081 } else if (ST.hasPkMovB32()) {
1082 Opcode = AMDGPU::V_PK_MOV_B32;
1083 EltSize = 8;
1084 }
1085 }
1086
1087 // For the cases where we need an intermediate instruction/temporary register
1088 // (destination is an AGPR), we need a scavenger.
1089 //
1090 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1091 // whole block for every handled copy.
1092 std::unique_ptr<RegScavenger> RS;
1093 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1094 RS = std::make_unique<RegScavenger>();
1095
1096 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1097
1098 // If there is an overlap, we can't kill the super-register on the last
1099 // instruction, since it will also kill the components made live by this def.
1100 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1101 const bool CanKillSuperReg = KillSrc && !Overlap;
1102
1103 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1104 unsigned SubIdx;
1105 if (Forward)
1106 SubIdx = SubIndices[Idx];
1107 else
1108 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1109 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1110 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1111 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1112
1113 bool IsFirstSubreg = Idx == 0;
1114 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1115
1116 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1117 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1118 Register ImpUseSuper = SrcReg;
1119 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1120 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1121 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1122 MachineInstrBuilder MIB =
1123 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1124 .addImm(SISrcMods::OP_SEL_1)
1125 .addReg(SrcSubReg)
1126 .addImm(SISrcMods::OP_SEL_1)
1127 .addReg(SrcSubReg)
1128 .addImm(0) // op_sel_lo
1129 .addImm(0) // op_sel_hi
1130 .addImm(0) // neg_lo
1131 .addImm(0) // neg_hi
1132 .addImm(0) // clamp
1133 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1134 if (IsFirstSubreg)
1135 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1136 } else {
1137 MachineInstrBuilder Builder =
1138 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1139 if (IsFirstSubreg)
1140 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1141
1142 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1143 }
1144 }
1145}
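// An illustration of the generic sub-register path at the end of copyPhysReg,
// assuming a hypothetical 64-bit VGPR copy on a target without V_MOV_B64 or
// V_PK_MOV_B32: the register class is split with EltSize = 4, and two
// V_MOV_B32_e32 instructions are emitted, the first with an implicit def of
// the full destination and the last with the implicit (and, when safe, killed)
// use of the full source.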
1146
1147int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1148 int NewOpc;
1149
1150 // Try to map original to commuted opcode
1151 NewOpc = AMDGPU::getCommuteRev(Opcode);
1152 if (NewOpc != -1)
1153 // Check if the commuted (REV) opcode exists on the target.
1154 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1155
1156 // Try to map commuted to original opcode
1157 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1158 if (NewOpc != -1)
1159 // Check if the original (non-REV) opcode exists on the target.
1160 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1161
1162 return Opcode;
1163}
1164
1165void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1166 MachineBasicBlock::iterator MI,
1167 const DebugLoc &DL, Register DestReg,
1168 int64_t Value) const {
1169 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1170 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1171 if (RegClass == &AMDGPU::SReg_32RegClass ||
1172 RegClass == &AMDGPU::SGPR_32RegClass ||
1173 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1174 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1175 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1176 .addImm(Value);
1177 return;
1178 }
1179
1180 if (RegClass == &AMDGPU::SReg_64RegClass ||
1181 RegClass == &AMDGPU::SGPR_64RegClass ||
1182 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1183 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1184 .addImm(Value);
1185 return;
1186 }
1187
1188 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1189 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1190 .addImm(Value);
1191 return;
1192 }
1193 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1194 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1195 .addImm(Value);
1196 return;
1197 }
1198
1199 unsigned EltSize = 4;
1200 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1201 if (RI.isSGPRClass(RegClass)) {
1202 if (RI.getRegSizeInBits(*RegClass) > 32) {
1203 Opcode = AMDGPU::S_MOV_B64;
1204 EltSize = 8;
1205 } else {
1206 Opcode = AMDGPU::S_MOV_B32;
1207 EltSize = 4;
1208 }
1209 }
1210
1211 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1212 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1213 int64_t IdxValue = Idx == 0 ? Value : 0;
1214
1215 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1216 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1217 Builder.addImm(IdxValue);
1218 }
1219}
1220
1221const TargetRegisterClass *
1222SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1223 return &AMDGPU::VGPR_32RegClass;
1224}
1225
1226void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1227 MachineBasicBlock::iterator I,
1228 const DebugLoc &DL, Register DstReg,
1229 ArrayRef<MachineOperand> Cond,
1230 Register TrueReg,
1231 Register FalseReg) const {
1232 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1233 const TargetRegisterClass *BoolXExecRC =
1234 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1235 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1236 "Not a VGPR32 reg");
1237
1238 if (Cond.size() == 1) {
1239 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1240 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1241 .add(Cond[0]);
1242 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1243 .addImm(0)
1244 .addReg(FalseReg)
1245 .addImm(0)
1246 .addReg(TrueReg)
1247 .addReg(SReg);
1248 } else if (Cond.size() == 2) {
1249 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1250 switch (Cond[0].getImm()) {
1251 case SIInstrInfo::SCC_TRUE: {
1252 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1253 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1254 : AMDGPU::S_CSELECT_B64), SReg)
1255 .addImm(1)
1256 .addImm(0);
1257 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1258 .addImm(0)
1259 .addReg(FalseReg)
1260 .addImm(0)
1261 .addReg(TrueReg)
1262 .addReg(SReg);
1263 break;
1264 }
1265 case SIInstrInfo::SCC_FALSE: {
1266 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1267 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1268 : AMDGPU::S_CSELECT_B64), SReg)
1269 .addImm(0)
1270 .addImm(1);
1271 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1272 .addImm(0)
1273 .addReg(FalseReg)
1274 .addImm(0)
1275 .addReg(TrueReg)
1276 .addReg(SReg);
1277 break;
1278 }
1279 case SIInstrInfo::VCCNZ: {
1280 MachineOperand RegOp = Cond[1];
1281 RegOp.setImplicit(false);
1282 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1283 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1284 .add(RegOp);
1285 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1286 .addImm(0)
1287 .addReg(FalseReg)
1288 .addImm(0)
1289 .addReg(TrueReg)
1290 .addReg(SReg);
1291 break;
1292 }
1293 case SIInstrInfo::VCCZ: {
1294 MachineOperand RegOp = Cond[1];
1295 RegOp.setImplicit(false);
1296 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1297 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1298 .add(RegOp);
1299 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1300 .addImm(0)
1301 .addReg(TrueReg)
1302 .addImm(0)
1303 .addReg(FalseReg)
1304 .addReg(SReg);
1305 break;
1306 }
1307 case SIInstrInfo::EXECNZ: {
1308 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1309 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1310 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1311 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1312 .addImm(0);
1313 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1314 : AMDGPU::S_CSELECT_B64), SReg)
1315 .addImm(1)
1316 .addImm(0);
1317 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1318 .addImm(0)
1319 .addReg(FalseReg)
1320 .addImm(0)
1321 .addReg(TrueReg)
1322 .addReg(SReg);
1323 break;
1324 }
1325 case SIInstrInfo::EXECZ: {
1326 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1327 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1328 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1329 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1330 .addImm(0);
1331 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1332 : AMDGPU::S_CSELECT_B64), SReg)
1333 .addImm(0)
1334 .addImm(1);
1335 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1336 .addImm(0)
1337 .addReg(FalseReg)
1338 .addImm(0)
1339 .addReg(TrueReg)
1340 .addReg(SReg);
1341 llvm_unreachable("Unhandled branch predicate EXECZ");
1342 break;
1343 }
1344 default:
1345 llvm_unreachable("invalid branch predicate");
1346 }
1347 } else {
1348 llvm_unreachable("Can only handle Cond size 1 or 2");
1349 }
1350}
1351
1352Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1353 MachineBasicBlock::iterator I,
1354 const DebugLoc &DL,
1355 Register SrcReg, int Value) const {
1356 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1357 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1358 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1359 .addImm(Value)
1360 .addReg(SrcReg);
1361
1362 return Reg;
1363}
1364
1365Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1366 MachineBasicBlock::iterator I,
1367 const DebugLoc &DL,
1368 Register SrcReg, int Value) const {
1369 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1370 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1371 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1372 .addImm(Value)
1373 .addReg(SrcReg);
1374
1375 return Reg;
1376}
1377
1378unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1379
1380 if (RI.isAGPRClass(DstRC))
1381 return AMDGPU::COPY;
1382 if (RI.getRegSizeInBits(*DstRC) == 16) {
1383 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1384 // before RA.
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1386 }
1387 if (RI.getRegSizeInBits(*DstRC) == 32)
1388 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1389 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1390 return AMDGPU::S_MOV_B64;
1391 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1392 return AMDGPU::V_MOV_B64_PSEUDO;
1393 return AMDGPU::COPY;
1394}
1395
1396const MCInstrDesc &
1397SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1398 bool IsIndirectSrc) const {
1399 if (IsIndirectSrc) {
1400 if (VecSize <= 32) // 4 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1402 if (VecSize <= 64) // 8 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1404 if (VecSize <= 96) // 12 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1406 if (VecSize <= 128) // 16 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1408 if (VecSize <= 160) // 20 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1410 if (VecSize <= 256) // 32 bytes
1411 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1412 if (VecSize <= 288) // 36 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1414 if (VecSize <= 320) // 40 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1416 if (VecSize <= 352) // 44 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1418 if (VecSize <= 384) // 48 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1420 if (VecSize <= 512) // 64 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1422 if (VecSize <= 1024) // 128 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1424
1425 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1426 }
1427
1428 if (VecSize <= 32) // 4 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1430 if (VecSize <= 64) // 8 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1432 if (VecSize <= 96) // 12 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1434 if (VecSize <= 128) // 16 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1436 if (VecSize <= 160) // 20 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1438 if (VecSize <= 256) // 32 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1440 if (VecSize <= 288) // 36 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1442 if (VecSize <= 320) // 40 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1444 if (VecSize <= 352) // 44 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1446 if (VecSize <= 384) // 48 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1448 if (VecSize <= 512) // 64 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1450 if (VecSize <= 1024) // 128 bytes
1451 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1452
1453 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1454}
1455
1456static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1457 if (VecSize <= 32) // 4 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1459 if (VecSize <= 64) // 8 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1461 if (VecSize <= 96) // 12 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1463 if (VecSize <= 128) // 16 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1465 if (VecSize <= 160) // 20 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1467 if (VecSize <= 256) // 32 bytes
1468 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1469 if (VecSize <= 288) // 36 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1471 if (VecSize <= 320) // 40 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1473 if (VecSize <= 352) // 44 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1475 if (VecSize <= 384) // 48 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1477 if (VecSize <= 512) // 64 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1479 if (VecSize <= 1024) // 128 bytes
1480 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1481
1482 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1483}
1484
1485static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1486 if (VecSize <= 32) // 4 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1488 if (VecSize <= 64) // 8 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1490 if (VecSize <= 96) // 12 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1492 if (VecSize <= 128) // 16 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1494 if (VecSize <= 160) // 20 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1496 if (VecSize <= 256) // 32 bytes
1497 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1498 if (VecSize <= 288) // 36 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1500 if (VecSize <= 320) // 40 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1502 if (VecSize <= 352) // 44 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1504 if (VecSize <= 384) // 48 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1506 if (VecSize <= 512) // 64 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1508 if (VecSize <= 1024) // 128 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1510
1511 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1512}
1513
1514static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1515 if (VecSize <= 64) // 8 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1517 if (VecSize <= 128) // 16 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1519 if (VecSize <= 256) // 32 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1521 if (VecSize <= 512) // 64 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1523 if (VecSize <= 1024) // 128 bytes
1524 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1525
1526 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1527}
1528
1529const MCInstrDesc &
1530SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1531 bool IsSGPR) const {
1532 if (IsSGPR) {
1533 switch (EltSize) {
1534 case 32:
1535 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1536 case 64:
1537 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1538 default:
1539 llvm_unreachable("invalid reg indexing elt size");
1540 }
1541 }
1542
1543 assert(EltSize == 32 && "invalid reg indexing elt size");
1544 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1545}
1546
1547static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1548 switch (Size) {
1549 case 4:
1550 return AMDGPU::SI_SPILL_S32_SAVE;
1551 case 8:
1552 return AMDGPU::SI_SPILL_S64_SAVE;
1553 case 12:
1554 return AMDGPU::SI_SPILL_S96_SAVE;
1555 case 16:
1556 return AMDGPU::SI_SPILL_S128_SAVE;
1557 case 20:
1558 return AMDGPU::SI_SPILL_S160_SAVE;
1559 case 24:
1560 return AMDGPU::SI_SPILL_S192_SAVE;
1561 case 28:
1562 return AMDGPU::SI_SPILL_S224_SAVE;
1563 case 32:
1564 return AMDGPU::SI_SPILL_S256_SAVE;
1565 case 36:
1566 return AMDGPU::SI_SPILL_S288_SAVE;
1567 case 40:
1568 return AMDGPU::SI_SPILL_S320_SAVE;
1569 case 44:
1570 return AMDGPU::SI_SPILL_S352_SAVE;
1571 case 48:
1572 return AMDGPU::SI_SPILL_S384_SAVE;
1573 case 64:
1574 return AMDGPU::SI_SPILL_S512_SAVE;
1575 case 128:
1576 return AMDGPU::SI_SPILL_S1024_SAVE;
1577 default:
1578 llvm_unreachable("unknown register size");
1579 }
1580}
1581
1582static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1583 switch (Size) {
1584 case 4:
1585 return AMDGPU::SI_SPILL_V32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_V64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_V96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_V128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_V160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_V192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_V224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_V256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_V288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_V320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_V352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_V384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_V512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_V1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1614 }
1615}
1616
1617static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 4:
1620 return AMDGPU::SI_SPILL_A32_SAVE;
1621 case 8:
1622 return AMDGPU::SI_SPILL_A64_SAVE;
1623 case 12:
1624 return AMDGPU::SI_SPILL_A96_SAVE;
1625 case 16:
1626 return AMDGPU::SI_SPILL_A128_SAVE;
1627 case 20:
1628 return AMDGPU::SI_SPILL_A160_SAVE;
1629 case 24:
1630 return AMDGPU::SI_SPILL_A192_SAVE;
1631 case 28:
1632 return AMDGPU::SI_SPILL_A224_SAVE;
1633 case 32:
1634 return AMDGPU::SI_SPILL_A256_SAVE;
1635 case 36:
1636 return AMDGPU::SI_SPILL_A288_SAVE;
1637 case 40:
1638 return AMDGPU::SI_SPILL_A320_SAVE;
1639 case 44:
1640 return AMDGPU::SI_SPILL_A352_SAVE;
1641 case 48:
1642 return AMDGPU::SI_SPILL_A384_SAVE;
1643 case 64:
1644 return AMDGPU::SI_SPILL_A512_SAVE;
1645 case 128:
1646 return AMDGPU::SI_SPILL_A1024_SAVE;
1647 default:
1648 llvm_unreachable("unknown register size");
1649 }
1650}
1651
1652static unsigned getAVSpillSaveOpcode(unsigned Size) {
1653 switch (Size) {
1654 case 4:
1655 return AMDGPU::SI_SPILL_AV32_SAVE;
1656 case 8:
1657 return AMDGPU::SI_SPILL_AV64_SAVE;
1658 case 12:
1659 return AMDGPU::SI_SPILL_AV96_SAVE;
1660 case 16:
1661 return AMDGPU::SI_SPILL_AV128_SAVE;
1662 case 20:
1663 return AMDGPU::SI_SPILL_AV160_SAVE;
1664 case 24:
1665 return AMDGPU::SI_SPILL_AV192_SAVE;
1666 case 28:
1667 return AMDGPU::SI_SPILL_AV224_SAVE;
1668 case 32:
1669 return AMDGPU::SI_SPILL_AV256_SAVE;
1670 case 36:
1671 return AMDGPU::SI_SPILL_AV288_SAVE;
1672 case 40:
1673 return AMDGPU::SI_SPILL_AV320_SAVE;
1674 case 44:
1675 return AMDGPU::SI_SPILL_AV352_SAVE;
1676 case 48:
1677 return AMDGPU::SI_SPILL_AV384_SAVE;
1678 case 64:
1679 return AMDGPU::SI_SPILL_AV512_SAVE;
1680 case 128:
1681 return AMDGPU::SI_SPILL_AV1024_SAVE;
1682 default:
1683 llvm_unreachable("unknown register size");
1684 }
1685}
1686
1687static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1688 bool IsVectorSuperClass) {
1689 // Currently, there is only 32-bit WWM register spills needed.
1690 if (Size != 4)
1691 llvm_unreachable("unknown wwm register spill size");
1692
1693 if (IsVectorSuperClass)
1694 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1695
1696 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1697}
1698
1699static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1700 const TargetRegisterClass *RC,
1701 unsigned Size,
1702 const SIRegisterInfo &TRI,
1703 const SIMachineFunctionInfo &MFI) {
1704 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1705
1706 // Choose the right opcode if spilling a WWM register.
1707 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1708 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1709
1710 if (IsVectorSuperClass)
1711 return getAVSpillSaveOpcode(Size);
1712
1713 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1714 : getVGPRSpillSaveOpcode(Size);
1715}
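// Example of the selection above, assuming a hypothetical spill of a plain
// 32-bit VGPR that is not flagged as a WWM register: IsVectorSuperClass and
// TRI.isAGPRClass(RC) are both false, so the helper falls through to
// getVGPRSpillSaveOpcode(4), i.e. SI_SPILL_V32_SAVE.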
1716
1717void SIInstrInfo::storeRegToStackSlot(
1718 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1719 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1720 const TargetRegisterInfo *TRI, Register VReg) const {
1721 MachineFunction *MF = MBB.getParent();
1722 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1723 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1724 const DebugLoc &DL = MBB.findDebugLoc(MI);
1725
1726 MachinePointerInfo PtrInfo
1727 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1728 MachineMemOperand *MMO = MF->getMachineMemOperand(
1729 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1730 FrameInfo.getObjectAlign(FrameIndex));
1731 unsigned SpillSize = TRI->getSpillSize(*RC);
1732
1734 if (RI.isSGPRClass(RC)) {
1735 MFI->setHasSpilledSGPRs();
1736 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1737 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1738 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1739
1740 // We are only allowed to create one new instruction when spilling
1741 // registers, so we need to use pseudo instruction for spilling SGPRs.
1742 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1743
1744 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1745 // to make sure we are using the correct register class.
1746 if (SrcReg.isVirtual() && SpillSize == 4) {
1747 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1748 }
1749
1750 BuildMI(MBB, MI, DL, OpDesc)
1751 .addReg(SrcReg, getKillRegState(isKill)) // data
1752 .addFrameIndex(FrameIndex) // addr
1753 .addMemOperand(MMO)
1754 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1755
1756 if (RI.spillSGPRToVGPR())
1757 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1758 return;
1759 }
1760
1761 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1762 SpillSize, RI, *MFI);
1763 MFI->setHasSpilledVGPRs();
1764
1765 BuildMI(MBB, MI, DL, get(Opcode))
1766 .addReg(SrcReg, getKillRegState(isKill)) // data
1767 .addFrameIndex(FrameIndex) // addr
1768 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1769 .addImm(0) // offset
1770 .addMemOperand(MMO);
1771}
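// Example (illustrative; register, frame index, and memory-operand syntax are
// placeholders): spilling a killed 32-bit VGPR would emit something like
//   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0 :: (store (s32) ...)
// These SI_SPILL_* opcodes are placeholders; later spill/frame lowering turns
// them into real scratch stores (or VGPR lane writes for SGPR spills).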
1772
1773static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1774 switch (Size) {
1775 case 4:
1776 return AMDGPU::SI_SPILL_S32_RESTORE;
1777 case 8:
1778 return AMDGPU::SI_SPILL_S64_RESTORE;
1779 case 12:
1780 return AMDGPU::SI_SPILL_S96_RESTORE;
1781 case 16:
1782 return AMDGPU::SI_SPILL_S128_RESTORE;
1783 case 20:
1784 return AMDGPU::SI_SPILL_S160_RESTORE;
1785 case 24:
1786 return AMDGPU::SI_SPILL_S192_RESTORE;
1787 case 28:
1788 return AMDGPU::SI_SPILL_S224_RESTORE;
1789 case 32:
1790 return AMDGPU::SI_SPILL_S256_RESTORE;
1791 case 36:
1792 return AMDGPU::SI_SPILL_S288_RESTORE;
1793 case 40:
1794 return AMDGPU::SI_SPILL_S320_RESTORE;
1795 case 44:
1796 return AMDGPU::SI_SPILL_S352_RESTORE;
1797 case 48:
1798 return AMDGPU::SI_SPILL_S384_RESTORE;
1799 case 64:
1800 return AMDGPU::SI_SPILL_S512_RESTORE;
1801 case 128:
1802 return AMDGPU::SI_SPILL_S1024_RESTORE;
1803 default:
1804 llvm_unreachable("unknown register size");
1805 }
1806}
1807
1808static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1809 switch (Size) {
1810 case 4:
1811 return AMDGPU::SI_SPILL_V32_RESTORE;
1812 case 8:
1813 return AMDGPU::SI_SPILL_V64_RESTORE;
1814 case 12:
1815 return AMDGPU::SI_SPILL_V96_RESTORE;
1816 case 16:
1817 return AMDGPU::SI_SPILL_V128_RESTORE;
1818 case 20:
1819 return AMDGPU::SI_SPILL_V160_RESTORE;
1820 case 24:
1821 return AMDGPU::SI_SPILL_V192_RESTORE;
1822 case 28:
1823 return AMDGPU::SI_SPILL_V224_RESTORE;
1824 case 32:
1825 return AMDGPU::SI_SPILL_V256_RESTORE;
1826 case 36:
1827 return AMDGPU::SI_SPILL_V288_RESTORE;
1828 case 40:
1829 return AMDGPU::SI_SPILL_V320_RESTORE;
1830 case 44:
1831 return AMDGPU::SI_SPILL_V352_RESTORE;
1832 case 48:
1833 return AMDGPU::SI_SPILL_V384_RESTORE;
1834 case 64:
1835 return AMDGPU::SI_SPILL_V512_RESTORE;
1836 case 128:
1837 return AMDGPU::SI_SPILL_V1024_RESTORE;
1838 default:
1839 llvm_unreachable("unknown register size");
1840 }
1841}
1842
1843static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1844 switch (Size) {
1845 case 4:
1846 return AMDGPU::SI_SPILL_A32_RESTORE;
1847 case 8:
1848 return AMDGPU::SI_SPILL_A64_RESTORE;
1849 case 12:
1850 return AMDGPU::SI_SPILL_A96_RESTORE;
1851 case 16:
1852 return AMDGPU::SI_SPILL_A128_RESTORE;
1853 case 20:
1854 return AMDGPU::SI_SPILL_A160_RESTORE;
1855 case 24:
1856 return AMDGPU::SI_SPILL_A192_RESTORE;
1857 case 28:
1858 return AMDGPU::SI_SPILL_A224_RESTORE;
1859 case 32:
1860 return AMDGPU::SI_SPILL_A256_RESTORE;
1861 case 36:
1862 return AMDGPU::SI_SPILL_A288_RESTORE;
1863 case 40:
1864 return AMDGPU::SI_SPILL_A320_RESTORE;
1865 case 44:
1866 return AMDGPU::SI_SPILL_A352_RESTORE;
1867 case 48:
1868 return AMDGPU::SI_SPILL_A384_RESTORE;
1869 case 64:
1870 return AMDGPU::SI_SPILL_A512_RESTORE;
1871 case 128:
1872 return AMDGPU::SI_SPILL_A1024_RESTORE;
1873 default:
1874 llvm_unreachable("unknown register size");
1875 }
1876}
1877
1878static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1879 switch (Size) {
1880 case 4:
1881 return AMDGPU::SI_SPILL_AV32_RESTORE;
1882 case 8:
1883 return AMDGPU::SI_SPILL_AV64_RESTORE;
1884 case 12:
1885 return AMDGPU::SI_SPILL_AV96_RESTORE;
1886 case 16:
1887 return AMDGPU::SI_SPILL_AV128_RESTORE;
1888 case 20:
1889 return AMDGPU::SI_SPILL_AV160_RESTORE;
1890 case 24:
1891 return AMDGPU::SI_SPILL_AV192_RESTORE;
1892 case 28:
1893 return AMDGPU::SI_SPILL_AV224_RESTORE;
1894 case 32:
1895 return AMDGPU::SI_SPILL_AV256_RESTORE;
1896 case 36:
1897 return AMDGPU::SI_SPILL_AV288_RESTORE;
1898 case 40:
1899 return AMDGPU::SI_SPILL_AV320_RESTORE;
1900 case 44:
1901 return AMDGPU::SI_SPILL_AV352_RESTORE;
1902 case 48:
1903 return AMDGPU::SI_SPILL_AV384_RESTORE;
1904 case 64:
1905 return AMDGPU::SI_SPILL_AV512_RESTORE;
1906 case 128:
1907 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1908 default:
1909 llvm_unreachable("unknown register size");
1910 }
1911}
1912
1913static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1914 bool IsVectorSuperClass) {
1915 // Currently, only 32-bit WWM register spills are needed.
1916 if (Size != 4)
1917 llvm_unreachable("unknown wwm register spill size");
1918
1919 if (IsVectorSuperClass)
1920 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1921
1922 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1923}
1924
1925static unsigned
1926getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1927 unsigned Size, const SIRegisterInfo &TRI,
1928 const SIMachineFunctionInfo &MFI) {
1929 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1930
1931 // Choose the right opcode if restoring a WWM register.
1932 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1933 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1934
1935 if (IsVectorSuperClass)
1936 return getAVSpillRestoreOpcode(Size);
1937
1938 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1939 : getVGPRSpillRestoreOpcode(Size);
1940}
1941
1942void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1943 MachineBasicBlock::iterator MI,
1944 Register DestReg, int FrameIndex,
1945 const TargetRegisterClass *RC,
1946 const TargetRegisterInfo *TRI,
1947 Register VReg) const {
1948 MachineFunction *MF = MBB.getParent();
1949 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1950 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1951 const DebugLoc &DL = MBB.findDebugLoc(MI);
1952 unsigned SpillSize = TRI->getSpillSize(*RC);
1953
1954 MachinePointerInfo PtrInfo
1955 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1956
1957 MachineMemOperand *MMO = MF->getMachineMemOperand(
1958 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1959 FrameInfo.getObjectAlign(FrameIndex));
1960
1961 if (RI.isSGPRClass(RC)) {
1962 MFI->setHasSpilledSGPRs();
1963 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1964 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1965 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1966
1967 // FIXME: Maybe this should not include a memoperand because it will be
1968 // lowered to non-memory instructions.
1969 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1970 if (DestReg.isVirtual() && SpillSize == 4) {
1971 MachineRegisterInfo &MRI = MF->getRegInfo();
1972 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1973 }
1974
1975 if (RI.spillSGPRToVGPR())
1976 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1977 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1978 .addFrameIndex(FrameIndex) // addr
1979 .addMemOperand(MMO)
1980 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1981
1982 return;
1983 }
1984
1985 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1986 SpillSize, RI, *MFI);
1987 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1988 .addFrameIndex(FrameIndex) // vaddr
1989 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1990 .addImm(0) // offset
1991 .addMemOperand(MMO);
1992}
1993
1994void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1995 MachineBasicBlock::iterator MI) const {
1996 insertNoops(MBB, MI, 1);
1997}
1998
1999void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2000 MachineBasicBlock::iterator MI,
2001 unsigned Quantity) const {
2002 DebugLoc DL;
2003 while (Quantity > 0) {
2004 unsigned Arg = std::min(Quantity, 8u);
2005 Quantity -= Arg;
2006 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2007 }
2008}
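// Example (illustrative): S_NOP's immediate encodes "wait states minus one"
// and a single S_NOP covers at most 8, which is why the loop above clamps each
// chunk to 8. So insertNoops(MBB, MI, 10) emits roughly:
//   S_NOP 7   ; 8 wait states
//   S_NOP 1   ; 2 wait states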
2009
2010void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2011 auto MF = MBB.getParent();
2012 auto Info = MF->getInfo<SIMachineFunctionInfo>();
2013
2014 assert(Info->isEntryFunction());
2015
2016 if (MBB.succ_empty()) {
2017 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2018 if (HasNoTerminator) {
2019 if (Info->returnsVoid()) {
2020 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2021 } else {
2022 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2023 }
2024 }
2025 }
2026}
2027
2028MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2029 MachineBasicBlock &MBB,
2030 MachineInstr &MI,
2031 const DebugLoc &DL) const {
2032 MachineFunction *MF = MBB.getParent();
2033 constexpr unsigned DoorbellIDMask = 0x3ff;
2034 constexpr unsigned ECQueueWaveAbort = 0x400;
2035
2036 MachineBasicBlock *TrapBB = &MBB;
2037 MachineBasicBlock *ContBB = &MBB;
2038 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2039
2040 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2041 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2042 TrapBB = MF->CreateMachineBasicBlock();
2043 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2044 MF->push_back(TrapBB);
2045 MBB.addSuccessor(TrapBB);
2046 }
2047
2048 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2049 // this will be a nop.
2050 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2051 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2052 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2053 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2054 DoorbellReg)
2055 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2056 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2057 .addUse(AMDGPU::M0);
2058 Register DoorbellRegMasked =
2059 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2060 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2061 .addUse(DoorbellReg)
2062 .addImm(DoorbellIDMask);
2063 Register SetWaveAbortBit =
2064 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2065 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2066 .addUse(DoorbellRegMasked)
2067 .addImm(ECQueueWaveAbort);
2068 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2069 .addUse(SetWaveAbortBit);
2070 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2071 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2072 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2073 .addUse(AMDGPU::TTMP2);
2074 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2075 TrapBB->addSuccessor(HaltLoopBB);
2076
2077 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2078 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2079 .addMBB(HaltLoopBB);
2080 MF->push_back(HaltLoopBB);
2081 HaltLoopBB->addSuccessor(HaltLoopBB);
2082
2083 return ContBB;
2084}
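// Sketch of the control flow built above (illustrative summary): the original
// block conditionally branches to a trap block that raises s_trap, reads the
// queue doorbell ID with s_sendmsg_rtn, sets the wave-abort bit, signals the
// interrupt, and then branches into a block that spins forever on s_sethalt;
// the returned block is the continuation that non-trapping code falls into.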
2085
2086unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2087 switch (MI.getOpcode()) {
2088 default:
2089 if (MI.isMetaInstruction())
2090 return 0;
2091 return 1; // FIXME: Do wait states equal cycles?
2092
2093 case AMDGPU::S_NOP:
2094 return MI.getOperand(0).getImm() + 1;
2095 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2096 // hazard, even if one exists, won't really be visible. Should we handle it?
2097 }
2098}
2099
2100bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2101 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2102 MachineBasicBlock &MBB = *MI.getParent();
2103 DebugLoc DL = MBB.findDebugLoc(MI);
2104 switch (MI.getOpcode()) {
2105 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2106 case AMDGPU::S_MOV_B64_term:
2107 // This is only a terminator to get the correct spill code placement during
2108 // register allocation.
2109 MI.setDesc(get(AMDGPU::S_MOV_B64));
2110 break;
2111
2112 case AMDGPU::S_MOV_B32_term:
2113 // This is only a terminator to get the correct spill code placement during
2114 // register allocation.
2115 MI.setDesc(get(AMDGPU::S_MOV_B32));
2116 break;
2117
2118 case AMDGPU::S_XOR_B64_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(AMDGPU::S_XOR_B64));
2122 break;
2123
2124 case AMDGPU::S_XOR_B32_term:
2125 // This is only a terminator to get the correct spill code placement during
2126 // register allocation.
2127 MI.setDesc(get(AMDGPU::S_XOR_B32));
2128 break;
2129 case AMDGPU::S_OR_B64_term:
2130 // This is only a terminator to get the correct spill code placement during
2131 // register allocation.
2132 MI.setDesc(get(AMDGPU::S_OR_B64));
2133 break;
2134 case AMDGPU::S_OR_B32_term:
2135 // This is only a terminator to get the correct spill code placement during
2136 // register allocation.
2137 MI.setDesc(get(AMDGPU::S_OR_B32));
2138 break;
2139
2140 case AMDGPU::S_ANDN2_B64_term:
2141 // This is only a terminator to get the correct spill code placement during
2142 // register allocation.
2143 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2144 break;
2145
2146 case AMDGPU::S_ANDN2_B32_term:
2147 // This is only a terminator to get the correct spill code placement during
2148 // register allocation.
2149 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2150 break;
2151
2152 case AMDGPU::S_AND_B64_term:
2153 // This is only a terminator to get the correct spill code placement during
2154 // register allocation.
2155 MI.setDesc(get(AMDGPU::S_AND_B64));
2156 break;
2157
2158 case AMDGPU::S_AND_B32_term:
2159 // This is only a terminator to get the correct spill code placement during
2160 // register allocation.
2161 MI.setDesc(get(AMDGPU::S_AND_B32));
2162 break;
2163
2164 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2165 // This is only a terminator to get the correct spill code placement during
2166 // register allocation.
2167 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2168 break;
2169
2170 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2171 // This is only a terminator to get the correct spill code placement during
2172 // register allocation.
2173 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2174 break;
2175
2176 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2177 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2178 break;
2179
2180 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2181 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2182 break;
2183
2184 case AMDGPU::V_MOV_B64_PSEUDO: {
2185 Register Dst = MI.getOperand(0).getReg();
2186 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2187 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2188
2189 const MachineOperand &SrcOp = MI.getOperand(1);
2190 // FIXME: Will this work for 64-bit floating point immediates?
2191 assert(!SrcOp.isFPImm());
2192 if (ST.hasMovB64()) {
2193 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2194 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2195 isUInt<32>(SrcOp.getImm()))
2196 break;
2197 }
2198 if (SrcOp.isImm()) {
2199 APInt Imm(64, SrcOp.getImm());
2200 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2201 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2202 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2203 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2204 .addImm(SISrcMods::OP_SEL_1)
2205 .addImm(Lo.getSExtValue())
2206 .addImm(SISrcMods::OP_SEL_1)
2207 .addImm(Lo.getSExtValue())
2208 .addImm(0) // op_sel_lo
2209 .addImm(0) // op_sel_hi
2210 .addImm(0) // neg_lo
2211 .addImm(0) // neg_hi
2212 .addImm(0); // clamp
2213 } else {
2214 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2215 .addImm(Lo.getSExtValue())
2216 .addReg(Dst, RegState::Implicit | RegState::Define);
2217 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2218 .addImm(Hi.getSExtValue())
2219 .addReg(Dst, RegState::Implicit | RegState::Define);
2220 }
2221 } else {
2222 assert(SrcOp.isReg());
2223 if (ST.hasPkMovB32() &&
2224 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2225 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2226 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2227 .addReg(SrcOp.getReg())
2228 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2229 .addReg(SrcOp.getReg())
2230 .addImm(0) // op_sel_lo
2231 .addImm(0) // op_sel_hi
2232 .addImm(0) // neg_lo
2233 .addImm(0) // neg_hi
2234 .addImm(0); // clamp
2235 } else {
2236 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2237 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2238 .addReg(Dst, RegState::Implicit | RegState::Define);
2239 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2240 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2241 .addReg(Dst, RegState::Implicit | RegState::Define);
2242 }
2243 }
2244 MI.eraseFromParent();
2245 break;
2246 }
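 // Example (illustrative, arbitrary registers): without a native 64-bit move,
 // V_MOV_B64_PSEUDO $vgpr0_vgpr1, 0x0000000100000001 can become a single
 // V_PK_MOV_B32 (both halves are the same inline constant), while a general
 // 64-bit literal is split into two V_MOV_B32_e32 of the low and high halves.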
2247 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2248 expandMovDPP64(MI);
2249 break;
2250 }
2251 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2252 const MachineOperand &SrcOp = MI.getOperand(1);
2253 assert(!SrcOp.isFPImm());
2254 APInt Imm(64, SrcOp.getImm());
2255 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2256 MI.setDesc(get(AMDGPU::S_MOV_B64));
2257 break;
2258 }
2259
2260 Register Dst = MI.getOperand(0).getReg();
2261 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2262 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2263
2264 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2265 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2266 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2267 .addImm(Lo.getSExtValue())
2268 .addReg(Dst, RegState::Implicit | RegState::Define);
2269 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2270 .addImm(Hi.getSExtValue())
2271 .addReg(Dst, RegState::Implicit | RegState::Define);
2272 MI.eraseFromParent();
2273 break;
2274 }
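 // Example (illustrative): 0x1234567800000005 is neither a 32-bit value nor an
 // inline constant, so it is split here into
 //   S_MOV_B32 dst.sub0, 0x00000005
 //   S_MOV_B32 dst.sub1, 0x12345678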
2275 case AMDGPU::V_SET_INACTIVE_B32: {
2276 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2277 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2278 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
2279 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
2280 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2281 .add(MI.getOperand(1));
2282 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2283 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2284 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2285 .add(MI.getOperand(2));
2286 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2287 .addReg(Exec);
2288 MI.eraseFromParent();
2289 break;
2290 }
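 // How the V_SET_INACTIVE_* expansions above work (illustrative summary):
 // write the "active" value under the current exec mask, invert exec, write
 // the "inactive" value into the lanes that were masked off, then invert exec
 // back, leaving operand 1 in active lanes and operand 2 in inactive lanes.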
2291 case AMDGPU::V_SET_INACTIVE_B64: {
2292 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2293 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2294 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2295 MI.getOperand(0).getReg())
2296 .add(MI.getOperand(1));
2297 expandPostRAPseudo(*Copy);
2298 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2299 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2300 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2301 MI.getOperand(0).getReg())
2302 .add(MI.getOperand(2));
2303 expandPostRAPseudo(*Copy);
2304 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2305 .addReg(Exec);
2306 MI.eraseFromParent();
2307 break;
2308 }
2309 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2332 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2333 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2334 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2335 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2336 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2337 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2338 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2339
2340 unsigned Opc;
2341 if (RI.hasVGPRs(EltRC)) {
2342 Opc = AMDGPU::V_MOVRELD_B32_e32;
2343 } else {
2344 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2345 : AMDGPU::S_MOVRELD_B32;
2346 }
2347
2348 const MCInstrDesc &OpDesc = get(Opc);
2349 Register VecReg = MI.getOperand(0).getReg();
2350 bool IsUndef = MI.getOperand(1).isUndef();
2351 unsigned SubReg = MI.getOperand(3).getImm();
2352 assert(VecReg == MI.getOperand(1).getReg());
2353
2354 MachineInstrBuilder MIB =
2355 BuildMI(MBB, MI, DL, OpDesc)
2356 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2357 .add(MI.getOperand(2))
2358 .addReg(VecReg, RegState::ImplicitDefine)
2359 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2360
2361 const int ImpDefIdx =
2362 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2363 const int ImpUseIdx = ImpDefIdx + 1;
2364 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2365 MI.eraseFromParent();
2366 break;
2367 }
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2374 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2375 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2376 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2377 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2378 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2379 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2381 Register VecReg = MI.getOperand(0).getReg();
2382 bool IsUndef = MI.getOperand(1).isUndef();
2383 Register Idx = MI.getOperand(3).getReg();
2384 Register SubReg = MI.getOperand(4).getImm();
2385
2386 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2387 .addReg(Idx)
2388 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2389 SetOn->getOperand(3).setIsUndef();
2390
2391 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2392 MachineInstrBuilder MIB =
2393 BuildMI(MBB, MI, DL, OpDesc)
2394 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2395 .add(MI.getOperand(2))
2396 .addReg(VecReg, RegState::ImplicitDefine)
2397 .addReg(VecReg,
2398 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2399
2400 const int ImpDefIdx =
2401 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2402 const int ImpUseIdx = ImpDefIdx + 1;
2403 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2404
2405 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2406
2407 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2408
2409 MI.eraseFromParent();
2410 break;
2411 }
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2418 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2419 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2420 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2421 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2422 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2423 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2425 Register Dst = MI.getOperand(0).getReg();
2426 Register VecReg = MI.getOperand(1).getReg();
2427 bool IsUndef = MI.getOperand(1).isUndef();
2428 Register Idx = MI.getOperand(2).getReg();
2429 Register SubReg = MI.getOperand(3).getImm();
2430
2431 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2432 .addReg(Idx)
2433 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2434 SetOn->getOperand(3).setIsUndef();
2435
2436 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2437 .addDef(Dst)
2438 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2439 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2440
2441 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2442
2443 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2444
2445 MI.eraseFromParent();
2446 break;
2447 }
2448 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2449 MachineFunction &MF = *MBB.getParent();
2450 Register Reg = MI.getOperand(0).getReg();
2451 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2452 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2453 MachineOperand OpLo = MI.getOperand(1);
2454 MachineOperand OpHi = MI.getOperand(2);
2455
2456 // Create a bundle so these instructions won't be re-ordered by the
2457 // post-RA scheduler.
2458 MIBundleBuilder Bundler(MBB, MI);
2459 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2460
2461 // What we want here is an offset from the value returned by s_getpc (which
2462 // is the address of the s_add_u32 instruction) to the global variable, but
2463 // since the encoding of $symbol starts 4 bytes after the start of the
2464 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2465 // small. This requires us to add 4 to the global variable offset in order
2466 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2467 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2468 // instruction.
2469
2470 int64_t Adjust = 0;
2471 if (ST.hasGetPCZeroExtension()) {
2472 // Fix up hardware that does not sign-extend the 48-bit PC value by
2473 // inserting: s_sext_i32_i16 reghi, reghi
2474 Bundler.append(
2475 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2476 Adjust += 4;
2477 }
2478
2479 if (OpLo.isGlobal())
2480 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2481 Bundler.append(
2482 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2483
2484 if (OpHi.isGlobal())
2485 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2486 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2487 .addReg(RegHi)
2488 .add(OpHi));
2489
2490 finalizeBundle(MBB, Bundler.begin());
2491
2492 MI.eraseFromParent();
2493 break;
2494 }
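 // Byte layout the offsets above account for (illustrative; registers are
 // placeholders):
 //   s_getpc_b64  s[0:1]          ; result = address of the next instruction
 //   s_add_u32    s0, s0, $lo     ; $lo literal sits at result + 4
 //   s_addc_u32   s1, s1, $hi     ; $hi literal sits at result + 12
 // plus 4 more bytes when the extra s_sext_i32_i16 workaround is inserted.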
2495 case AMDGPU::ENTER_STRICT_WWM: {
2496 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2497 // Whole Wave Mode is entered.
2498 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2499 : AMDGPU::S_OR_SAVEEXEC_B64));
2500 break;
2501 }
2502 case AMDGPU::ENTER_STRICT_WQM: {
2503 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2504 // STRICT_WQM is entered.
2505 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2506 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2507 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2508 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2509 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2510
2511 MI.eraseFromParent();
2512 break;
2513 }
2514 case AMDGPU::EXIT_STRICT_WWM:
2515 case AMDGPU::EXIT_STRICT_WQM: {
2516 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2517 // WWM/STRICT_WQM is exited.
2518 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2519 break;
2520 }
2521 case AMDGPU::SI_RETURN: {
2522 const MachineFunction *MF = MBB.getParent();
2523 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2524 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2525 // Hiding the return address use with SI_RETURN may lead to extra kills in
2526 // the function and missing live-ins. We are fine in practice because callee
2527 // saved register handling ensures the register value is restored before
2528 // RET, but we need the undef flag here to appease the MachineVerifier
2529 // liveness checks.
2530 MachineInstrBuilder MIB =
2531 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2532 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2533
2534 MIB.copyImplicitOps(MI);
2535 MI.eraseFromParent();
2536 break;
2537 }
2538
2539 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2540 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2541 MI.setDesc(get(AMDGPU::S_MUL_U64));
2542 break;
2543
2544 case AMDGPU::S_GETPC_B64_pseudo:
2545 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2546 if (ST.hasGetPCZeroExtension()) {
2547 Register Dst = MI.getOperand(0).getReg();
2548 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2549 // Fix up hardware that does not sign-extend the 48-bit PC value by
2550 // inserting: s_sext_i32_i16 dsthi, dsthi
2551 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2552 DstHi)
2553 .addReg(DstHi);
2554 }
2555 break;
2556 }
2557 return true;
2558}
2559
2560void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2561 MachineBasicBlock::iterator I, Register DestReg,
2562 unsigned SubIdx, const MachineInstr &Orig,
2563 const TargetRegisterInfo &RI) const {
2564
2565 // Try shrinking the instruction to remat only the part needed for current
2566 // context.
2567 // TODO: Handle more cases.
2568 unsigned Opcode = Orig.getOpcode();
2569 switch (Opcode) {
2570 case AMDGPU::S_LOAD_DWORDX16_IMM:
2571 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2572 if (SubIdx != 0)
2573 break;
2574
2575 if (I == MBB.end())
2576 break;
2577
2578 if (I->isBundled())
2579 break;
2580
2581 // Look for a single use of the register that is also a subreg.
2582 Register RegToFind = Orig.getOperand(0).getReg();
2583 MachineOperand *UseMO = nullptr;
2584 for (auto &CandMO : I->operands()) {
2585 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2586 continue;
2587 if (UseMO) {
2588 UseMO = nullptr;
2589 break;
2590 }
2591 UseMO = &CandMO;
2592 }
2593 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2594 break;
2595
2596 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2597 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2598
2599 MachineFunction *MF = MBB.getParent();
2600 MachineRegisterInfo &MRI = MF->getRegInfo();
2601 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2602
2603 unsigned NewOpcode = -1;
2604 if (SubregSize == 256)
2605 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2606 else if (SubregSize == 128)
2607 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2608 else
2609 break;
2610
2611 const MCInstrDesc &TID = get(NewOpcode);
2612 const TargetRegisterClass *NewRC =
2613 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2614 MRI.setRegClass(DestReg, NewRC);
2615
2616 UseMO->setReg(DestReg);
2617 UseMO->setSubReg(AMDGPU::NoSubRegister);
2618
2619 // Use a smaller load with the desired size, possibly with updated offset.
2620 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2621 MI->setDesc(TID);
2622 MI->getOperand(0).setReg(DestReg);
2623 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2624 if (Offset) {
2625 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2626 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2627 OffsetMO->setImm(FinalOffset);
2628 }
2629 SmallVector<MachineMemOperand *> NewMMOs;
2630 for (const MachineMemOperand *MemOp : Orig.memoperands())
2631 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2632 SubregSize / 8));
2633 MI->setMemRefs(*MF, NewMMOs);
2634
2635 MBB.insert(I, MI);
2636 return;
2637 }
2638
2639 default:
2640 break;
2641 }
2642
2643 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2644}
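// Example (illustrative): if only a 128-bit subregister of an
// S_LOAD_DWORDX16_IMM result is actually used at the remat point, the code
// above rematerializes just an S_LOAD_DWORDX4_IMM, adding the subregister's
// byte offset to the immediate offset and shrinking the memory operand.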
2645
2646std::pair<MachineInstr*, MachineInstr*>
2647SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2648 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2649
2650 if (ST.hasMovB64() &&
2652 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2653 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2654 return std::pair(&MI, nullptr);
2655 }
2656
2657 MachineBasicBlock &MBB = *MI.getParent();
2658 DebugLoc DL = MBB.findDebugLoc(MI);
2659 MachineFunction *MF = MBB.getParent();
2660 MachineRegisterInfo &MRI = MF->getRegInfo();
2661 Register Dst = MI.getOperand(0).getReg();
2662 unsigned Part = 0;
2663 MachineInstr *Split[2];
2664
2665 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2666 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2667 if (Dst.isPhysical()) {
2668 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2669 } else {
2670 assert(MRI.isSSA());
2671 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2672 MovDPP.addDef(Tmp);
2673 }
2674
2675 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2676 const MachineOperand &SrcOp = MI.getOperand(I);
2677 assert(!SrcOp.isFPImm());
2678 if (SrcOp.isImm()) {
2679 APInt Imm(64, SrcOp.getImm());
2680 Imm.ashrInPlace(Part * 32);
2681 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2682 } else {
2683 assert(SrcOp.isReg());
2684 Register Src = SrcOp.getReg();
2685 if (Src.isPhysical())
2686 MovDPP.addReg(RI.getSubReg(Src, Sub));
2687 else
2688 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2689 }
2690 }
2691
2692 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2693 MovDPP.addImm(MO.getImm());
2694
2695 Split[Part] = MovDPP;
2696 ++Part;
2697 }
2698
2699 if (Dst.isVirtual())
2700 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2701 .addReg(Split[0]->getOperand(0).getReg())
2702 .addImm(AMDGPU::sub0)
2703 .addReg(Split[1]->getOperand(0).getReg())
2704 .addImm(AMDGPU::sub1);
2705
2706 MI.eraseFromParent();
2707 return std::pair(Split[0], Split[1]);
2708}
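// Summary of the split above (illustrative): when a true 64-bit DPP mov is not
// available, the pseudo becomes two V_MOV_B32_dpp on sub0/sub1 (a 64-bit
// immediate is split into its 32-bit halves), and for a virtual destination
// the halves are rejoined with a REG_SEQUENCE.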
2709
2710std::optional<DestSourcePair>
2711SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2712 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2713 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2714
2715 return std::nullopt;
2716}
2717
2718bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2719 MachineOperand &Src0,
2720 unsigned Src0OpName,
2721 MachineOperand &Src1,
2722 unsigned Src1OpName) const {
2723 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2724 if (!Src0Mods)
2725 return false;
2726
2727 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2728 assert(Src1Mods &&
2729 "All commutable instructions have both src0 and src1 modifiers");
2730
2731 int Src0ModsVal = Src0Mods->getImm();
2732 int Src1ModsVal = Src1Mods->getImm();
2733
2734 Src1Mods->setImm(Src0ModsVal);
2735 Src0Mods->setImm(Src1ModsVal);
2736 return true;
2737}
2738
2739static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2740 MachineOperand &RegOp,
2741 MachineOperand &NonRegOp) {
2742 Register Reg = RegOp.getReg();
2743 unsigned SubReg = RegOp.getSubReg();
2744 bool IsKill = RegOp.isKill();
2745 bool IsDead = RegOp.isDead();
2746 bool IsUndef = RegOp.isUndef();
2747 bool IsDebug = RegOp.isDebug();
2748
2749 if (NonRegOp.isImm())
2750 RegOp.ChangeToImmediate(NonRegOp.getImm());
2751 else if (NonRegOp.isFI())
2752 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2753 else if (NonRegOp.isGlobal()) {
2754 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2755 NonRegOp.getTargetFlags());
2756 } else
2757 return nullptr;
2758
2759 // Make sure we don't reinterpret a subreg index in the target flags.
2760 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2761
2762 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2763 NonRegOp.setSubReg(SubReg);
2764
2765 return &MI;
2766}
2767
2768MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2769 unsigned Src0Idx,
2770 unsigned Src1Idx) const {
2771 assert(!NewMI && "this should never be used");
2772
2773 unsigned Opc = MI.getOpcode();
2774 int CommutedOpcode = commuteOpcode(Opc);
2775 if (CommutedOpcode == -1)
2776 return nullptr;
2777
2778 if (Src0Idx > Src1Idx)
2779 std::swap(Src0Idx, Src1Idx);
2780
2781 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2782 static_cast<int>(Src0Idx) &&
2783 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2784 static_cast<int>(Src1Idx) &&
2785 "inconsistency with findCommutedOpIndices");
2786
2787 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2788 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2789
2790 MachineInstr *CommutedMI = nullptr;
2791 if (Src0.isReg() && Src1.isReg()) {
2792 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2793 // Be sure to copy the source modifiers to the right place.
2794 CommutedMI
2795 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2796 }
2797
2798 } else if (Src0.isReg() && !Src1.isReg()) {
2799 // src0 should always be able to support any operand type, so no need to
2800 // check operand legality.
2801 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2802 } else if (!Src0.isReg() && Src1.isReg()) {
2803 if (isOperandLegal(MI, Src1Idx, &Src0))
2804 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2805 } else {
2806 // FIXME: Found two non registers to commute. This does happen.
2807 return nullptr;
2808 }
2809
2810 if (CommutedMI) {
2811 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2812 Src1, AMDGPU::OpName::src1_modifiers);
2813
2814 CommutedMI->setDesc(get(CommutedOpcode));
2815 }
2816
2817 return CommutedMI;
2818}
2819
2820// This needs to be implemented because the source modifiers may be inserted
2821// between the true commutable operands, and the base
2822// TargetInstrInfo::commuteInstruction uses it.
2823bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2824 unsigned &SrcOpIdx0,
2825 unsigned &SrcOpIdx1) const {
2826 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2827}
2828
2829bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2830 unsigned &SrcOpIdx0,
2831 unsigned &SrcOpIdx1) const {
2832 if (!Desc.isCommutable())
2833 return false;
2834
2835 unsigned Opc = Desc.getOpcode();
2836 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2837 if (Src0Idx == -1)
2838 return false;
2839
2840 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2841 if (Src1Idx == -1)
2842 return false;
2843
2844 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2845}
2846
2847bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2848 int64_t BrOffset) const {
2849 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2850 // block is unanalyzable.
2851 assert(BranchOp != AMDGPU::S_SETPC_B64);
2852
2853 // Convert to dwords.
2854 BrOffset /= 4;
2855
2856 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2857 // from the next instruction.
2858 BrOffset -= 1;
2859
2860 return isIntN(BranchOffsetBits, BrOffset);
2861}
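// Example (illustrative): a branch whose target is 1028 bytes past the branch
// becomes 1028/4 - 1 = 256 dwords relative to the next instruction, which must
// fit in the signed immediate width given by BranchOffsetBits.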
2862
2863MachineBasicBlock *
2864SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2865 return MI.getOperand(0).getMBB();
2866}
2867
2869 for (const MachineInstr &MI : MBB->terminators()) {
2870 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2871 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2872 MI.getOpcode() == AMDGPU::SI_LOOP)
2873 return true;
2874 }
2875 return false;
2876}
2877
2878void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2879 MachineBasicBlock &DestBB,
2880 MachineBasicBlock &RestoreBB,
2881 const DebugLoc &DL, int64_t BrOffset,
2882 RegScavenger *RS) const {
2883 assert(RS && "RegScavenger required for long branching");
2884 assert(MBB.empty() &&
2885 "new block should be inserted for expanding unconditional branch");
2886 assert(MBB.pred_size() == 1);
2887 assert(RestoreBB.empty() &&
2888 "restore block should be inserted for restoring clobbered registers");
2889
2890 MachineFunction *MF = MBB.getParent();
2891 MachineRegisterInfo &MRI = MF->getRegInfo();
2892 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2893
2894 // FIXME: Virtual register workaround for RegScavenger not working with empty
2895 // blocks.
2896 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2897
2898 auto I = MBB.end();
2899
2900 // We need to compute the offset relative to the instruction immediately after
2901 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2902 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2903
2904 auto &MCCtx = MF->getContext();
2905 MCSymbol *PostGetPCLabel =
2906 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2907 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2908
2909 MCSymbol *OffsetLo =
2910 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2911 MCSymbol *OffsetHi =
2912 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2913 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2914 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2915 .addReg(PCReg, 0, AMDGPU::sub0)
2916 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2917 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2918 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2919 .addReg(PCReg, 0, AMDGPU::sub1)
2920 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2921
2922 // Insert the indirect branch after the other terminator.
2923 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2924 .addReg(PCReg);
2925
2926 // If a spill is needed for the pc register pair, we need to insert a spill
2927 // restore block right before the destination block, and insert a short branch
2928 // into the old destination block's fallthrough predecessor.
2929 // e.g.:
2930 //
2931 // s_cbranch_scc0 skip_long_branch:
2932 //
2933 // long_branch_bb:
2934 // spill s[8:9]
2935 // s_getpc_b64 s[8:9]
2936 // s_add_u32 s8, s8, restore_bb
2937 // s_addc_u32 s9, s9, 0
2938 // s_setpc_b64 s[8:9]
2939 //
2940 // skip_long_branch:
2941 // foo;
2942 //
2943 // .....
2944 //
2945 // dest_bb_fallthrough_predecessor:
2946 // bar;
2947 // s_branch dest_bb
2948 //
2949 // restore_bb:
2950 // restore s[8:9]
2951 // fallthrough dest_bb
2952 ///
2953 // dest_bb:
2954 // buzz;
2955
2956 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2957 Register Scav;
2958
2959 // If we've previously reserved a register for long branches
2960 // avoid running the scavenger and just use those registers
2961 if (LongBranchReservedReg) {
2962 RS->enterBasicBlock(MBB);
2963 Scav = LongBranchReservedReg;
2964 } else {
2965 RS->enterBasicBlockEnd(MBB);
2966 Scav = RS->scavengeRegisterBackwards(
2967 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2968 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2969 }
2970 if (Scav) {
2971 RS->setRegUsed(Scav);
2972 MRI.replaceRegWith(PCReg, Scav);
2973 MRI.clearVirtRegs();
2974 } else {
2975 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
2976 // SGPR spill.
2977 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2978 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2979 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2980 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2981 MRI.clearVirtRegs();
2982 }
2983
2984 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2985 // Now the distance between the labels can be computed.
2986 auto *Offset = MCBinaryExpr::createSub(
2987 MCSymbolRefExpr::create(DestLabel, MCCtx),
2988 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2989 // Add offset assignments.
2990 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2991 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2992 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2993 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2994}
2995
2996unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2997 switch (Cond) {
2998 case SIInstrInfo::SCC_TRUE:
2999 return AMDGPU::S_CBRANCH_SCC1;
3000 case SIInstrInfo::SCC_FALSE:
3001 return AMDGPU::S_CBRANCH_SCC0;
3002 case SIInstrInfo::VCCNZ:
3003 return AMDGPU::S_CBRANCH_VCCNZ;
3004 case SIInstrInfo::VCCZ:
3005 return AMDGPU::S_CBRANCH_VCCZ;
3006 case SIInstrInfo::EXECNZ:
3007 return AMDGPU::S_CBRANCH_EXECNZ;
3008 case SIInstrInfo::EXECZ:
3009 return AMDGPU::S_CBRANCH_EXECZ;
3010 default:
3011 llvm_unreachable("invalid branch predicate");
3012 }
3013}
3014
3015SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3016 switch (Opcode) {
3017 case AMDGPU::S_CBRANCH_SCC0:
3018 return SCC_FALSE;
3019 case AMDGPU::S_CBRANCH_SCC1:
3020 return SCC_TRUE;
3021 case AMDGPU::S_CBRANCH_VCCNZ:
3022 return VCCNZ;
3023 case AMDGPU::S_CBRANCH_VCCZ:
3024 return VCCZ;
3025 case AMDGPU::S_CBRANCH_EXECNZ:
3026 return EXECNZ;
3027 case AMDGPU::S_CBRANCH_EXECZ:
3028 return EXECZ;
3029 default:
3030 return INVALID_BR;
3031 }
3032}
3033
3034bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3035 MachineBasicBlock::iterator I,
3036 MachineBasicBlock *&TBB,
3037 MachineBasicBlock *&FBB,
3038 SmallVectorImpl<MachineOperand> &Cond,
3039 bool AllowModify) const {
3040 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3041 // Unconditional Branch
3042 TBB = I->getOperand(0).getMBB();
3043 return false;
3044 }
3045
3046 MachineBasicBlock *CondBB = nullptr;
3047
3048 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
3049 CondBB = I->getOperand(1).getMBB();
3050 Cond.push_back(I->getOperand(0));
3051 } else {
3052 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3053 if (Pred == INVALID_BR)
3054 return true;
3055
3056 CondBB = I->getOperand(0).getMBB();
3057 Cond.push_back(MachineOperand::CreateImm(Pred));
3058 Cond.push_back(I->getOperand(1)); // Save the branch register.
3059 }
3060 ++I;
3061
3062 if (I == MBB.end()) {
3063 // Conditional branch followed by fall-through.
3064 TBB = CondBB;
3065 return false;
3066 }
3067
3068 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3069 TBB = CondBB;
3070 FBB = I->getOperand(0).getMBB();
3071 return false;
3072 }
3073
3074 return true;
3075}
3076
3077bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3078 MachineBasicBlock *&FBB,
3079 SmallVectorImpl<MachineOperand> &Cond,
3080 bool AllowModify) const {
3081 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3082 auto E = MBB.end();
3083 if (I == E)
3084 return false;
3085
3086 // Skip over the instructions that are artificially terminators for special
3087 // exec management.
3088 while (I != E && !I->isBranch() && !I->isReturn()) {
3089 switch (I->getOpcode()) {
3090 case AMDGPU::S_MOV_B64_term:
3091 case AMDGPU::S_XOR_B64_term:
3092 case AMDGPU::S_OR_B64_term:
3093 case AMDGPU::S_ANDN2_B64_term:
3094 case AMDGPU::S_AND_B64_term:
3095 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3096 case AMDGPU::S_MOV_B32_term:
3097 case AMDGPU::S_XOR_B32_term:
3098 case AMDGPU::S_OR_B32_term:
3099 case AMDGPU::S_ANDN2_B32_term:
3100 case AMDGPU::S_AND_B32_term:
3101 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3102 break;
3103 case AMDGPU::SI_IF:
3104 case AMDGPU::SI_ELSE:
3105 case AMDGPU::SI_KILL_I1_TERMINATOR:
3106 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3107 // FIXME: It's messy that these need to be considered here at all.
3108 return true;
3109 default:
3110 llvm_unreachable("unexpected non-branch terminator inst");
3111 }
3112
3113 ++I;
3114 }
3115
3116 if (I == E)
3117 return false;
3118
3119 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3120}
3121
3122unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3123 int *BytesRemoved) const {
3124 unsigned Count = 0;
3125 unsigned RemovedSize = 0;
3126 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3127 // Skip over artificial terminators when removing instructions.
3128 if (MI.isBranch() || MI.isReturn()) {
3129 RemovedSize += getInstSizeInBytes(MI);
3130 MI.eraseFromParent();
3131 ++Count;
3132 }
3133 }
3134
3135 if (BytesRemoved)
3136 *BytesRemoved = RemovedSize;
3137
3138 return Count;
3139}
3140
3141// Copy the flags onto the implicit condition register operand.
3142static void preserveCondRegFlags(MachineOperand &CondReg,
3143 const MachineOperand &OrigCond) {
3144 CondReg.setIsUndef(OrigCond.isUndef());
3145 CondReg.setIsKill(OrigCond.isKill());
3146}
3147
3148unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3149 MachineBasicBlock *TBB,
3150 MachineBasicBlock *FBB,
3151 ArrayRef<MachineOperand> Cond,
3152 const DebugLoc &DL,
3153 int *BytesAdded) const {
3154 if (!FBB && Cond.empty()) {
3155 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3156 .addMBB(TBB);
3157 if (BytesAdded)
3158 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3159 return 1;
3160 }
3161
3162 if (Cond.size() == 1 && Cond[0].isReg()) {
3163 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
3164 .add(Cond[0])
3165 .addMBB(TBB);
3166 return 1;
3167 }
3168
3169 assert(TBB && Cond[0].isImm());
3170
3171 unsigned Opcode
3172 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3173
3174 if (!FBB) {
3175 MachineInstr *CondBr =
3176 BuildMI(&MBB, DL, get(Opcode))
3177 .addMBB(TBB);
3178
3179 // Copy the flags onto the implicit condition register operand.
3180 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3181 fixImplicitOperands(*CondBr);
3182
3183 if (BytesAdded)
3184 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3185 return 1;
3186 }
3187
3188 assert(TBB && FBB);
3189
3190 MachineInstr *CondBr =
3191 BuildMI(&MBB, DL, get(Opcode))
3192 .addMBB(TBB);
3193 fixImplicitOperands(*CondBr);
3194 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3195 .addMBB(FBB);
3196
3197 MachineOperand &CondReg = CondBr->getOperand(1);
3198 CondReg.setIsUndef(Cond[1].isUndef());
3199 CondReg.setIsKill(Cond[1].isKill());
3200
3201 if (BytesAdded)
3202 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3203
3204 return 2;
3205}
3206
3207bool SIInstrInfo::reverseBranchCondition(
3208 SmallVectorImpl<MachineOperand> &Cond) const {
3209 if (Cond.size() != 2) {
3210 return true;
3211 }
3212
3213 if (Cond[0].isImm()) {
3214 Cond[0].setImm(-Cond[0].getImm());
3215 return false;
3216 }
3217
3218 return true;
3219}
3220
3221bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3222 ArrayRef<MachineOperand> Cond,
3223 Register DstReg, Register TrueReg,
3224 Register FalseReg, int &CondCycles,
3225 int &TrueCycles, int &FalseCycles) const {
3226 switch (Cond[0].getImm()) {
3227 case VCCNZ:
3228 case VCCZ: {
3229 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3230 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3231 if (MRI.getRegClass(FalseReg) != RC)
3232 return false;
3233
3234 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3235 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3236
3237 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3238 return RI.hasVGPRs(RC) && NumInsts <= 6;
3239 }
3240 case SCC_TRUE:
3241 case SCC_FALSE: {
3242 // FIXME: We could insert for VGPRs if we could replace the original compare
3243 // with a vector one.
3244 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3245 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3246 if (MRI.getRegClass(FalseReg) != RC)
3247 return false;
3248
3249 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3250
3251 // Multiples of 8 can do s_cselect_b64
3252 if (NumInsts % 2 == 0)
3253 NumInsts /= 2;
3254
3255 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3256 return RI.isSGPRClass(RC);
3257 }
3258 default:
3259 return false;
3260 }
3261}
3262
3263void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3264 MachineBasicBlock::iterator I, const DebugLoc &DL,
3265 Register DstReg, ArrayRef<MachineOperand> Cond,
3266 Register TrueReg, Register FalseReg) const {
3267 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3268 if (Pred == VCCZ || Pred == SCC_FALSE) {
3269 Pred = static_cast<BranchPredicate>(-Pred);
3270 std::swap(TrueReg, FalseReg);
3271 }
3272
3273 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3274 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3275 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3276
3277 if (DstSize == 32) {
3278 MachineInstr *Select;
3279 if (Pred == SCC_TRUE) {
3280 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3281 .addReg(TrueReg)
3282 .addReg(FalseReg);
3283 } else {
3284 // Instruction's operands are backwards from what is expected.
3285 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3286 .addReg(FalseReg)
3287 .addReg(TrueReg);
3288 }
3289
3290 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3291 return;
3292 }
3293
3294 if (DstSize == 64 && Pred == SCC_TRUE) {
3295 MachineInstr *Select =
3296 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3297 .addReg(TrueReg)
3298 .addReg(FalseReg);
3299
3300 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3301 return;
3302 }
3303
3304 static const int16_t Sub0_15[] = {
3305 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3306 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3307 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3308 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3309 };
3310
3311 static const int16_t Sub0_15_64[] = {
3312 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3313 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3314 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3315 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3316 };
3317
3318 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3319 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3320 const int16_t *SubIndices = Sub0_15;
3321 int NElts = DstSize / 32;
3322
3323 // 64-bit select is only available for SALU.
3324 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3325 if (Pred == SCC_TRUE) {
3326 if (NElts % 2) {
3327 SelOp = AMDGPU::S_CSELECT_B32;
3328 EltRC = &AMDGPU::SGPR_32RegClass;
3329 } else {
3330 SelOp = AMDGPU::S_CSELECT_B64;
3331 EltRC = &AMDGPU::SGPR_64RegClass;
3332 SubIndices = Sub0_15_64;
3333 NElts /= 2;
3334 }
3335 }
3336
3337 MachineInstrBuilder MIB = BuildMI(
3338 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3339
3340 I = MIB->getIterator();
3341
3342 SmallVector<Register, 8> Regs;
3343 for (int Idx = 0; Idx != NElts; ++Idx) {
3344 Register DstElt = MRI.createVirtualRegister(EltRC);
3345 Regs.push_back(DstElt);
3346
3347 unsigned SubIdx = SubIndices[Idx];
3348
3349 MachineInstr *Select;
3350 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3351 Select =
3352 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3353 .addReg(FalseReg, 0, SubIdx)
3354 .addReg(TrueReg, 0, SubIdx);
3355 } else {
3356 Select =
3357 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3358 .addReg(TrueReg, 0, SubIdx)
3359 .addReg(FalseReg, 0, SubIdx);
3360 }
3361
3362 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3364
3365 MIB.addReg(DstElt)
3366 .addImm(SubIdx);
3367 }
3368}
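// Example (illustrative; virtual registers are placeholders): a wide VGPR
// select expands per 32-bit element into something like
//   %e0 = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   %e1 = V_CNDMASK_B32_e32 %false.sub1, %true.sub1, implicit $vcc
//   %dst = REG_SEQUENCE %e0, %subreg.sub0, %e1, %subreg.sub1
// with S_CSELECT_B32/B64 (reading SCC) used instead for SGPR destinations.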
3369
3370bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3371 switch (MI.getOpcode()) {
3372 case AMDGPU::V_MOV_B16_t16_e32:
3373 case AMDGPU::V_MOV_B16_t16_e64:
3374 case AMDGPU::V_MOV_B32_e32:
3375 case AMDGPU::V_MOV_B32_e64:
3376 case AMDGPU::V_MOV_B64_PSEUDO:
3377 case AMDGPU::V_MOV_B64_e32:
3378 case AMDGPU::V_MOV_B64_e64:
3379 case AMDGPU::S_MOV_B32:
3380 case AMDGPU::S_MOV_B64:
3381 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3382 case AMDGPU::COPY:
3383 case AMDGPU::WWM_COPY:
3384 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3385 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3386 case AMDGPU::V_ACCVGPR_MOV_B32:
3387 return true;
3388 default:
3389 return false;
3390 }
3391}
3392
3393static constexpr unsigned ModifierOpNames[] = {
3394 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3395 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3396 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3397
3398void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3399 unsigned Opc = MI.getOpcode();
3400 for (unsigned Name : reverse(ModifierOpNames)) {
3401 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3402 if (Idx >= 0)
3403 MI.removeOperand(Idx);
3404 }
3405}
3406
3407bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3408 Register Reg, MachineRegisterInfo *MRI) const {
3409 if (!MRI->hasOneNonDBGUse(Reg))
3410 return false;
3411
3412 switch (DefMI.getOpcode()) {
3413 default:
3414 return false;
3415 case AMDGPU::V_MOV_B64_e32:
3416 case AMDGPU::S_MOV_B64:
3417 case AMDGPU::V_MOV_B64_PSEUDO:
3418 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3419 case AMDGPU::V_MOV_B32_e32:
3420 case AMDGPU::S_MOV_B32:
3421 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3422 break;
3423 }
3424
3425 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3426 assert(ImmOp);
3427 // FIXME: We could handle FrameIndex values here.
3428 if (!ImmOp->isImm())
3429 return false;
3430
3431 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3432 int64_t Imm = ImmOp->getImm();
3433 switch (UseOp.getSubReg()) {
3434 default:
3435 return Imm;
3436 case AMDGPU::sub0:
3437 return Lo_32(Imm);
3438 case AMDGPU::sub1:
3439 return Hi_32(Imm);
3440 case AMDGPU::lo16:
3441 return APInt(16, Imm).getSExtValue();
3442 case AMDGPU::hi16:
3443 return APInt(32, Imm).ashr(16).getSExtValue();
3444 case AMDGPU::sub1_lo16:
3445 return APInt(16, Hi_32(Imm)).getSExtValue();
3446 case AMDGPU::sub1_hi16:
3447 return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
3448 }
3449 };
3450
3451 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3452
3453 unsigned Opc = UseMI.getOpcode();
3454 if (Opc == AMDGPU::COPY) {
3455 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3456
3457 Register DstReg = UseMI.getOperand(0).getReg();
3458 unsigned OpSize = getOpSize(UseMI, 0);
3459 bool Is16Bit = OpSize == 2;
3460 bool Is64Bit = OpSize == 8;
3461 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3462 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3463 : AMDGPU::V_MOV_B32_e32
3464 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3465 : AMDGPU::S_MOV_B32;
3466 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
3467
3468 if (RI.isAGPR(*MRI, DstReg)) {
3469 if (Is64Bit || !isInlineConstant(Imm))
3470 return false;
3471 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3472 }
3473
3474 if (Is16Bit) {
3475 if (isVGPRCopy)
3476 return false; // Do not clobber vgpr_hi16
3477
3478 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3479 return false;
3480
3481 UseMI.getOperand(0).setSubReg(0);
3482 if (DstReg.isPhysical()) {
3483 DstReg = RI.get32BitRegister(DstReg);
3484 UseMI.getOperand(0).setReg(DstReg);
3485 }
3486 assert(UseMI.getOperand(1).getReg().isVirtual());
3487 }
3488
3489 const MCInstrDesc &NewMCID = get(NewOpc);
3490 if (DstReg.isPhysical() &&
3491 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3492 return false;
3493
3494 UseMI.setDesc(NewMCID);
3495 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3496 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3497 return true;
3498 }
3499
3500 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3501 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3502 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3503 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3504 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3505 // Don't fold if we are using source or output modifiers. The new VOP2
3506 // instructions don't have them.
3507 if (hasAnyModifiersSet(UseMI))
3508 return false;
3509
3510 // If this is a free constant, there's no reason to do this.
3511 // TODO: We could fold this here instead of letting SIFoldOperands do it
3512 // later.
3513 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3514
3515 // Any src operand can be used for the legality check.
3516 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3517 return false;
3518
3519 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3520 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3521 bool IsFMA =
3522 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3523 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3524 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3525 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3526 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3527
3528 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3529 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3530 (Src1->isReg() && Src1->getReg() == Reg)) {
3531 MachineOperand *RegSrc =
3532 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3533 if (!RegSrc->isReg())
3534 return false;
3535 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3536 ST.getConstantBusLimit(Opc) < 2)
3537 return false;
3538
3539 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3540 return false;
3541
3542 // If src2 is also a literal constant then we have to choose which one to
3543 // fold. In general it is better to choose madak so that the other literal
3544 // can be materialized in an sgpr instead of a vgpr:
3545 // s_mov_b32 s0, literal
3546 // v_madak_f32 v0, s0, v0, literal
3547 // Instead of:
3548 // v_mov_b32 v1, literal
3549 // v_madmk_f32 v0, v0, literal, v1
3550 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3551 if (Def && Def->isMoveImmediate() &&
3552 !isInlineConstant(Def->getOperand(1)))
3553 return false;
3554
3555 unsigned NewOpc =
3556 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3557 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3558 : AMDGPU::V_FMAMK_F16)
3559 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3560 if (pseudoToMCOpcode(NewOpc) == -1)
3561 return false;
3562
3563 // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3564 // would also require restricting their register classes. For now
3565 // just bail out.
3566 if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3567 return false;
3568
3569 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3570
3571 // FIXME: This would be a lot easier if we could return a new instruction
3572 // instead of having to modify in place.
3573
3574 Register SrcReg = RegSrc->getReg();
3575 unsigned SrcSubReg = RegSrc->getSubReg();
3576 Src0->setReg(SrcReg);
3577 Src0->setSubReg(SrcSubReg);
3578 Src0->setIsKill(RegSrc->isKill());
3579
3580 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3581 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3582 Opc == AMDGPU::V_FMAC_F16_e64)
3583 UseMI.untieRegOperand(
3584 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3585
3586 Src1->ChangeToImmediate(Imm);
3587
3588 removeModOperands(UseMI);
3589 UseMI.setDesc(get(NewOpc));
3590
3591 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3592 if (DeleteDef)
3593 DefMI.eraseFromParent();
3594
3595 return true;
3596 }
3597
3598 // Added part is the constant: Use v_madak_{f16, f32}.
3599 if (Src2->isReg() && Src2->getReg() == Reg) {
3600 if (ST.getConstantBusLimit(Opc) < 2) {
3601 // Not allowed to use constant bus for another operand.
3602 // We can however allow an inline immediate as src0.
3603 bool Src0Inlined = false;
3604 if (Src0->isReg()) {
3605 // Try to inline constant if possible.
3606 // If the Def is a move-immediate and this is its only use, folding the
3607 // immediate here saves a VGPR.
3608 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3609 if (Def && Def->isMoveImmediate() &&
3610 isInlineConstant(Def->getOperand(1)) &&
3611 MRI->hasOneUse(Src0->getReg())) {
3612 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3613 Src0Inlined = true;
3614 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3615 RI.isSGPRReg(*MRI, Src0->getReg())) {
3616 return false;
3617 }
3618 // VGPR is okay as Src0 - fallthrough
3619 }
3620
3621 if (Src1->isReg() && !Src0Inlined) {
3622 // We have one slot for inlinable constant so far - try to fill it
3623 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3624 if (Def && Def->isMoveImmediate() &&
3625 isInlineConstant(Def->getOperand(1)) &&
3626 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3627 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3628 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3629 return false;
3630 // VGPR is okay as Src1 - fallthrough
3631 }
3632 }
3633
3634 unsigned NewOpc =
3635 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3636 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3637 : AMDGPU::V_FMAAK_F16)
3638 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3639 if (pseudoToMCOpcode(NewOpc) == -1)
3640 return false;
3641
3642 // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3643 // would also require restricting their register classes. For now
3644 // just bail out.
3645 if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3646 return false;
3647
3648 // FIXME: This would be a lot easier if we could return a new instruction
3649 // instead of having to modify in place.
3650
3651 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3652 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3653 Opc == AMDGPU::V_FMAC_F16_e64)
3654 UseMI.untieRegOperand(
3655 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3656
3657 // ChangingToImmediate adds Src2 back to the instruction.
3658 Src2->ChangeToImmediate(getImmFor(*Src2));
3659
3660 // These come before src2.
3661 removeModOperands(UseMI);
3662 UseMI.setDesc(get(NewOpc));
3663 // It might happen that UseMI was commuted, leaving an SGPR in src1. Two
3664 // inline constants plus an SGPR would be illegal, so the operands must be
3665 // re-legalized.
3666 legalizeOperands(UseMI);
3667
3668 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3669 if (DeleteDef)
3670 DefMI.eraseFromParent();
3671
3672 return true;
3673 }
3674 }
3675
3676 return false;
3677}
3678
3679 static bool
3680 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3681 ArrayRef<const MachineOperand *> BaseOps2) {
3682 if (BaseOps1.size() != BaseOps2.size())
3683 return false;
3684 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3685 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3686 return false;
3687 }
3688 return true;
3689}
3690
3691static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3692 LocationSize WidthB, int OffsetB) {
3693 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3694 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3695 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3696 return LowWidth.hasValue() &&
3697 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3698}
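// Illustrative sketch (not part of the original source): two accesses are
// provably disjoint when the lower one ends at or before the higher one
// begins, e.g.
//   offsetsDoNotOverlap(LocationSize::precise(4), /*OffsetA=*/0,
//                       LocationSize::precise(4), /*OffsetB=*/4); // true
//   offsetsDoNotOverlap(LocationSize::precise(4), /*OffsetA=*/0,
//                       LocationSize::precise(4), /*OffsetB=*/2); // false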
3699
3700bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3701 const MachineInstr &MIb) const {
3702 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3703 int64_t Offset0, Offset1;
3704 LocationSize Dummy0 = 0, Dummy1 = 0;
3705 bool Offset0IsScalable, Offset1IsScalable;
3706 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3707 Dummy0, &RI) ||
3708 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3709 Dummy1, &RI))
3710 return false;
3711
3712 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3713 return false;
3714
3715 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3716 // FIXME: Handle ds_read2 / ds_write2.
3717 return false;
3718 }
3719 LocationSize Width0 = MIa.memoperands().front()->getSize();
3720 LocationSize Width1 = MIb.memoperands().front()->getSize();
3721 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3722}
3723
3724 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3725 const MachineInstr &MIb) const {
3726 assert(MIa.mayLoadOrStore() &&
3727 "MIa must load from or modify a memory location");
3728 assert(MIb.mayLoadOrStore() &&
3729 "MIb must load from or modify a memory location");
3730
3731 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3732 return false;
3733
3734 // XXX - Can we relax this between address spaces?
3735 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3736 return false;
3737
3738 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3739 return false;
3740
3741 // TODO: Should we check the address space from the MachineMemOperand? That
3742 // would allow us to distinguish objects we know don't alias based on the
3743 // underlying address space, even if it was lowered to a different one,
3744 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3745 // buffer.
3746 if (isDS(MIa)) {
3747 if (isDS(MIb))
3748 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3749
3750 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3751 }
3752
3753 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3754 if (isMUBUF(MIb) || isMTBUF(MIb))
3755 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3756
3757 if (isFLAT(MIb))
3758 return isFLATScratch(MIb);
3759
3760 return !isSMRD(MIb);
3761 }
3762
3763 if (isSMRD(MIa)) {
3764 if (isSMRD(MIb))
3765 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3766
3767 if (isFLAT(MIb))
3768 return isFLATScratch(MIb);
3769
3770 return !isMUBUF(MIb) && !isMTBUF(MIb);
3771 }
3772
3773 if (isFLAT(MIa)) {
3774 if (isFLAT(MIb)) {
3775 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3776 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3777 return true;
3778
3779 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3780 }
3781
3782 return false;
3783 }
3784
3785 return false;
3786}
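// Illustrative summary (not part of the original source): the checks above
// exploit disjoint hardware address spaces. For example, a DS (LDS) access can
// never alias a buffer (MUBUF/MTBUF) access or a segment-specific FLAT access,
// while two accesses of the same kind fall back to the offset/width test in
// checkInstOffsetsDoNotOverlap().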
3787
3788 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3789 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3790 if (Reg.isPhysical())
3791 return false;
3792 auto *Def = MRI.getUniqueVRegDef(Reg);
3793 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3794 Imm = Def->getOperand(1).getImm();
3795 if (DefMI)
3796 *DefMI = Def;
3797 return true;
3798 }
3799 return false;
3800}
3801
3802static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3803 MachineInstr **DefMI = nullptr) {
3804 if (!MO->isReg())
3805 return false;
3806 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3807 const MachineRegisterInfo &MRI = MF->getRegInfo();
3808 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3809}
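// Illustrative usage (not part of the original source): given a virtual
// register defined by a foldable move such as
//   %5:sreg_32 = S_MOV_B32 42
// getFoldableImm(%5, MRI, Imm) returns true and sets Imm to 42, optionally
// reporting the defining S_MOV_B32 through the DefMI out-parameter.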
3810
3811 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3812 MachineInstr &NewMI) {
3813 if (LV) {
3814 unsigned NumOps = MI.getNumOperands();
3815 for (unsigned I = 1; I < NumOps; ++I) {
3816 MachineOperand &Op = MI.getOperand(I);
3817 if (Op.isReg() && Op.isKill())
3818 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3819 }
3820 }
3821}
3822
3823 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3824 LiveVariables *LV,
3825 LiveIntervals *LIS) const {
3826 MachineBasicBlock &MBB = *MI.getParent();
3827 unsigned Opc = MI.getOpcode();
3828
3829 // Handle MFMA.
3830 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3831 if (NewMFMAOpc != -1) {
3832 MachineInstrBuilder MIB =
3833 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3834 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3835 MIB.add(MI.getOperand(I));
3836 updateLiveVariables(LV, MI, *MIB);
3837 if (LIS) {
3838 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3839 // SlotIndex of defs needs to be updated when converting to early-clobber
3840 MachineOperand &Def = MIB->getOperand(0);
3841 if (Def.isEarlyClobber() && Def.isReg() &&
3842 LIS->hasInterval(Def.getReg())) {
3843 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3844 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3845 auto &LI = LIS->getInterval(Def.getReg());
3846 auto UpdateDefIndex = [&](LiveRange &LR) {
3847 auto S = LR.find(OldIndex);
3848 if (S != LR.end() && S->start == OldIndex) {
3849 assert(S->valno && S->valno->def == OldIndex);
3850 S->start = NewIndex;
3851 S->valno->def = NewIndex;
3852 }
3853 };
3854 UpdateDefIndex(LI);
3855 for (auto &SR : LI.subranges())
3856 UpdateDefIndex(SR);
3857 }
3858 }
3859 return MIB;
3860 }
3861
3862 if (SIInstrInfo::isWMMA(MI)) {
3863 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3864 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3865 .setMIFlags(MI.getFlags());
3866 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3867 MIB->addOperand(MI.getOperand(I));
3868
3869 updateLiveVariables(LV, MI, *MIB);
3870 if (LIS)
3871 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3872
3873 return MIB;
3874 }
3875
3876 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3877 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3878 "pre-RA");
3879
3880 // Handle MAC/FMAC.
3881 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3882 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3883 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3884 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3885 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3886 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3887 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3888 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3889 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3890 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3891 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3892 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3893 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3894 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3895 bool Src0Literal = false;
3896
3897 switch (Opc) {
3898 default:
3899 return nullptr;
3900 case AMDGPU::V_MAC_F16_e64:
3901 case AMDGPU::V_FMAC_F16_e64:
3902 case AMDGPU::V_FMAC_F16_t16_e64:
3903 case AMDGPU::V_MAC_F32_e64:
3904 case AMDGPU::V_MAC_LEGACY_F32_e64:
3905 case AMDGPU::V_FMAC_F32_e64:
3906 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3907 case AMDGPU::V_FMAC_F64_e64:
3908 break;
3909 case AMDGPU::V_MAC_F16_e32:
3910 case AMDGPU::V_FMAC_F16_e32:
3911 case AMDGPU::V_MAC_F32_e32:
3912 case AMDGPU::V_MAC_LEGACY_F32_e32:
3913 case AMDGPU::V_FMAC_F32_e32:
3914 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3915 case AMDGPU::V_FMAC_F64_e32: {
3916 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3917 AMDGPU::OpName::src0);
3918 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3919 if (!Src0->isReg() && !Src0->isImm())
3920 return nullptr;
3921
3922 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3923 Src0Literal = true;
3924
3925 break;
3926 }
3927 }
3928
3929 MachineInstrBuilder MIB;
3930 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3931 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3932 const MachineOperand *Src0Mods =
3933 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3934 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3935 const MachineOperand *Src1Mods =
3936 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3937 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3938 const MachineOperand *Src2Mods =
3939 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3940 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3941 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3942 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3943
3944 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3945 !IsLegacy &&
3946 // If we have an SGPR input, we will violate the constant bus restriction.
3947 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3948 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3949 MachineInstr *DefMI;
3950 const auto killDef = [&]() -> void {
3951 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3952 // The only user is the instruction which will be killed.
3953 Register DefReg = DefMI->getOperand(0).getReg();
3954 if (!MRI.hasOneNonDBGUse(DefReg))
3955 return;
3956 // We cannot just remove the DefMI here; the calling pass would crash.
3957 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3958 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3959 DefMI->removeOperand(I);
3960 if (LV)
3961 LV->getVarInfo(DefReg).AliveBlocks.clear();
3962 };
3963
3964 int64_t Imm;
3965 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3966 unsigned NewOpc =
3967 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3968 : AMDGPU::V_FMAAK_F16)
3969 : AMDGPU::V_FMAAK_F32)
3970 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3971 if (pseudoToMCOpcode(NewOpc) != -1) {
3972 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3973 .add(*Dst)
3974 .add(*Src0)
3975 .add(*Src1)
3976 .addImm(Imm)
3977 .setMIFlags(MI.getFlags());
3978 updateLiveVariables(LV, MI, *MIB);
3979 if (LIS)
3980 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3981 killDef();
3982 return MIB;
3983 }
3984 }
3985 unsigned NewOpc =
3986 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3987 : AMDGPU::V_FMAMK_F16)
3988 : AMDGPU::V_FMAMK_F32)
3989 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3990 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3991 if (pseudoToMCOpcode(NewOpc) != -1) {
3992 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3993 .add(*Dst)
3994 .add(*Src0)
3995 .addImm(Imm)
3996 .add(*Src2)
3997 .setMIFlags(MI.getFlags());
3998 updateLiveVariables(LV, MI, *MIB);
3999 if (LIS)
4000 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4001 killDef();
4002 return MIB;
4003 }
4004 }
4005 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4006 if (Src0Literal) {
4007 Imm = Src0->getImm();
4008 DefMI = nullptr;
4009 }
4010 if (pseudoToMCOpcode(NewOpc) != -1 &&
4011 isOperandLegal(
4012 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4013 Src1)) {
4014 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4015 .add(*Dst)
4016 .add(*Src1)
4017 .addImm(Imm)
4018 .add(*Src2)
4019 .setMIFlags(MI.getFlags());
4020 updateLiveVariables(LV, MI, *MIB);
4021 if (LIS)
4022 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4023 if (DefMI)
4024 killDef();
4025 return MIB;
4026 }
4027 }
4028 }
4029
4030 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4031 // if VOP3 does not allow a literal operand.
4032 if (Src0Literal && !ST.hasVOP3Literal())
4033 return nullptr;
4034
4035 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4036 : IsF64 ? AMDGPU::V_FMA_F64_e64
4037 : IsLegacy
4038 ? AMDGPU::V_FMA_LEGACY_F32_e64
4039 : AMDGPU::V_FMA_F32_e64
4040 : IsF16 ? AMDGPU::V_MAD_F16_e64
4041 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4042 : AMDGPU::V_MAD_F32_e64;
4043 if (pseudoToMCOpcode(NewOpc) == -1)
4044 return nullptr;
4045
4046 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4047 .add(*Dst)
4048 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4049 .add(*Src0)
4050 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4051 .add(*Src1)
4052 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4053 .add(*Src2)
4054 .addImm(Clamp ? Clamp->getImm() : 0)
4055 .addImm(Omod ? Omod->getImm() : 0)
4056 .setMIFlags(MI.getFlags());
4057 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4058 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4059 updateLiveVariables(LV, MI, *MIB);
4060 if (LIS)
4061 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4062 return MIB;
4063}
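// Illustrative sketch (not part of the original source): the two-address
// accumulate forms tie vdst to the addend, e.g.
//   $vgpr0 = V_MAC_F32_e32 $vgpr1, $vgpr2, $vgpr0   ; d = a * b + d
// and convertToThreeAddress() rewrites them into an untied VOP3 form such as
//   $vgpr3 = V_MAD_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr0, 0, 0
// (or V_MADAK/V_MADMK/V_FMAAK/V_FMAMK when one operand is a foldable
// immediate), so the register allocator may pick a fresh destination.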
4064
4065// It's not generally safe to move VALU instructions across these since it will
4066// start using the register as a base index rather than directly.
4067// XXX - Why isn't hasSideEffects sufficient for these?
4068 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4069 switch (MI.getOpcode()) {
4070 case AMDGPU::S_SET_GPR_IDX_ON:
4071 case AMDGPU::S_SET_GPR_IDX_MODE:
4072 case AMDGPU::S_SET_GPR_IDX_OFF:
4073 return true;
4074 default:
4075 return false;
4076 }
4077}
4078
4079 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4080 const MachineBasicBlock *MBB,
4081 const MachineFunction &MF) const {
4082 // Skipping the check for SP writes in the base implementation. It was
4083 // apparently added due to compile-time concerns.
4084 //
4085 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4086 // but is probably avoidable.
4087
4088 // Copied from base implementation.
4089 // Terminators and labels can't be scheduled around.
4090 if (MI.isTerminator() || MI.isPosition())
4091 return true;
4092
4093 // INLINEASM_BR can jump to another block
4094 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4095 return true;
4096
4097 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4098 return true;
4099
4100 // Target-independent instructions do not have an implicit-use of EXEC, even
4101 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4102 // boundaries prevents incorrect movements of such instructions.
4103 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4104 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4105 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4106 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4107 changesVGPRIndexingMode(MI);
4108}
4109
4110 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4111 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4112}
4113
4114 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4115 // Skip the full operand and register alias search modifiesRegister
4116 // does. There's only a handful of instructions that touch this, it's only an
4117 // implicit def, and doesn't alias any other registers.
4118 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4119}
4120
4121 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4122 unsigned Opcode = MI.getOpcode();
4123
4124 if (MI.mayStore() && isSMRD(MI))
4125 return true; // scalar store or atomic
4126
4127 // This will terminate the function when other lanes may need to continue.
4128 if (MI.isReturn())
4129 return true;
4130
4131 // These instructions cause shader I/O that may cause hardware lockups
4132 // when executed with an empty EXEC mask.
4133 //
4134 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4135 // EXEC = 0, but checking for that case here seems not worth it
4136 // given the typical code patterns.
4137 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4138 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4139 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4140 return true;
4141
4142 if (MI.isCall() || MI.isInlineAsm())
4143 return true; // conservative assumption
4144
4145 // Assume that barrier interactions are only intended with active lanes.
4146 if (isBarrier(Opcode))
4147 return true;
4148
4149 // A mode change is a scalar operation that influences vector instructions.
4150 if (modifiesModeRegister(MI))
4151 return true;
4152
4153 // These are like SALU instructions in terms of effects, so it's questionable
4154 // whether we should return true for those.
4155 //
4156 // However, executing them with EXEC = 0 causes them to operate on undefined
4157 // data, which we avoid by returning true here.
4158 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4159 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4160 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4161 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4162 return true;
4163
4164 return false;
4165}
4166
4167 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4168 const MachineInstr &MI) const {
4169 if (MI.isMetaInstruction())
4170 return false;
4171
4172 // This won't read exec if this is an SGPR->SGPR copy.
4173 if (MI.isCopyLike()) {
4174 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4175 return true;
4176
4177 // Make sure this isn't copying exec as a normal operand
4178 return MI.readsRegister(AMDGPU::EXEC, &RI);
4179 }
4180
4181 // Make a conservative assumption about the callee.
4182 if (MI.isCall())
4183 return true;
4184
4185 // Be conservative with any unhandled generic opcodes.
4186 if (!isTargetSpecificOpcode(MI.getOpcode()))
4187 return true;
4188
4189 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4190}
4191
4192bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4193 switch (Imm.getBitWidth()) {
4194 case 1: // This likely will be a condition code mask.
4195 return true;
4196
4197 case 32:
4198 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4199 ST.hasInv2PiInlineImm());
4200 case 64:
4201 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4202 ST.hasInv2PiInlineImm());
4203 case 16:
4204 return ST.has16BitInsts() &&
4205 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4206 ST.hasInv2PiInlineImm());
4207 default:
4208 llvm_unreachable("invalid bitwidth");
4209 }
4210}
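// Illustrative examples (not part of the original source): inline constants
// are encoded directly in the instruction and use neither a literal slot nor
// the constant bus, e.g.
//   isInlineConstant(APInt(32, 64));          // true: integers -16..64 inline
//   isInlineConstant(APInt(32, 0x3f800000));  // true: 1.0f is an inline float
//   isInlineConstant(APInt(32, 0x12345678));  // false: needs a 32-bit literal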
4211
4212 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4213 APInt IntImm = Imm.bitcastToAPInt();
4214 int64_t IntImmVal = IntImm.getSExtValue();
4215 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4216 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4217 default:
4218 llvm_unreachable("invalid fltSemantics");
4219 case APFloatBase::S_IEEEsingle:
4220 case APFloatBase::S_IEEEdouble:
4221 return isInlineConstant(IntImm);
4222 case APFloatBase::S_BFloat:
4223 return ST.has16BitInsts() &&
4224 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4225 case APFloatBase::S_IEEEhalf:
4226 return ST.has16BitInsts() &&
4227 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4228 }
4229}
4230
4231 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4232 uint8_t OperandType) const {
4233 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4234 if (!MO.isImm())
4235 return false;
4236
4237 // MachineOperand provides no way to tell the true operand size, since it only
4238 // records a 64-bit value. We need to know the size to determine if a 32-bit
4239 // floating point immediate bit pattern is legal for an integer immediate. It
4240 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4241
4242 int64_t Imm = MO.getImm();
4243 switch (OperandType) {
4256 int32_t Trunc = static_cast<int32_t>(Imm);
4257 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4258 }
4264 return AMDGPU::isInlinableLiteral64(MO.getImm(),
4265 ST.hasInv2PiInlineImm());
4269 // We would expect inline immediates to not be concerned with an integer/fp
4270 // distinction. However, in the case of 16-bit integer operations, the
4271 // "floating point" values appear to not work. It seems to read the low 16 bits
4272 // of 32-bit immediates, which happens to always work for the integer
4273 // values.
4274 //
4275 // See llvm bugzilla 46302.
4276 //
4277 // TODO: Theoretically we could use op-sel to use the high bits of the
4278 // 32-bit FP values.
4296 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4297 // A few special case instructions have 16-bit operands on subtargets
4298 // where 16-bit instructions are not legal.
4299 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4300 // constants in these cases
4301 int16_t Trunc = static_cast<int16_t>(Imm);
4302 return ST.has16BitInsts() &&
4304 }
4305
4306 return false;
4307 }
4312 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4313 int16_t Trunc = static_cast<int16_t>(Imm);
4314 return ST.has16BitInsts() &&
4316 }
4317 return false;
4318 }
4321 return false;
4324 // Always embedded in the instruction for free.
4325 return true;
4335 // Just ignore anything else.
4336 return true;
4337 default:
4338 llvm_unreachable("invalid operand type");
4339 }
4340}
4341
4342static bool compareMachineOp(const MachineOperand &Op0,
4343 const MachineOperand &Op1) {
4344 if (Op0.getType() != Op1.getType())
4345 return false;
4346
4347 switch (Op0.getType()) {
4348 case MachineOperand::MO_Register:
4349 return Op0.getReg() == Op1.getReg();
4350 case MachineOperand::MO_Immediate:
4351 return Op0.getImm() == Op1.getImm();
4352 default:
4353 llvm_unreachable("Didn't expect to be comparing these operand types");
4354 }
4355}
4356
4357 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4358 const MachineOperand &MO) const {
4359 const MCInstrDesc &InstDesc = MI.getDesc();
4360 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4361
4362 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4363
4364 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4365 return true;
4366
4367 if (OpInfo.RegClass < 0)
4368 return false;
4369
4370 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4371 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4372 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4373 AMDGPU::OpName::src2))
4374 return false;
4375 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4376 }
4377
4378 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4379 return false;
4380
4381 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4382 return true;
4383
4384 return ST.hasVOP3Literal();
4385}
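// Illustrative note (not part of the original source): callers use this query
// before substituting an immediate operand; an inline constant is accepted for
// most VALU sources, while a plain 32-bit literal in a VOP3 encoding is only
// accepted when ST.hasVOP3Literal() holds.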
4386
4387bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4388 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4389 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4390 return false;
4391
4392 int Op32 = AMDGPU::getVOPe32(Opcode);
4393 if (Op32 == -1)
4394 return false;
4395
4396 return pseudoToMCOpcode(Op32) != -1;
4397}
4398
4399bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4400 // The src0_modifier operand is present on all instructions
4401 // that have modifiers.
4402
4403 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4404}
4405
4406 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4407 unsigned OpName) const {
4408 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4409 return Mods && Mods->getImm();
4410}
4411
4412 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4413 return any_of(ModifierOpNames,
4414 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4415}
4416
4417 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4418 const MachineRegisterInfo &MRI) const {
4419 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4420 // Can't shrink instruction with three operands.
4421 if (Src2) {
4422 switch (MI.getOpcode()) {
4423 default: return false;
4424
4425 case AMDGPU::V_ADDC_U32_e64:
4426 case AMDGPU::V_SUBB_U32_e64:
4427 case AMDGPU::V_SUBBREV_U32_e64: {
4428 const MachineOperand *Src1
4429 = getNamedOperand(MI, AMDGPU::OpName::src1);
4430 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4431 return false;
4432 // Additional verification is needed for sdst/src2.
4433 return true;
4434 }
4435 case AMDGPU::V_MAC_F16_e64:
4436 case AMDGPU::V_MAC_F32_e64:
4437 case AMDGPU::V_MAC_LEGACY_F32_e64:
4438 case AMDGPU::V_FMAC_F16_e64:
4439 case AMDGPU::V_FMAC_F16_t16_e64:
4440 case AMDGPU::V_FMAC_F32_e64:
4441 case AMDGPU::V_FMAC_F64_e64:
4442 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4443 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4444 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4445 return false;
4446 break;
4447
4448 case AMDGPU::V_CNDMASK_B32_e64:
4449 break;
4450 }
4451 }
4452
4453 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4454 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4455 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4456 return false;
4457
4458 // We don't need to check src0, all input types are legal, so just make sure
4459 // src0 isn't using any modifiers.
4460 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4461 return false;
4462
4463 // Can it be shrunk to a valid 32 bit opcode?
4464 if (!hasVALU32BitEncoding(MI.getOpcode()))
4465 return false;
4466
4467 // Check output modifiers
4468 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4469 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4470 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
4471}
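// Illustrative sketch (not part of the original source): a typical shrinkable
// case is a modifier-free VOP3 whose src1 is a VGPR, e.g.
//   %d:vgpr_32 = V_ADD_U32_e64 %a:vgpr_32, %b:vgpr_32, /*clamp*/ 0
// which can be re-encoded as the 32-bit V_ADD_U32_e32 form, whereas source or
// output modifiers, or an SGPR in src1, keep it in the e64 encoding.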
4472
4473// Set VCC operand with all flags from \p Orig, except for setting it as
4474// implicit.
4475 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4476 const MachineOperand &Orig) {
4477
4478 for (MachineOperand &Use : MI.implicit_operands()) {
4479 if (Use.isUse() &&
4480 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4481 Use.setIsUndef(Orig.isUndef());
4482 Use.setIsKill(Orig.isKill());
4483 return;
4484 }
4485 }
4486}
4487
4488 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4489 unsigned Op32) const {
4490 MachineBasicBlock *MBB = MI.getParent();
4491
4492 const MCInstrDesc &Op32Desc = get(Op32);
4493 MachineInstrBuilder Inst32 =
4494 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4495 .setMIFlags(MI.getFlags());
4496
4497 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4498 // For VOPC instructions, this is replaced by an implicit def of vcc.
4499
4500 // We assume the defs of the shrunk opcode are in the same order, and the
4501 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4502 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4503 Inst32.add(MI.getOperand(I));
4504
4505 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4506
4507 int Idx = MI.getNumExplicitDefs();
4508 for (const MachineOperand &Use : MI.explicit_uses()) {
4509 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4510 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4511 continue;
4512
4513 if (&Use == Src2) {
4514 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4515 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4516 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4517 // of vcc was already added during the initial BuildMI, but we
4518 // 1) may need to change vcc to vcc_lo to preserve the original register
4519 // 2) have to preserve the original flags.
4520 fixImplicitOperands(*Inst32);
4521 copyFlagsToImplicitVCC(*Inst32, *Src2);
4522 continue;
4523 }
4524 }
4525
4526 Inst32.add(Use);
4527 }
4528
4529 // FIXME: Losing implicit operands
4530
4531 return Inst32;
4532}
4533
4534 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4535 const MachineOperand &MO,
4536 const MCOperandInfo &OpInfo) const {
4537 // Literal constants use the constant bus.
4538 if (!MO.isReg())
4539 return !isInlineConstant(MO, OpInfo);
4540
4541 if (!MO.isUse())
4542 return false;
4543
4544 if (MO.getReg().isVirtual())
4545 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4546
4547 // Null is free
4548 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4549 return false;
4550
4551 // SGPRs use the constant bus
4552 if (MO.isImplicit()) {
4553 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4554 MO.getReg() == AMDGPU::VCC_LO;
4555 }
4556 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4557 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4558}
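// Illustrative note (not part of the original source): operands that occupy a
// constant bus slot include SGPR sources, implicit reads of M0 or VCC, and any
// literal that is not an inline constant; VGPR sources, inline constants, and
// the null register are free. verifyInstruction() below counts these against
// ST.getConstantBusLimit().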
4559
4561 for (const MachineOperand &MO : MI.implicit_operands()) {
4562 // We only care about reads.
4563 if (MO.isDef())
4564 continue;
4565
4566 switch (MO.getReg()) {
4567 case AMDGPU::VCC:
4568 case AMDGPU::VCC_LO:
4569 case AMDGPU::VCC_HI:
4570 case AMDGPU::M0:
4571 case AMDGPU::FLAT_SCR:
4572 return MO.getReg();
4573
4574 default:
4575 break;
4576 }
4577 }
4578
4579 return Register();
4580}
4581
4582static bool shouldReadExec(const MachineInstr &MI) {
4583 if (SIInstrInfo::isVALU(MI)) {
4584 switch (MI.getOpcode()) {
4585 case AMDGPU::V_READLANE_B32:
4586 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4587 case AMDGPU::V_WRITELANE_B32:
4588 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4589 return false;
4590 }
4591
4592 return true;
4593 }
4594
4595 if (MI.isPreISelOpcode() ||
4596 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4597 SIInstrInfo::isSALU(MI) ||
4598 SIInstrInfo::isSMRD(MI))
4599 return false;
4600
4601 return true;
4602}
4603
4604static bool isRegOrFI(const MachineOperand &MO) {
4605 return MO.isReg() || MO.isFI();
4606}
4607
4608static bool isSubRegOf(const SIRegisterInfo &TRI,
4609 const MachineOperand &SuperVec,
4610 const MachineOperand &SubReg) {
4611 if (SubReg.getReg().isPhysical())
4612 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4613
4614 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4615 SubReg.getReg() == SuperVec.getReg();
4616}
4617
4618 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4619 StringRef &ErrInfo) const {
4620 uint16_t Opcode = MI.getOpcode();
4621 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4622 return true;
4623
4624 const MachineFunction *MF = MI.getParent()->getParent();
4625 const MachineRegisterInfo &MRI = MF->getRegInfo();
4626
4627 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4628 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4629 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4630 int Src3Idx = -1;
4631 if (Src0Idx == -1) {
4632 // VOPD V_DUAL_* instructions use different operand names.
4633 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4634 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4635 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4636 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4637 }
4638
4639 // Make sure the number of operands is correct.
4640 const MCInstrDesc &Desc = get(Opcode);
4641 if (!Desc.isVariadic() &&
4642 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4643 ErrInfo = "Instruction has wrong number of operands.";
4644 return false;
4645 }
4646
4647 if (MI.isInlineAsm()) {
4648 // Verify register classes for inlineasm constraints.
4649 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4650 I != E; ++I) {
4651 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4652 if (!RC)
4653 continue;
4654
4655 const MachineOperand &Op = MI.getOperand(I);
4656 if (!Op.isReg())
4657 continue;
4658
4659 Register Reg = Op.getReg();
4660 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4661 ErrInfo = "inlineasm operand has incorrect register class.";
4662 return false;
4663 }
4664 }
4665
4666 return true;
4667 }
4668
4669 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4670 ErrInfo = "missing memory operand from image instruction.";
4671 return false;
4672 }
4673
4674 // Make sure the register classes are correct.
4675 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4676 const MachineOperand &MO = MI.getOperand(i);
4677 if (MO.isFPImm()) {
4678 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4679 "all fp values to integers.";
4680 return false;
4681 }
4682
4683 int RegClass = Desc.operands()[i].RegClass;
4684
4685 switch (Desc.operands()[i].OperandType) {
4686 case MCOI::OPERAND_REGISTER:
4687 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4688 ErrInfo = "Illegal immediate value for operand.";
4689 return false;
4690 }
4691 break;
4696 break;
4708 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4709 ErrInfo = "Illegal immediate value for operand.";
4710 return false;
4711 }
4712 break;
4713 }
4715 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4716 ErrInfo = "Expected inline constant for operand.";
4717 return false;
4718 }
4719 break;
4720 case MCOI::OPERAND_IMMEDIATE:
4721 case AMDGPU::OPERAND_KIMM32:
4722 // Check if this operand is an immediate.
4723 // FrameIndex operands will be replaced by immediates, so they are
4724 // allowed.
4725 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4726 ErrInfo = "Expected immediate, but got non-immediate";
4727 return false;
4728 }
4729 [[fallthrough]];
4730 default:
4731 continue;
4732 }
4733
4734 if (!MO.isReg())
4735 continue;
4736 Register Reg = MO.getReg();
4737 if (!Reg)
4738 continue;
4739
4740 // FIXME: Ideally we would have separate instruction definitions with the
4741 // aligned register constraint.
4742 // FIXME: We do not verify inline asm operands, but custom inline asm
4743 // verification is broken anyway
4744 if (ST.needsAlignedVGPRs()) {
4745 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4746 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4747 const TargetRegisterClass *SubRC =
4748 RI.getSubRegisterClass(RC, MO.getSubReg());
4749 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4750 if (RC)
4751 RC = SubRC;
4752 }
4753
4754 // Check that this is the aligned version of the class.
4755 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4756 ErrInfo = "Subtarget requires even aligned vector registers";
4757 return false;
4758 }
4759 }
4760
4761 if (RegClass != -1) {
4762 if (Reg.isVirtual())
4763 continue;
4764
4765 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4766 if (!RC->contains(Reg)) {
4767 ErrInfo = "Operand has incorrect register class.";
4768 return false;
4769 }
4770 }
4771 }
4772
4773 // Verify SDWA
4774 if (isSDWA(MI)) {
4775 if (!ST.hasSDWA()) {
4776 ErrInfo = "SDWA is not supported on this target";
4777 return false;
4778 }
4779
4780 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4781
4782 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4783 if (OpIdx == -1)
4784 continue;
4785 const MachineOperand &MO = MI.getOperand(OpIdx);
4786
4787 if (!ST.hasSDWAScalar()) {
4788 // Only VGPRS on VI
4789 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4790 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4791 return false;
4792 }
4793 } else {
4794 // No immediates on GFX9
4795 if (!MO.isReg()) {
4796 ErrInfo =
4797 "Only reg allowed as operands in SDWA instructions on GFX9+";
4798 return false;
4799 }
4800 }
4801 }
4802
4803 if (!ST.hasSDWAOmod()) {
4804 // No omod allowed on VI
4805 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4806 if (OMod != nullptr &&
4807 (!OMod->isImm() || OMod->getImm() != 0)) {
4808 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4809 return false;
4810 }
4811 }
4812
4813 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4814 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4815 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4816 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4817 const MachineOperand *Src0ModsMO =
4818 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4819 unsigned Mods = Src0ModsMO->getImm();
4820 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4821 Mods & SISrcMods::SEXT) {
4822 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4823 return false;
4824 }
4825 }
4826
4827 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4828 if (isVOPC(BasicOpcode)) {
4829 if (!ST.hasSDWASdst() && DstIdx != -1) {
4830 // Only vcc allowed as dst on VI for VOPC
4831 const MachineOperand &Dst = MI.getOperand(DstIdx);
4832 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4833 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4834 return false;
4835 }
4836 } else if (!ST.hasSDWAOutModsVOPC()) {
4837 // No clamp allowed on GFX9 for VOPC
4838 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4839 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4840 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4841 return false;
4842 }
4843
4844 // No omod allowed on GFX9 for VOPC
4845 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4846 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4847 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4848 return false;
4849 }
4850 }
4851 }
4852
4853 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4854 if (DstUnused && DstUnused->isImm() &&
4855 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4856 const MachineOperand &Dst = MI.getOperand(DstIdx);
4857 if (!Dst.isReg() || !Dst.isTied()) {
4858 ErrInfo = "Dst register should have tied register";
4859 return false;
4860 }
4861
4862 const MachineOperand &TiedMO =
4863 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4864 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4865 ErrInfo =
4866 "Dst register should be tied to implicit use of preserved register";
4867 return false;
4868 }
4869 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
4870 ErrInfo = "Dst register should use same physical register as preserved";
4871 return false;
4872 }
4873 }
4874 }
4875
4876 // Verify MIMG / VIMAGE / VSAMPLE
4877 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4878 // Ensure that the return type used is large enough for all the options
4879 // being used: TFE/LWE require an extra result register.
4880 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4881 if (DMask) {
4882 uint64_t DMaskImm = DMask->getImm();
4883 uint32_t RegCount =
4884 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4885 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4886 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4887 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4888
4889 // Adjust for packed 16 bit values
4890 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4891 RegCount = divideCeil(RegCount, 2);
4892
4893 // Adjust if using LWE or TFE
4894 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4895 RegCount += 1;
4896
4897 const uint32_t DstIdx =
4898 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4899 const MachineOperand &Dst = MI.getOperand(DstIdx);
4900 if (Dst.isReg()) {
4901 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4902 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4903 if (RegCount > DstSize) {
4904 ErrInfo = "Image instruction returns too many registers for dst "
4905 "register class";
4906 return false;
4907 }
4908 }
4909 }
4910 }
4911
4912 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4913 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4914 unsigned ConstantBusCount = 0;
4915 bool UsesLiteral = false;
4916 const MachineOperand *LiteralVal = nullptr;
4917
4918 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4919 if (ImmIdx != -1) {
4920 ++ConstantBusCount;
4921 UsesLiteral = true;
4922 LiteralVal = &MI.getOperand(ImmIdx);
4923 }
4924
4925 SmallVector<Register, 2> SGPRsUsed;
4926 Register SGPRUsed;
4927
4928 // Only look at the true operands. Only a real operand can use the constant
4929 // bus, and we don't want to check pseudo-operands like the source modifier
4930 // flags.
4931 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4932 if (OpIdx == -1)
4933 continue;
4934 const MachineOperand &MO = MI.getOperand(OpIdx);
4935 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4936 if (MO.isReg()) {
4937 SGPRUsed = MO.getReg();
4938 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4939 ++ConstantBusCount;
4940 SGPRsUsed.push_back(SGPRUsed);
4941 }
4942 } else if (!MO.isFI()) { // Treat FI like a register.
4943 if (!UsesLiteral) {
4944 ++ConstantBusCount;
4945 UsesLiteral = true;
4946 LiteralVal = &MO;
4947 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4948 assert(isVOP2(MI) || isVOP3(MI));
4949 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4950 return false;
4951 }
4952 }
4953 }
4954 }
4955
4956 SGPRUsed = findImplicitSGPRRead(MI);
4957 if (SGPRUsed) {
4958 // Implicit uses may safely overlap true operands
4959 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4960 return !RI.regsOverlap(SGPRUsed, SGPR);
4961 })) {
4962 ++ConstantBusCount;
4963 SGPRsUsed.push_back(SGPRUsed);
4964 }
4965 }
4966
4967 // v_writelane_b32 is an exception to the constant bus restriction: vsrc0 can
4968 // be an SGPR, a constant, or m0, and the lane select an SGPR, m0, or an inline constant
4969 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4970 Opcode != AMDGPU::V_WRITELANE_B32) {
4971 ErrInfo = "VOP* instruction violates constant bus restriction";
4972 return false;
4973 }
4974
4975 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4976 ErrInfo = "VOP3 instruction uses literal";
4977 return false;
4978 }
4979 }
4980
4981 // Special case for writelane - this can break the multiple constant bus rule,
4982 // but still can't use more than one SGPR register
4983 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4984 unsigned SGPRCount = 0;
4985 Register SGPRUsed;
4986
4987 for (int OpIdx : {Src0Idx, Src1Idx}) {
4988 if (OpIdx == -1)
4989 break;
4990
4991 const MachineOperand &MO = MI.getOperand(OpIdx);
4992
4993 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4994 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4995 if (MO.getReg() != SGPRUsed)
4996 ++SGPRCount;
4997 SGPRUsed = MO.getReg();
4998 }
4999 }
5000 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5001 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5002 return false;
5003 }
5004 }
5005 }
5006
5007 // Verify misc. restrictions on specific instructions.
5008 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5009 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5010 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5011 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5012 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5013 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5014 if (!compareMachineOp(Src0, Src1) &&
5015 !compareMachineOp(Src0, Src2)) {
5016 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5017 return false;
5018 }
5019 }
5020 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5021 SISrcMods::ABS) ||
5022 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5023 SISrcMods::ABS) ||
5024 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5025 SISrcMods::ABS)) {
5026 ErrInfo = "ABS not allowed in VOP3B instructions";
5027 return false;
5028 }
5029 }
5030
5031 if (isSOP2(MI) || isSOPC(MI)) {
5032 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5033 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5034
5035 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5036 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5037 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5038 !Src0.isIdenticalTo(Src1)) {
5039 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5040 return false;
5041 }
5042 }
5043
5044 if (isSOPK(MI)) {
5045 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5046 if (Desc.isBranch()) {
5047 if (!Op->isMBB()) {
5048 ErrInfo = "invalid branch target for SOPK instruction";
5049 return false;
5050 }
5051 } else {
5052 uint64_t Imm = Op->getImm();
5053 if (sopkIsZext(Opcode)) {
5054 if (!isUInt<16>(Imm)) {
5055 ErrInfo = "invalid immediate for SOPK instruction";
5056 return false;
5057 }
5058 } else {
5059 if (!isInt<16>(Imm)) {
5060 ErrInfo = "invalid immediate for SOPK instruction";
5061 return false;
5062 }
5063 }
5064 }
5065 }
5066
5067 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5068 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5069 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5070 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5071 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5072 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5073
5074 const unsigned StaticNumOps =
5075 Desc.getNumOperands() + Desc.implicit_uses().size();
5076 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5077
5078 // Allow additional implicit operands. This allows a fixup done by the post
5079 // RA scheduler where the main implicit operand is killed and implicit-defs
5080 // are added for sub-registers that remain live after this instruction.
5081 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5082 ErrInfo = "missing implicit register operands";
5083 return false;
5084 }
5085
5086 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5087 if (IsDst) {
5088 if (!Dst->isUse()) {
5089 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5090 return false;
5091 }
5092
5093 unsigned UseOpIdx;
5094 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5095 UseOpIdx != StaticNumOps + 1) {
5096 ErrInfo = "movrel implicit operands should be tied";
5097 return false;
5098 }
5099 }
5100
5101 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5102 const MachineOperand &ImpUse
5103 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5104 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5105 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5106 ErrInfo = "src0 should be subreg of implicit vector use";
5107 return false;
5108 }
5109 }
5110
5111 // Make sure we aren't losing exec uses in the td files. This mostly requires
5112 // being careful when using let Uses to try to add other use registers.
5113 if (shouldReadExec(MI)) {
5114 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5115 ErrInfo = "VALU instruction does not implicitly read exec mask";
5116 return false;
5117 }
5118 }
5119
5120 if (isSMRD(MI)) {
5121 if (MI.mayStore() &&
5122 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5123 // The register offset form of scalar stores may only use m0 as the
5124 // soffset register.
5125 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5126 if (Soff && Soff->getReg() != AMDGPU::M0) {
5127 ErrInfo = "scalar stores must use m0 as offset register";
5128 return false;
5129 }
5130 }
5131 }
5132
5133 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5134 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5135 if (Offset->getImm() != 0) {
5136 ErrInfo = "subtarget does not support offsets in flat instructions";
5137 return false;
5138 }
5139 }
5140
5141 if (isDS(MI) && !ST.hasGDS()) {
5142 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5143 if (GDSOp && GDSOp->getImm() != 0) {
5144 ErrInfo = "GDS is not supported on this subtarget";
5145 return false;
5146 }
5147 }
5148
5149 if (isImage(MI)) {
5150 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5151 if (DimOp) {
5152 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5153 AMDGPU::OpName::vaddr0);
5154 int RSrcOpName =
5155 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5156 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5157 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5158 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5159 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5160 const AMDGPU::MIMGDimInfo *Dim =
5161 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5162
5163 if (!Dim) {
5164 ErrInfo = "dim is out of range";
5165 return false;
5166 }
5167
5168 bool IsA16 = false;
5169 if (ST.hasR128A16()) {
5170 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5171 IsA16 = R128A16->getImm() != 0;
5172 } else if (ST.hasA16()) {
5173 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5174 IsA16 = A16->getImm() != 0;
5175 }
5176
5177 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5178
5179 unsigned AddrWords =
5180 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5181
5182 unsigned VAddrWords;
5183 if (IsNSA) {
5184 VAddrWords = RsrcIdx - VAddr0Idx;
5185 if (ST.hasPartialNSAEncoding() &&
5186 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5187 unsigned LastVAddrIdx = RsrcIdx - 1;
5188 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5189 }
5190 } else {
5191 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5192 if (AddrWords > 12)
5193 AddrWords = 16;
5194 }
5195
5196 if (VAddrWords != AddrWords) {
5197 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5198 << " but got " << VAddrWords << "\n");
5199 ErrInfo = "bad vaddr size";
5200 return false;
5201 }
5202 }
5203 }
5204
5205 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5206 if (DppCt) {
5207 using namespace AMDGPU::DPP;
5208
5209 unsigned DC = DppCt->getImm();
5210 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5211 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5212 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5213 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5214 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5215 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5216 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5217 ErrInfo = "Invalid dpp_ctrl value";
5218 return false;
5219 }
5220 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5221 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5222 ErrInfo = "Invalid dpp_ctrl value: "
5223 "wavefront shifts are not supported on GFX10+";
5224 return false;
5225 }
5226 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5227 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5228 ErrInfo = "Invalid dpp_ctrl value: "
5229 "broadcasts are not supported on GFX10+";
5230 return false;
5231 }
5232 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5233 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5234 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5235 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5236 !ST.hasGFX90AInsts()) {
5237 ErrInfo = "Invalid dpp_ctrl value: "
5238 "row_newbroadcast/row_share is not supported before "
5239 "GFX90A/GFX10";
5240 return false;
5241 }
5242 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5243 ErrInfo = "Invalid dpp_ctrl value: "
5244 "row_share and row_xmask are not supported before GFX10";
5245 return false;
5246 }
5247 }
5248
5249 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5251 ErrInfo = "Invalid dpp_ctrl value: "
5252 "DP ALU dpp only support row_newbcast";
5253 return false;
5254 }
5255 }
5256
5257 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5258 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5259 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5260 : AMDGPU::OpName::vdata;
5261 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5262 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5263 if (Data && !Data->isReg())
5264 Data = nullptr;
5265
5266 if (ST.hasGFX90AInsts()) {
5267 if (Dst && Data &&
5268 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5269 ErrInfo = "Invalid register class: "
5270 "vdata and vdst should be both VGPR or AGPR";
5271 return false;
5272 }
5273 if (Data && Data2 &&
5274 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5275 ErrInfo = "Invalid register class: "
5276 "both data operands should be VGPR or AGPR";
5277 return false;
5278 }
5279 } else {
5280 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5281 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5282 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5283 ErrInfo = "Invalid register class: "
5284 "agpr loads and stores not supported on this GPU";
5285 return false;
5286 }
5287 }
5288 }
5289
5290 if (ST.needsAlignedVGPRs()) {
5291 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5292 const MachineOperand *Op = getNamedOperand(MI, OpName);
5293 if (!Op)
5294 return true;
5295 Register Reg = Op->getReg();
5296 if (Reg.isPhysical())
5297 return !(RI.getHWRegIndex(Reg) & 1);
5298 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5299 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5300 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5301 };
5302
5303 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5304 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5305 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5306
5307 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5308 ErrInfo = "Subtarget requires even aligned vector registers "
5309 "for DS_GWS instructions";
5310 return false;
5311 }
5312 }
5313
5314 if (isMIMG(MI)) {
5315 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5316 ErrInfo = "Subtarget requires even aligned vector registers "
5317 "for vaddr operand of image instructions";
5318 return false;
5319 }
5320 }
5321 }
5322
5323 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5324 !ST.hasGFX90AInsts()) {
5325 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5326 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5327 ErrInfo = "Invalid register class: "
5328 "v_accvgpr_write with an SGPR is not supported on this GPU";
5329 return false;
5330 }
5331 }
5332
5333 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5334 const MachineOperand &SrcOp = MI.getOperand(1);
5335 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5336 ErrInfo = "pseudo expects only physical SGPRs";
5337 return false;
5338 }
5339 }
5340
5341 return true;
5342}
5343
5344// It is more readable to list mapped opcodes on the same line.
5345// clang-format off
5346
5347unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5348  switch (MI.getOpcode()) {
5349 default: return AMDGPU::INSTRUCTION_LIST_END;
5350 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5351 case AMDGPU::COPY: return AMDGPU::COPY;
5352 case AMDGPU::PHI: return AMDGPU::PHI;
5353 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5354 case AMDGPU::WQM: return AMDGPU::WQM;
5355 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5356 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5357 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5358 case AMDGPU::S_MOV_B32: {
5359 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5360 return MI.getOperand(1).isReg() ||
5361 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5362 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5363 }
5364 case AMDGPU::S_ADD_I32:
5365 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5366 case AMDGPU::S_ADDC_U32:
5367 return AMDGPU::V_ADDC_U32_e32;
5368 case AMDGPU::S_SUB_I32:
5369 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5370 // FIXME: These are not consistently handled, and selected when the carry is
5371 // used.
5372 case AMDGPU::S_ADD_U32:
5373 return AMDGPU::V_ADD_CO_U32_e32;
5374 case AMDGPU::S_SUB_U32:
5375 return AMDGPU::V_SUB_CO_U32_e32;
5376 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5377 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5378 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5379 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5380 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5381 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5382 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5383 case AMDGPU::S_XNOR_B32:
5384 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5385 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5386 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5387 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5388 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5389 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5390 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5391 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5392 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5393 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5394 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5395 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5396 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5397 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5398 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5399 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5400 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5401 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5402 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5403 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5404 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5405 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5406 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5407 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5408 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5409 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5410 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5411 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5412 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5413 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5414 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5415 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5416 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5417 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5418 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5419 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5420 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5421 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5422 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5423 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5424 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5425 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5426 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5427 case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5428 case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5429 case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
5430 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5431 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5432 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5433 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5434 case AMDGPU::S_CEIL_F16:
5435 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5436 : AMDGPU::V_CEIL_F16_fake16_e64;
5437 case AMDGPU::S_FLOOR_F16:
5438 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5439 : AMDGPU::V_FLOOR_F16_fake16_e64;
5440 case AMDGPU::S_TRUNC_F16:
5441 return AMDGPU::V_TRUNC_F16_fake16_e64;
5442 case AMDGPU::S_RNDNE_F16:
5443 return AMDGPU::V_RNDNE_F16_fake16_e64;
5444 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5445 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5446 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5447 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5448 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5449 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5450 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5451 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5452 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5453 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5454 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5455 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5456 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5457 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5458 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5459 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5460 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
5461 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5462 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5463 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5464 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5465 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5466 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5467 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5468 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5469 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5470 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5471 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5472 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5473 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5474 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5475 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5476 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5477 case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
5478 case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
5479 case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
5480 case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
5481 case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
5482 case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
5483 case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
5484 case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
5485 case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
5486 case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
5487 case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
5488 case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
5489 case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
5490 case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
5491 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5492 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5493 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5494 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5495 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5496 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5497 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5498 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5499 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5500 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5501 }
5502  llvm_unreachable(
5503      "Unexpected scalar opcode without corresponding vector one!");
5504}
5505
5506// clang-format on
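// For illustration, this table is what moveToVALUImpl consults when a scalar
// instruction must be rewritten to execute per lane. A minimal sketch of one
// mapping (MIR-style, modifier operands elided):
//
//   %2:sgpr_32 = S_AND_B32 %0:sgpr_32, %1:sgpr_32, implicit-def dead $scc
//     --->
//   %2:vgpr_32 = V_AND_B32_e64 %0, %1
//
// Opcodes with no VALU equivalent map to AMDGPU::INSTRUCTION_LIST_END and get
// special handling in the caller.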
5507
5511 const DebugLoc &DL, Register Reg,
5512 bool IsSCCLive,
5513 SlotIndexes *Indexes) const {
5514 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5515 const SIInstrInfo *TII = ST.getInstrInfo();
5516 bool IsWave32 = ST.isWave32();
5517 if (IsSCCLive) {
5518 // Insert two move instructions, one to save the original value of EXEC and
5519 // the other to turn on all bits in EXEC. This is required as we can't use
5520 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
5521 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5522 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5523 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5524 .addReg(Exec, RegState::Kill);
5525 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5526 if (Indexes) {
5527 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5528 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5529 }
5530 } else {
5531 const unsigned OrSaveExec =
5532 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5533 auto SaveExec =
5534 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5535 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5536 if (Indexes)
5537 Indexes->insertMachineInstrInMaps(*SaveExec);
5538 }
5539}
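// A minimal sketch of what the helper above emits (wave64 opcodes shown;
// wave32 uses the 32-bit forms and exec_lo). Register names are illustrative:
//
//   ; IsSCCLive == true: SCC must survive, so avoid S_OR_SAVEEXEC.
//   %saved:sreg_64 = S_MOV_B64 killed $exec
//   $exec          = S_MOV_B64 -1
//
//   ; IsSCCLive == false: a single instruction suffices; SCC is dead anyway.
//   %saved:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def dead $scc
//
// The matching restore below simply moves the saved value back into exec.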
5540
5543 const DebugLoc &DL, Register Reg,
5544 SlotIndexes *Indexes) const {
5545 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5546 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5547 auto ExecRestoreMI =
5548 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5549 if (Indexes)
5550 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5551}
5552
5553static const TargetRegisterClass *
5555 const MachineRegisterInfo &MRI,
5556 const MCInstrDesc &TID, unsigned RCID,
5557 bool IsAllocatable) {
5558 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5559 (((TID.mayLoad() || TID.mayStore()) &&
5560 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5562 switch (RCID) {
5563 case AMDGPU::AV_32RegClassID:
5564 RCID = AMDGPU::VGPR_32RegClassID;
5565 break;
5566 case AMDGPU::AV_64RegClassID:
5567 RCID = AMDGPU::VReg_64RegClassID;
5568 break;
5569 case AMDGPU::AV_96RegClassID:
5570 RCID = AMDGPU::VReg_96RegClassID;
5571 break;
5572 case AMDGPU::AV_128RegClassID:
5573 RCID = AMDGPU::VReg_128RegClassID;
5574 break;
5575 case AMDGPU::AV_160RegClassID:
5576 RCID = AMDGPU::VReg_160RegClassID;
5577 break;
5578 case AMDGPU::AV_512RegClassID:
5579 RCID = AMDGPU::VReg_512RegClassID;
5580 break;
5581 default:
5582 break;
5583 }
5584 }
5585
5586 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5587}
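// For illustration, one case the switch above covers: on a subtarget without
// gfx90a instructions, a (non-spill) load or store operand described with a
// combined VGPR/AGPR class is narrowed to the VGPR-only class, e.g.
//
//   AMDGPU::AV_64RegClassID  ->  AMDGPU::VReg_64RegClassID
//
// so the register allocator cannot hand the memory instruction an AGPR it
// cannot use.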
5588
5590 unsigned OpNum, const TargetRegisterInfo *TRI,
5591 const MachineFunction &MF)
5592 const {
5593 if (OpNum >= TID.getNumOperands())
5594 return nullptr;
5595 auto RegClass = TID.operands()[OpNum].RegClass;
5596 bool IsAllocatable = false;
5598    // vdst and vdata should both be VGPR or AGPR; the same holds for DS
5599    // instructions with two data operands. Request a register class
5600    // constrained to VGPR only if both operands are present, as Machine Copy
5601    // Propagation (and possibly other passes) cannot check this constraint.
5602 //
5603 // The check is limited to FLAT and DS because atomics in non-flat encoding
5604 // have their vdst and vdata tied to be the same register.
5605 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5606 AMDGPU::OpName::vdst);
5607 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5608 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5609 : AMDGPU::OpName::vdata);
5610 if (DataIdx != -1) {
5611 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5612 TID.Opcode, AMDGPU::OpName::data1);
5613 }
5614 }
5615 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5616 IsAllocatable);
5617}
5618
5620 unsigned OpNo) const {
5621 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5622 const MCInstrDesc &Desc = get(MI.getOpcode());
5623 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5624 Desc.operands()[OpNo].RegClass == -1) {
5625 Register Reg = MI.getOperand(OpNo).getReg();
5626
5627 if (Reg.isVirtual())
5628 return MRI.getRegClass(Reg);
5629 return RI.getPhysRegBaseClass(Reg);
5630 }
5631
5632 unsigned RCID = Desc.operands()[OpNo].RegClass;
5633 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5634}
5635
5638 MachineBasicBlock *MBB = MI.getParent();
5639 MachineOperand &MO = MI.getOperand(OpIdx);
5641 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5642 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5643 unsigned Size = RI.getRegSizeInBits(*RC);
5644 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
5645 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
5646 : AMDGPU::V_MOV_B32_e32;
5647 if (MO.isReg())
5648 Opcode = AMDGPU::COPY;
5649 else if (RI.isSGPRClass(RC))
5650 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5651
5652 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5653 Register Reg = MRI.createVirtualRegister(VRC);
5655 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5656 MO.ChangeToRegister(Reg, false);
5657}
5658
5661 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5662 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5663 MachineBasicBlock *MBB = MI->getParent();
5664 DebugLoc DL = MI->getDebugLoc();
5665 Register SubReg = MRI.createVirtualRegister(SubRC);
5666
5667 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5668 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5669 .addReg(SuperReg.getReg(), 0, NewSubIdx);
5670 return SubReg;
5671}
5672
5675 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5676 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5677 if (Op.isImm()) {
5678 if (SubIdx == AMDGPU::sub0)
5679 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5680 if (SubIdx == AMDGPU::sub1)
5681 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5682
5683 llvm_unreachable("Unhandled register index for immediate");
5684 }
5685
5686 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5687 SubIdx, SubRC);
5688 return MachineOperand::CreateReg(SubReg, false);
5689}
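// A small worked example for the immediate path above: splitting the 64-bit
// immediate 0x1111111122222222 gives
//
//   sub0 -> 0x22222222   (low 32 bits)
//   sub1 -> 0x11111111   (high 32 bits)
//
// Register operands instead go through buildExtractSubReg and come back as a
// COPY of the requested subregister.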
5690
5691// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5692void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5693 assert(Inst.getNumExplicitOperands() == 3);
5694 MachineOperand Op1 = Inst.getOperand(1);
5695 Inst.removeOperand(1);
5696 Inst.addOperand(Op1);
5697}
5698
5700 const MCOperandInfo &OpInfo,
5701 const MachineOperand &MO) const {
5702 if (!MO.isReg())
5703 return false;
5704
5705 Register Reg = MO.getReg();
5706
5707 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5708 if (Reg.isPhysical())
5709 return DRC->contains(Reg);
5710
5711 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5712
5713 if (MO.getSubReg()) {
5714 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5715 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5716 if (!SuperRC)
5717 return false;
5718
5719 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5720 if (!DRC)
5721 return false;
5722 }
5723 return RC->hasSuperClassEq(DRC);
5724}
5725
5727 const MCOperandInfo &OpInfo,
5728 const MachineOperand &MO) const {
5729 if (MO.isReg())
5730 return isLegalRegOperand(MRI, OpInfo, MO);
5731
5732 // Handle non-register types that are treated like immediates.
5733 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5734 return true;
5735}
5736
5737bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5738 const MachineOperand *MO) const {
5739 const MachineFunction &MF = *MI.getParent()->getParent();
5740 const MachineRegisterInfo &MRI = MF.getRegInfo();
5741 const MCInstrDesc &InstDesc = MI.getDesc();
5742 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5743 const TargetRegisterClass *DefinedRC =
5744 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5745 if (!MO)
5746 MO = &MI.getOperand(OpIdx);
5747
5748 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5749 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5750 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5751 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5752 return false;
5753
5755 if (MO->isReg())
5756 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5757
5758 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5759 if (i == OpIdx)
5760 continue;
5761 const MachineOperand &Op = MI.getOperand(i);
5762 if (Op.isReg()) {
5763 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5764 if (!SGPRsUsed.count(SGPR) &&
5765 // FIXME: This can access off the end of the operands() array.
5766 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5767 if (--ConstantBusLimit <= 0)
5768 return false;
5769 SGPRsUsed.insert(SGPR);
5770 }
5771 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5772 !isInlineConstant(Op, InstDesc.operands()[i])) {
5773 if (!LiteralLimit--)
5774 return false;
5775 if (--ConstantBusLimit <= 0)
5776 return false;
5777 }
5778 }
5779 }
5780
5781 if (MO->isReg()) {
5782 if (!DefinedRC)
5783 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5784 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5785 return false;
5786 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5787 if (IsAGPR && !ST.hasMAIInsts())
5788 return false;
5789 unsigned Opc = MI.getOpcode();
5790 if (IsAGPR &&
5791 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5792 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5793 return false;
5795    // Atomics should have vdst and vdata both be either VGPR or AGPR.
5795 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5796 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5797 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5798 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5799 MI.getOperand(DataIdx).isReg() &&
5800 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5801 return false;
5802 if ((int)OpIdx == DataIdx) {
5803 if (VDstIdx != -1 &&
5804 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5805 return false;
5806 // DS instructions with 2 src operands also must have tied RC.
5807 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5808 AMDGPU::OpName::data1);
5809 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5810 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5811 return false;
5812 }
5813 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5814 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5815 RI.isSGPRReg(MRI, MO->getReg()))
5816 return false;
5817 return true;
5818 }
5819
5820 if (MO->isImm()) {
5821 uint64_t Imm = MO->getImm();
5822 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5823 bool Is64BitOp = Is64BitFPOp ||
5827 if (Is64BitOp &&
5829 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5830 return false;
5831
5832 // FIXME: We can use sign extended 64-bit literals, but only for signed
5833 // operands. At the moment we do not know if an operand is signed.
5834      // Such an operand will be encoded as its low 32 bits and then either
5835 // correctly sign extended or incorrectly zero extended by HW.
5836 if (!Is64BitFPOp && (int32_t)Imm < 0)
5837 return false;
5838 }
5839 }
5840
5841 // Handle non-register types that are treated like immediates.
5842 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5843
5844 if (!DefinedRC) {
5845 // This operand expects an immediate.
5846 return true;
5847 }
5848
5849 return isImmOperandLegal(MI, OpIdx, *MO);
5850}
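// For illustration, a sketch of the constant-bus accounting above using
// pre-GFX10 limits (one bus slot, no VOP3 literal); GFX10+ raises the bus
// limit to two and permits a literal. Modifier operands are elided:
//
//   %d:vgpr_32 = V_ADD_F32_e64 %a:sgpr_32, %b:sgpr_32  ; two distinct SGPRs:
//                                                      ; exceeds the bus limit
//   %d:vgpr_32 = V_ADD_F32_e64 %a:sgpr_32, %a:sgpr_32  ; same SGPR counted
//                                                      ; once: legal
//   %d:vgpr_32 = V_ADD_F32_e64 %a:sgpr_32, 1.0         ; inline constant costs
//                                                      ; nothing: legal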
5851
5853 MachineInstr &MI) const {
5854 unsigned Opc = MI.getOpcode();
5855 const MCInstrDesc &InstrDesc = get(Opc);
5856
5857 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5858 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5859
5860 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5861 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5862
5863  // If there is an implicit SGPR use, such as the VCC use for
5864  // v_addc_u32/v_subb_u32, we may only have one constant bus use before GFX10.
5865 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5866 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5867 RI.isSGPRReg(MRI, Src0.getReg()))
5868 legalizeOpWithMove(MI, Src0Idx);
5869
5870 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5871 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5872 // src0/src1 with V_READFIRSTLANE.
5873 if (Opc == AMDGPU::V_WRITELANE_B32) {
5874 const DebugLoc &DL = MI.getDebugLoc();
5875 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5876 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5877 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5878 .add(Src0);
5879 Src0.ChangeToRegister(Reg, false);
5880 }
5881 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5882 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5883 const DebugLoc &DL = MI.getDebugLoc();
5884 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5885 .add(Src1);
5886 Src1.ChangeToRegister(Reg, false);
5887 }
5888 return;
5889 }
5890
5891 // No VOP2 instructions support AGPRs.
5892 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5893 legalizeOpWithMove(MI, Src0Idx);
5894
5895 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5896 legalizeOpWithMove(MI, Src1Idx);
5897
5898 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5899 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5900 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5901 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5902 legalizeOpWithMove(MI, Src2Idx);
5903 }
5904
5905  // VOP2 src0 operands support all operand types, so we don't need to check
5906  // their legality. If src1 is already legal, we don't need to do anything.
5907 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5908 return;
5909
5910 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5911 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5912 // select is uniform.
5913 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5914 RI.isVGPR(MRI, Src1.getReg())) {
5915 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5916 const DebugLoc &DL = MI.getDebugLoc();
5917 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5918 .add(Src1);
5919 Src1.ChangeToRegister(Reg, false);
5920 return;
5921 }
5922
5923 // We do not use commuteInstruction here because it is too aggressive and will
5924 // commute if it is possible. We only want to commute here if it improves
5925 // legality. This can be called a fairly large number of times so don't waste
5926 // compile time pointlessly swapping and checking legality again.
5927 if (HasImplicitSGPR || !MI.isCommutable()) {
5928 legalizeOpWithMove(MI, Src1Idx);
5929 return;
5930 }
5931
5932 // If src0 can be used as src1, commuting will make the operands legal.
5933 // Otherwise we have to give up and insert a move.
5934 //
5935 // TODO: Other immediate-like operand kinds could be commuted if there was a
5936 // MachineOperand::ChangeTo* for them.
5937 if ((!Src1.isImm() && !Src1.isReg()) ||
5938 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
5939 legalizeOpWithMove(MI, Src1Idx);
5940 return;
5941 }
5942
5943 int CommutedOpc = commuteOpcode(MI);
5944 if (CommutedOpc == -1) {
5945 legalizeOpWithMove(MI, Src1Idx);
5946 return;
5947 }
5948
5949 MI.setDesc(get(CommutedOpc));
5950
5951 Register Src0Reg = Src0.getReg();
5952 unsigned Src0SubReg = Src0.getSubReg();
5953 bool Src0Kill = Src0.isKill();
5954
5955 if (Src1.isImm())
5956 Src0.ChangeToImmediate(Src1.getImm());
5957 else if (Src1.isReg()) {
5958 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5959 Src0.setSubReg(Src1.getSubReg());
5960 } else
5961 llvm_unreachable("Should only have register or immediate operands");
5962
5963 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5964 Src1.setSubReg(Src0SubReg);
5966}
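// A minimal sketch of the commute path above (modifier operands elided): with
// an SGPR in src1 but a VGPR in src0, swapping the sources is enough, because
// VOP2 src0 may read the constant bus while src1 must be a VGPR:
//
//   %d = V_ADD_U32_e32 %vgpr, %sgpr   ; src1 illegal
//     --->
//   %d = V_ADD_U32_e32 %sgpr, %vgpr   ; legal, no extra instructions
//
// When commuting cannot help (e.g. an implicit VCC read already occupies the
// constant bus), src1 is instead materialized into a VGPR with a move.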
5967
5968// Legalize VOP3 operands. All operand types are supported for any operand,
5969// but only one literal constant may be used, and only starting from GFX10.
5971 MachineInstr &MI) const {
5972 unsigned Opc = MI.getOpcode();
5973
5974 int VOP3Idx[3] = {
5975 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5976 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5977 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5978 };
5979
5980 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5981 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5982 // src1 and src2 must be scalar
5983 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5984 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5985 const DebugLoc &DL = MI.getDebugLoc();
5986 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5987 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5988 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5989 .add(Src1);
5990 Src1.ChangeToRegister(Reg, false);
5991 }
5992 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5993 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5994 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5995 .add(Src2);
5996 Src2.ChangeToRegister(Reg, false);
5997 }
5998 }
5999
6000 // Find the one SGPR operand we are allowed to use.
6001 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6002 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6003 SmallDenseSet<unsigned> SGPRsUsed;
6004 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6005 if (SGPRReg) {
6006 SGPRsUsed.insert(SGPRReg);
6007 --ConstantBusLimit;
6008 }
6009
6010 for (int Idx : VOP3Idx) {
6011 if (Idx == -1)
6012 break;
6013 MachineOperand &MO = MI.getOperand(Idx);
6014
6015 if (!MO.isReg()) {
6016 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6017 continue;
6018
6019 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6020 --LiteralLimit;
6021 --ConstantBusLimit;
6022 continue;
6023 }
6024
6025 --LiteralLimit;
6026 --ConstantBusLimit;
6028 continue;
6029 }
6030
6031 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6032 !isOperandLegal(MI, Idx, &MO)) {
6034 continue;
6035 }
6036
6037 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6038 continue; // VGPRs are legal
6039
6040 // We can use one SGPR in each VOP3 instruction prior to GFX10
6041 // and two starting from GFX10.
6042 if (SGPRsUsed.count(MO.getReg()))
6043 continue;
6044 if (ConstantBusLimit > 0) {
6045 SGPRsUsed.insert(MO.getReg());
6046 --ConstantBusLimit;
6047 continue;
6048 }
6049
6050 // If we make it this far, then the operand is not legal and we must
6051 // legalize it.
6053 }
6054
6055 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6056 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6057 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6058 legalizeOpWithMove(MI, VOP3Idx[2]);
6059}
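// For illustration, the V_PERMLANE16 special case above rewrites vector lane
// arguments into scalars, since src1 and src2 of the permlane instructions
// must be scalar operands (a sketch; register names are arbitrary and the
// immediate operands are elided):
//
//   %r = V_PERMLANE16_B32_e64 %data, %v1, %v2
//     --->
//   %s1 = V_READFIRSTLANE_B32 %v1
//   %s2 = V_READFIRSTLANE_B32 %v2
//   %r  = V_PERMLANE16_B32_e64 %data, %s1, %s2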
6060
6062 MachineRegisterInfo &MRI) const {
6063 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6064 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6065 Register DstReg = MRI.createVirtualRegister(SRC);
6066 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6067
6068 if (RI.hasAGPRs(VRC)) {
6069 VRC = RI.getEquivalentVGPRClass(VRC);
6070 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6071 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6072 get(TargetOpcode::COPY), NewSrcReg)
6073 .addReg(SrcReg);
6074 SrcReg = NewSrcReg;
6075 }
6076
6077 if (SubRegs == 1) {
6078 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6079 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6080 .addReg(SrcReg);
6081 return DstReg;
6082 }
6083
6085 for (unsigned i = 0; i < SubRegs; ++i) {
6086 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6087 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6088 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6089 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6090 SRegs.push_back(SGPR);
6091 }
6092
6094 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6095 get(AMDGPU::REG_SEQUENCE), DstReg);
6096 for (unsigned i = 0; i < SubRegs; ++i) {
6097 MIB.addReg(SRegs[i]);
6098 MIB.addImm(RI.getSubRegFromChannel(i));
6099 }
6100 return DstReg;
6101}
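// A minimal sketch of the expansion above for a 64-bit source (two 32-bit
// pieces; register names are illustrative):
//
//   %lo:sgpr_32 = V_READFIRSTLANE_B32 %vsrc.sub0
//   %hi:sgpr_32 = V_READFIRSTLANE_B32 %vsrc.sub1
//   %ssrc:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
//
// A single-dword source skips the REG_SEQUENCE and uses one readfirstlane.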
6102
6104 MachineInstr &MI) const {
6105
6106  // If the pointer is stored in VGPRs, then we need to move it to
6107  // SGPRs using v_readfirstlane. This is safe because we only select
6108  // loads with uniform pointers to SMRD instructions, so we know the
6109  // pointer value is uniform.
6110 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6111 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6112 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6113 SBase->setReg(SGPR);
6114 }
6115 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6116 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6117 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6118 SOff->setReg(SGPR);
6119 }
6120}
6121
6123 unsigned Opc = Inst.getOpcode();
6124 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6125 if (OldSAddrIdx < 0)
6126 return false;
6127
6129
6130 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6131 if (NewOpc < 0)
6133 if (NewOpc < 0)
6134 return false;
6135
6137 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6138 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6139 return false;
6140
6141 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6142 if (NewVAddrIdx < 0)
6143 return false;
6144
6145 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6146
6147  // Check vaddr; it must be zero or absent.
6148 MachineInstr *VAddrDef = nullptr;
6149 if (OldVAddrIdx >= 0) {
6150 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6151 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6152 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6153 !VAddrDef->getOperand(1).isImm() ||
6154 VAddrDef->getOperand(1).getImm() != 0)
6155 return false;
6156 }
6157
6158 const MCInstrDesc &NewDesc = get(NewOpc);
6159 Inst.setDesc(NewDesc);
6160
6161 // Callers expect iterator to be valid after this call, so modify the
6162 // instruction in place.
6163 if (OldVAddrIdx == NewVAddrIdx) {
6164 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6165 // Clear use list from the old vaddr holding a zero register.
6166 MRI.removeRegOperandFromUseList(&NewVAddr);
6167 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6168 Inst.removeOperand(OldSAddrIdx);
6169    // Update the use list with the pointer we have just moved from the saddr
6170    // to the vaddr position. Otherwise the new vaddr will be missing from the
6170    // use list.
6171 MRI.removeRegOperandFromUseList(&NewVAddr);
6172 MRI.addRegOperandToUseList(&NewVAddr);
6173 } else {
6174 assert(OldSAddrIdx == NewVAddrIdx);
6175
6176 if (OldVAddrIdx >= 0) {
6177 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6178 AMDGPU::OpName::vdst_in);
6179
6180      // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6181 // it asserts. Untie the operands for now and retie them afterwards.
6182 if (NewVDstIn != -1) {
6183 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6184 Inst.untieRegOperand(OldVDstIn);
6185 }
6186
6187 Inst.removeOperand(OldVAddrIdx);
6188
6189 if (NewVDstIn != -1) {
6190 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6191 Inst.tieOperands(NewVDst, NewVDstIn);
6192 }
6193 }
6194 }
6195
6196 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6197 VAddrDef->eraseFromParent();
6198
6199 return true;
6200}
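// For illustration, a sketch of the rewrite above (operand lists abbreviated,
// virtual register names made up): a SADDR-form global access whose "scalar"
// address actually lives in VGPRs, and whose vaddr is a zero, is folded back
// to the plain VADDR form:
//
//   %zero:vgpr_32 = V_MOV_B32_e32 0
//   %d = GLOBAL_LOAD_DWORD_SADDR %base:vreg_64, %zero, 0, 0
//     --->
//   %d = GLOBAL_LOAD_DWORD %base:vreg_64, 0, 0
//
// The now-unused zero V_MOV is erased afterwards if nothing else reads it.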
6201
6202// FIXME: Remove this when SelectionDAG is obsoleted.
6204 MachineInstr &MI) const {
6206 return;
6207
6208  // Fix up SGPR operands held in VGPRs. We only select these when the DAG
6209  // divergence analysis thinks they are uniform, so a readfirstlane is valid.
6210 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6211 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6212 return;
6213
6215 return;
6216
6217 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6218 SAddr->setReg(ToSGPR);
6219}
6220
6223 const TargetRegisterClass *DstRC,
6226 const DebugLoc &DL) const {
6227 Register OpReg = Op.getReg();
6228 unsigned OpSubReg = Op.getSubReg();
6229
6230 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6231 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6232
6233 // Check if operand is already the correct register class.
6234 if (DstRC == OpRC)
6235 return;
6236
6237 Register DstReg = MRI.createVirtualRegister(DstRC);
6238 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
6239
6240 Op.setReg(DstReg);
6241 Op.setSubReg(0);
6242
6243 MachineInstr *Def = MRI.getVRegDef(OpReg);
6244 if (!Def)
6245 return;
6246
6247 // Try to eliminate the copy if it is copying an immediate value.
6248 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6249 foldImmediate(*Copy, *Def, OpReg, &MRI);
6250
6251 bool ImpDef = Def->isImplicitDef();
6252 while (!ImpDef && Def && Def->isCopy()) {
6253 if (Def->getOperand(1).getReg().isPhysical())
6254 break;
6255 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6256 ImpDef = Def && Def->isImplicitDef();
6257 }
6258 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6259 !ImpDef)
6260 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6261}
6262
6263// Emit the actual waterfall loop, executing the wrapped instruction for each
6264// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6265// iteration, in the worst case we execute 64 (once per lane).
6268 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
6269 ArrayRef<MachineOperand *> ScalarOps) {
6270 MachineFunction &MF = *OrigBB.getParent();
6271 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6272 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6273 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6274 unsigned SaveExecOpc =
6275 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6276 unsigned XorTermOpc =
6277 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6278 unsigned AndOpc =
6279 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6280 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6281
6283
6284 SmallVector<Register, 8> ReadlanePieces;
6285 Register CondReg;
6286
6287 for (MachineOperand *ScalarOp : ScalarOps) {
6288 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6289 unsigned NumSubRegs = RegSize / 32;
6290 Register VScalarOp = ScalarOp->getReg();
6291
6292 if (NumSubRegs == 1) {
6293 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6294
6295 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6296 .addReg(VScalarOp);
6297
6298 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6299
6300 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6301 .addReg(CurReg)
6302 .addReg(VScalarOp);
6303
6304 // Combine the comparison results with AND.
6305 if (!CondReg) // First.
6306 CondReg = NewCondReg;
6307 else { // If not the first, we create an AND.
6308 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6309 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6310 .addReg(CondReg)
6311 .addReg(NewCondReg);
6312 CondReg = AndReg;
6313 }
6314
6315 // Update ScalarOp operand to use the SGPR ScalarOp.
6316 ScalarOp->setReg(CurReg);
6317 ScalarOp->setIsKill();
6318 } else {
6319 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6320 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6321 "Unhandled register size");
6322
6323 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6324 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6325 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6326
6327 // Read the next variant <- also loop target.
6328 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6329 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6330
6331 // Read the next variant <- also loop target.
6332 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6333 .addReg(VScalarOp, VScalarOpUndef,
6334 TRI->getSubRegFromChannel(Idx + 1));
6335
6336 ReadlanePieces.push_back(CurRegLo);
6337 ReadlanePieces.push_back(CurRegHi);
6338
6339 // Comparison is to be done as 64-bit.
6340 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6341 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6342 .addReg(CurRegLo)
6343 .addImm(AMDGPU::sub0)
6344 .addReg(CurRegHi)
6345 .addImm(AMDGPU::sub1);
6346
6347 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6348 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6349 NewCondReg)
6350 .addReg(CurReg);
6351 if (NumSubRegs <= 2)
6352 Cmp.addReg(VScalarOp);
6353 else
6354 Cmp.addReg(VScalarOp, VScalarOpUndef,
6355 TRI->getSubRegFromChannel(Idx, 2));
6356
6357 // Combine the comparison results with AND.
6358 if (!CondReg) // First.
6359 CondReg = NewCondReg;
6360 else { // If not the first, we create an AND.
6361 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6362 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6363 .addReg(CondReg)
6364 .addReg(NewCondReg);
6365 CondReg = AndReg;
6366 }
6367 } // End for loop.
6368
6369 auto SScalarOpRC =
6370 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6371 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6372
6373 // Build scalar ScalarOp.
6374 auto Merge =
6375 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6376 unsigned Channel = 0;
6377 for (Register Piece : ReadlanePieces) {
6378 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6379 }
6380
6381 // Update ScalarOp operand to use the SGPR ScalarOp.
6382 ScalarOp->setReg(SScalarOp);
6383 ScalarOp->setIsKill();
6384 }
6385 }
6386
6387 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6388 MRI.setSimpleHint(SaveExec, CondReg);
6389
6390 // Update EXEC to matching lanes, saving original to SaveExec.
6391 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6392 .addReg(CondReg, RegState::Kill);
6393
6394 // The original instruction is here; we insert the terminators after it.
6395 I = BodyBB.end();
6396
6397 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6398 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6399 .addReg(Exec)
6400 .addReg(SaveExec);
6401
6402 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6403}
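// A minimal sketch of one loop iteration emitted above for a single 32-bit
// scalar operand (wave64 opcodes shown):
//
// LoopBB:
//   %cur:sgpr_32 = V_READFIRSTLANE_B32 %vop       ; pick a candidate value
//   %cond = V_CMP_EQ_U32_e64 %cur, %vop           ; lanes holding that value
//   %save = S_AND_SAVEEXEC_B64 killed %cond       ; run only the matching lanes
// BodyBB:
//   ... the rewritten instruction, now using %cur ...
//   $exec = S_XOR_B64_term $exec, %save           ; drop the finished lanes
//   SI_WATERFALL_LOOP %LoopBB                     ; repeat while lanes remain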
6404
6405// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6406// with SGPRs by iterating over all unique values across all lanes.
6407// Returns the loop basic block that now contains \p MI.
6408static MachineBasicBlock *
6412 MachineBasicBlock::iterator Begin = nullptr,
6413 MachineBasicBlock::iterator End = nullptr) {
6414 MachineBasicBlock &MBB = *MI.getParent();
6415 MachineFunction &MF = *MBB.getParent();
6416 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6417 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6419 if (!Begin.isValid())
6420 Begin = &MI;
6421 if (!End.isValid()) {
6422 End = &MI;
6423 ++End;
6424 }
6425 const DebugLoc &DL = MI.getDebugLoc();
6426 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6427 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6428 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6429
6430 // Save SCC. Waterfall Loop may overwrite SCC.
6431 Register SaveSCCReg;
6432
6433 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6434  // rather than doing an unbounded scan everywhere.
6435 bool SCCNotDead =
6436 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6437 std::numeric_limits<unsigned>::max()) !=
6439 if (SCCNotDead) {
6440 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6441 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6442 .addImm(1)
6443 .addImm(0);
6444 }
6445
6446 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6447
6448 // Save the EXEC mask
6449 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6450
6451 // Killed uses in the instruction we are waterfalling around will be
6452 // incorrect due to the added control-flow.
6454 ++AfterMI;
6455 for (auto I = Begin; I != AfterMI; I++) {
6456 for (auto &MO : I->all_uses())
6457 MRI.clearKillFlags(MO.getReg());
6458 }
6459
6460 // To insert the loop we need to split the block. Move everything after this
6461 // point to a new block, and insert a new empty block between the two.
6464 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6466 ++MBBI;
6467
6468 MF.insert(MBBI, LoopBB);
6469 MF.insert(MBBI, BodyBB);
6470 MF.insert(MBBI, RemainderBB);
6471
6472 LoopBB->addSuccessor(BodyBB);
6473 BodyBB->addSuccessor(LoopBB);
6474 BodyBB->addSuccessor(RemainderBB);
6475
6476  // Move the instructions from Begin through MI into BodyBB, and the
6477  // remainder of the block into RemainderBB.
6478 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6479 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6480 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6481
6482 MBB.addSuccessor(LoopBB);
6483
6484 // Update dominators. We know that MBB immediately dominates LoopBB, that
6485 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6486 // RemainderBB. RemainderBB immediately dominates all of the successors
6487 // transferred to it from MBB that MBB used to properly dominate.
6488 if (MDT) {
6489 MDT->addNewBlock(LoopBB, &MBB);
6490 MDT->addNewBlock(BodyBB, LoopBB);
6491 MDT->addNewBlock(RemainderBB, BodyBB);
6492 for (auto &Succ : RemainderBB->successors()) {
6493 if (MDT->properlyDominates(&MBB, Succ)) {
6494 MDT->changeImmediateDominator(Succ, RemainderBB);
6495 }
6496 }
6497 }
6498
6499 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
6500
6501 MachineBasicBlock::iterator First = RemainderBB->begin();
6502 // Restore SCC
6503 if (SCCNotDead) {
6504 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6505 .addReg(SaveSCCReg, RegState::Kill)
6506 .addImm(0);
6507 }
6508
6509 // Restore the EXEC mask
6510 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6511 return BodyBB;
6512}
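// The resulting control flow of the transformation above, as a sketch:
//
//   MBB ---> LoopBB <---+
//              |        |
//              v        |
//            BodyBB ----+        (loops while unhandled lanes remain)
//              |
//              v
//          RemainderBB           (EXEC, and SCC if it was live, restored here)
//
// The original instruction now lives in BodyBB, which is what the function
// returns.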
6513
6514// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6515static std::tuple<unsigned, unsigned>
6517 MachineBasicBlock &MBB = *MI.getParent();
6518 MachineFunction &MF = *MBB.getParent();
6520
6521 // Extract the ptr from the resource descriptor.
6522 unsigned RsrcPtr =
6523 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6524 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6525
6526 // Create an empty resource descriptor
6527 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6528 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6529 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6530 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6531 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6532
6533 // Zero64 = 0
6534 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6535 .addImm(0);
6536
6537 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6538 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6539 .addImm(RsrcDataFormat & 0xFFFFFFFF);
6540
6541 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6542 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6543 .addImm(RsrcDataFormat >> 32);
6544
6545 // NewSRsrc = {Zero64, SRsrcFormat}
6546 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6547 .addReg(Zero64)
6548 .addImm(AMDGPU::sub0_sub1)
6549 .addReg(SRsrcFormatLo)
6550 .addImm(AMDGPU::sub2)
6551 .addReg(SRsrcFormatHi)
6552 .addImm(AMDGPU::sub3);
6553
6554 return std::tuple(RsrcPtr, NewSRsrc);
6555}
6556
6559 MachineDominatorTree *MDT) const {
6560 MachineFunction &MF = *MI.getParent()->getParent();
6562 MachineBasicBlock *CreatedBB = nullptr;
6563
6564 // Legalize VOP2
6565 if (isVOP2(MI) || isVOPC(MI)) {
6567 return CreatedBB;
6568 }
6569
6570 // Legalize VOP3
6571 if (isVOP3(MI)) {
6573 return CreatedBB;
6574 }
6575
6576 // Legalize SMRD
6577 if (isSMRD(MI)) {
6579 return CreatedBB;
6580 }
6581
6582 // Legalize FLAT
6583 if (isFLAT(MI)) {
6585 return CreatedBB;
6586 }
6587
6588 // Legalize REG_SEQUENCE and PHI
6589  // The register class of the operands must be the same type as the register
6590  // class of the output.
6591 if (MI.getOpcode() == AMDGPU::PHI) {
6592 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6593 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6594 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6595 continue;
6596 const TargetRegisterClass *OpRC =
6597 MRI.getRegClass(MI.getOperand(i).getReg());
6598 if (RI.hasVectorRegisters(OpRC)) {
6599 VRC = OpRC;
6600 } else {
6601 SRC = OpRC;
6602 }
6603 }
6604
6605    // If any of the operands are VGPR registers, then they all must be VGPRs;
6606    // otherwise we will create illegal VGPR->SGPR copies when legalizing
6607    // them.
6608 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6609 if (!VRC) {
6610 assert(SRC);
6611 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6612 VRC = &AMDGPU::VReg_1RegClass;
6613 } else
6614 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6615 ? RI.getEquivalentAGPRClass(SRC)
6616 : RI.getEquivalentVGPRClass(SRC);
6617 } else {
6618 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6619 ? RI.getEquivalentAGPRClass(VRC)
6620 : RI.getEquivalentVGPRClass(VRC);
6621 }
6622 RC = VRC;
6623 } else {
6624 RC = SRC;
6625 }
6626
6627 // Update all the operands so they have the same type.
6628 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6629 MachineOperand &Op = MI.getOperand(I);
6630 if (!Op.isReg() || !Op.getReg().isVirtual())
6631 continue;
6632
6633 // MI is a PHI instruction.
6634 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6636
6637 // Avoid creating no-op copies with the same src and dst reg class. These
6638 // confuse some of the machine passes.
6639 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6640 }
6641 }
6642
6643 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6644 // VGPR dest type and SGPR sources, insert copies so all operands are
6645 // VGPRs. This seems to help operand folding / the register coalescer.
6646 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6647 MachineBasicBlock *MBB = MI.getParent();
6648 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6649 if (RI.hasVGPRs(DstRC)) {
6650 // Update all the operands so they are VGPR register classes. These may
6651 // not be the same register class because REG_SEQUENCE supports mixing
6652 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6653 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6654 MachineOperand &Op = MI.getOperand(I);
6655 if (!Op.isReg() || !Op.getReg().isVirtual())
6656 continue;
6657
6658 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6659 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6660 if (VRC == OpRC)
6661 continue;
6662
6663 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6664 Op.setIsKill();
6665 }
6666 }
6667
6668 return CreatedBB;
6669 }
6670
6671 // Legalize INSERT_SUBREG
6672 // src0 must have the same register class as dst
6673 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6674 Register Dst = MI.getOperand(0).getReg();
6675 Register Src0 = MI.getOperand(1).getReg();
6676 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6677 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6678 if (DstRC != Src0RC) {
6679 MachineBasicBlock *MBB = MI.getParent();
6680 MachineOperand &Op = MI.getOperand(1);
6681 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6682 }
6683 return CreatedBB;
6684 }
6685
6686 // Legalize SI_INIT_M0
6687 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6688 MachineOperand &Src = MI.getOperand(0);
6689 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6690 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6691 return CreatedBB;
6692 }
6693
6694 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6695 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6696 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6697 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6698 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6699 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
6700 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
6701 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
6702 MachineOperand &Src = MI.getOperand(1);
6703 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6704 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6705 return CreatedBB;
6706 }
6707
6708 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6709 //
6710 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6711 // scratch memory access. In both cases, the legalization never involves
6712 // conversion to the addr64 form.
6714 (isMUBUF(MI) || isMTBUF(MI)))) {
6715 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6716 : AMDGPU::OpName::srsrc;
6717 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6718 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6719 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6720
6721 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6722 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6723 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6724 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6725
6726 return CreatedBB;
6727 }
6728
6729 // Legalize SI_CALL
6730 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6731 MachineOperand *Dest = &MI.getOperand(0);
6732 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6733      // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, as well as
6734      // the following copies; we also need to move copies from and to physical
6735      // registers into the loop block.
6736 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6737 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6738
6739 // Also move the copies to physical registers into the loop block
6740 MachineBasicBlock &MBB = *MI.getParent();
6742 while (Start->getOpcode() != FrameSetupOpcode)
6743 --Start;
6745 while (End->getOpcode() != FrameDestroyOpcode)
6746 ++End;
6747 // Also include following copies of the return value
6748 ++End;
6749 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6750 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6751 ++End;
6752 CreatedBB =
6753 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6754 }
6755 }
6756
6757 // Legalize s_sleep_var.
6758 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6759 const DebugLoc &DL = MI.getDebugLoc();
6760 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6761 int Src0Idx =
6762 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6763 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6764 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6765 .add(Src0);
6766 Src0.ChangeToRegister(Reg, false);
6767 return nullptr;
6768 }
6769
6770 // Legalize MUBUF instructions.
6771 bool isSoffsetLegal = true;
6772 int SoffsetIdx =
6773 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6774 if (SoffsetIdx != -1) {
6775 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6776 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6777 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6778 isSoffsetLegal = false;
6779 }
6780 }
6781
6782 bool isRsrcLegal = true;
6783 int RsrcIdx =
6784 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6785 if (RsrcIdx != -1) {
6786 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6787 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6788 isRsrcLegal = false;
6789 }
6790 }
6791
6792 // The operands are legal.
6793 if (isRsrcLegal && isSoffsetLegal)
6794 return CreatedBB;
6795
6796 if (!isRsrcLegal) {
6797 // Legalize a VGPR Rsrc
6798 //
6799 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6800 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6801 // a zero-value SRsrc.
6802 //
6803 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6804 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6805 // above.
6806 //
6807 // Otherwise we are on non-ADDR64 hardware, and/or we have
6808 // idxen/offen/bothen and we fall back to a waterfall loop.
6809
6810 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6811 MachineBasicBlock &MBB = *MI.getParent();
6812
6813 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6814 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6815 // This is already an ADDR64 instruction so we need to add the pointer
6816 // extracted from the resource descriptor to the current value of VAddr.
6817 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6818 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6819 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6820
6821 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6822 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6823 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6824
6825 unsigned RsrcPtr, NewSRsrc;
6826 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6827
6828 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6829 const DebugLoc &DL = MI.getDebugLoc();
6830 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6831 .addDef(CondReg0)
6832 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6833 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6834 .addImm(0);
6835
6836 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6837 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6838 .addDef(CondReg1, RegState::Dead)
6839 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6840 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6841 .addReg(CondReg0, RegState::Kill)
6842 .addImm(0);
6843
6844 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6845 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6846 .addReg(NewVAddrLo)
6847 .addImm(AMDGPU::sub0)
6848 .addReg(NewVAddrHi)
6849 .addImm(AMDGPU::sub1);
6850
6851 VAddr->setReg(NewVAddr);
6852 Rsrc->setReg(NewSRsrc);
6853 } else if (!VAddr && ST.hasAddr64()) {
6854 // This instruction is the _OFFSET variant, so we need to convert it to
6855 // ADDR64.
6856 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
6857 "FIXME: Need to emit flat atomics here");
6858
6859 unsigned RsrcPtr, NewSRsrc;
6860 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6861
6862 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6863 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6864 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6865 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6866 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6867
6868 // Atomics with return have an additional tied operand and are
6869 // missing some of the special bits.
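// The tied vdata_in operand only exists on the atomic-with-return forms, so
// its presence selects which ADDR64 operand list is built: the regular
// load/store path below also copies cpol/tfe/swz, the with-return path only
// copies cpol.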
6870 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6871 MachineInstr *Addr64;
6872
6873 if (!VDataIn) {
6874 // Regular buffer load / store.
6875 MachineInstrBuilder MIB =
6876 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6877 .add(*VData)
6878 .addReg(NewVAddr)
6879 .addReg(NewSRsrc)
6880 .add(*SOffset)
6881 .add(*Offset);
6882
6883 if (const MachineOperand *CPol =
6884 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6885 MIB.addImm(CPol->getImm());
6886 }
6887
6888 if (const MachineOperand *TFE =
6889 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6890 MIB.addImm(TFE->getImm());
6891 }
6892
6893 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6894
6895 MIB.cloneMemRefs(MI);
6896 Addr64 = MIB;
6897 } else {
6898 // Atomics with return.
6899 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6900 .add(*VData)
6901 .add(*VDataIn)
6902 .addReg(NewVAddr)
6903 .addReg(NewSRsrc)
6904 .add(*SOffset)
6905 .add(*Offset)
6906 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6907 .cloneMemRefs(MI);
6908 }
6909
6910 MI.removeFromParent();
6911
6912 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6913 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6914 NewVAddr)
6915 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6916 .addImm(AMDGPU::sub0)
6917 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6918 .addImm(AMDGPU::sub1);
6919 } else {
6920 // Legalize a VGPR Rsrc and soffset together.
6921 if (!isSoffsetLegal) {
6922 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6923 CreatedBB =
6924 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6925 return CreatedBB;
6926 }
6927 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
6928 return CreatedBB;
6929 }
6930 }
6931
6932 // Legalize a VGPR soffset.
6933 if (!isSoffsetLegal) {
6934 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6935 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
6936 return CreatedBB;
6937 }
6938 return CreatedBB;
6939}
6940
6941 void SIInstrWorklist::insert(MachineInstr *MI) {
6942 InstrList.insert(MI);
6943 // Add MBUF instructions to the deferred list.
6944 int RsrcIdx =
6945 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6946 if (RsrcIdx != -1) {
6947 DeferredList.insert(MI);
6948 }
6949}
6950
6951 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
6952 return DeferredList.contains(MI);
6953}
6954
6955 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
6956 MachineDominatorTree *MDT) const {
6957
6958 while (!Worklist.empty()) {
6959 MachineInstr &Inst = *Worklist.top();
6960 Worklist.erase_top();
6961 // Skip MachineInstr in the deferred list.
6962 if (Worklist.isDeferred(&Inst))
6963 continue;
6964 moveToVALUImpl(Worklist, MDT, Inst);
6965 }
6966
6967 // The deferred list of instructions will be processed once
6968 // all the MachineInstrs in the worklist are done.
6969 for (MachineInstr *Inst : Worklist.getDeferredList()) {
6970 moveToVALUImpl(Worklist, MDT, *Inst);
6971 assert(Worklist.empty() &&
6972 "Deferred MachineInstr are not supposed to re-populate worklist");
6973 }
6974}
6975
6976 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
6977 MachineDominatorTree *MDT,
6978 MachineInstr &Inst) const {
6979
6980 MachineBasicBlock *MBB = Inst.getParent();
6981 if (!MBB)
6982 return;
6983 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6984 unsigned Opcode = Inst.getOpcode();
6985 unsigned NewOpcode = getVALUOp(Inst);
6986 // Handle some special cases
6987 switch (Opcode) {
6988 default:
6989 break;
6990 case AMDGPU::S_ADD_U64_PSEUDO:
6991 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
6992 break;
6993 case AMDGPU::S_SUB_U64_PSEUDO:
6994 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
6995 break;
6996 case AMDGPU::S_ADD_I32:
6997 case AMDGPU::S_SUB_I32: {
6998 // FIXME: The u32 versions currently selected use the carry.
6999 bool Changed;
7000 MachineBasicBlock *CreatedBBTmp = nullptr;
7001 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7002 if (Changed)
7003 return;
7004
7005 // Default handling
7006 break;
7007 }
7008
7009 case AMDGPU::S_MUL_U64:
7010 // Split s_mul_u64 into 32-bit vector multiplications.
7011 splitScalarSMulU64(Worklist, Inst, MDT);
7012 Inst.eraseFromParent();
7013 return;
7014
7015 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7016 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7017 // This is a special case of s_mul_u64 where all the operands are either
7018 // zero extended or sign extended.
7019 splitScalarSMulPseudo(Worklist, Inst, MDT);
7020 Inst.eraseFromParent();
7021 return;
7022
7023 case AMDGPU::S_AND_B64:
7024 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7025 Inst.eraseFromParent();
7026 return;
7027
7028 case AMDGPU::S_OR_B64:
7029 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7030 Inst.eraseFromParent();
7031 return;
7032
7033 case AMDGPU::S_XOR_B64:
7034 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7035 Inst.eraseFromParent();
7036 return;
7037
7038 case AMDGPU::S_NAND_B64:
7039 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7040 Inst.eraseFromParent();
7041 return;
7042
7043 case AMDGPU::S_NOR_B64:
7044 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7045 Inst.eraseFromParent();
7046 return;
7047
7048 case AMDGPU::S_XNOR_B64:
7049 if (ST.hasDLInsts())
7050 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7051 else
7052 splitScalar64BitXnor(Worklist, Inst, MDT);
7053 Inst.eraseFromParent();
7054 return;
7055
7056 case AMDGPU::S_ANDN2_B64:
7057 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7058 Inst.eraseFromParent();
7059 return;
7060
7061 case AMDGPU::S_ORN2_B64:
7062 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7063 Inst.eraseFromParent();
7064 return;
7065
7066 case AMDGPU::S_BREV_B64:
7067 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7068 Inst.eraseFromParent();
7069 return;
7070
7071 case AMDGPU::S_NOT_B64:
7072 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7073 Inst.eraseFromParent();
7074 return;
7075
7076 case AMDGPU::S_BCNT1_I32_B64:
7077 splitScalar64BitBCNT(Worklist, Inst);
7078 Inst.eraseFromParent();
7079 return;
7080
7081 case AMDGPU::S_BFE_I64:
7082 splitScalar64BitBFE(Worklist, Inst);
7083 Inst.eraseFromParent();
7084 return;
7085
7086 case AMDGPU::S_FLBIT_I32_B64:
7087 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7088 Inst.eraseFromParent();
7089 return;
7090 case AMDGPU::S_FF1_I32_B64:
7091 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7092 Inst.eraseFromParent();
7093 return;
7094
7095 case AMDGPU::S_LSHL_B32:
7096 if (ST.hasOnlyRevVALUShifts()) {
7097 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7098 swapOperands(Inst);
7099 }
7100 break;
7101 case AMDGPU::S_ASHR_I32:
7102 if (ST.hasOnlyRevVALUShifts()) {
7103 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7104 swapOperands(Inst);
7105 }
7106 break;
7107 case AMDGPU::S_LSHR_B32:
7108 if (ST.hasOnlyRevVALUShifts()) {
7109 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7110 swapOperands(Inst);
7111 }
7112 break;
7113 case AMDGPU::S_LSHL_B64:
7114 if (ST.hasOnlyRevVALUShifts()) {
7115 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7116 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7117 : AMDGPU::V_LSHLREV_B64_e64;
7118 swapOperands(Inst);
7119 }
7120 break;
7121 case AMDGPU::S_ASHR_I64:
7122 if (ST.hasOnlyRevVALUShifts()) {
7123 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7124 swapOperands(Inst);
7125 }
7126 break;
7127 case AMDGPU::S_LSHR_B64:
7128 if (ST.hasOnlyRevVALUShifts()) {
7129 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7130 swapOperands(Inst);
7131 }
7132 break;
7133
7134 case AMDGPU::S_ABS_I32:
7135 lowerScalarAbs(Worklist, Inst);
7136 Inst.eraseFromParent();
7137 return;
7138
7139 case AMDGPU::S_CBRANCH_SCC0:
7140 case AMDGPU::S_CBRANCH_SCC1: {
7141 // Clear unused bits of vcc
7142 Register CondReg = Inst.getOperand(1).getReg();
7143 bool IsSCC = CondReg == AMDGPU::SCC;
7144 Register VCC = RI.getVCC();
7145 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7146 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7147 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7148 .addReg(EXEC)
7149 .addReg(IsSCC ? VCC : CondReg);
7150 Inst.removeOperand(1);
7151 } break;
7152
7153 case AMDGPU::S_BFE_U64:
7154 case AMDGPU::S_BFM_B64:
7155 llvm_unreachable("Moving this op to VALU not implemented");
7156
7157 case AMDGPU::S_PACK_LL_B32_B16:
7158 case AMDGPU::S_PACK_LH_B32_B16:
7159 case AMDGPU::S_PACK_HL_B32_B16:
7160 case AMDGPU::S_PACK_HH_B32_B16:
7161 movePackToVALU(Worklist, MRI, Inst);
7162 Inst.eraseFromParent();
7163 return;
7164
7165 case AMDGPU::S_XNOR_B32:
7166 lowerScalarXnor(Worklist, Inst);
7167 Inst.eraseFromParent();
7168 return;
7169
7170 case AMDGPU::S_NAND_B32:
7171 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7172 Inst.eraseFromParent();
7173 return;
7174
7175 case AMDGPU::S_NOR_B32:
7176 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7177 Inst.eraseFromParent();
7178 return;
7179
7180 case AMDGPU::S_ANDN2_B32:
7181 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7182 Inst.eraseFromParent();
7183 return;
7184
7185 case AMDGPU::S_ORN2_B32:
7186 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7187 Inst.eraseFromParent();
7188 return;
7189
7190 // TODO: remove as soon as everything is ready
7191 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7192 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7193 // can only be selected from the uniform SDNode.
7194 case AMDGPU::S_ADD_CO_PSEUDO:
7195 case AMDGPU::S_SUB_CO_PSEUDO: {
7196 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7197 ? AMDGPU::V_ADDC_U32_e64
7198 : AMDGPU::V_SUBB_U32_e64;
7199 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7200
7201 Register CarryInReg = Inst.getOperand(4).getReg();
7202 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7203 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7204 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7205 .addReg(CarryInReg);
7206 }
7207
7208 Register CarryOutReg = Inst.getOperand(1).getReg();
7209
7210 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7211 MRI.getRegClass(Inst.getOperand(0).getReg())));
7212 MachineInstr *CarryOp =
7213 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7214 .addReg(CarryOutReg, RegState::Define)
7215 .add(Inst.getOperand(2))
7216 .add(Inst.getOperand(3))
7217 .addReg(CarryInReg)
7218 .addImm(0);
7219 legalizeOperands(*CarryOp);
7220 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7221 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7222 Inst.eraseFromParent();
7223 }
7224 return;
7225 case AMDGPU::S_UADDO_PSEUDO:
7226 case AMDGPU::S_USUBO_PSEUDO: {
7227 const DebugLoc &DL = Inst.getDebugLoc();
7228 MachineOperand &Dest0 = Inst.getOperand(0);
7229 MachineOperand &Dest1 = Inst.getOperand(1);
7230 MachineOperand &Src0 = Inst.getOperand(2);
7231 MachineOperand &Src1 = Inst.getOperand(3);
7232
7233 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7234 ? AMDGPU::V_ADD_CO_U32_e64
7235 : AMDGPU::V_SUB_CO_U32_e64;
7236 const TargetRegisterClass *NewRC =
7237 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7238 Register DestReg = MRI.createVirtualRegister(NewRC);
7239 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7240 .addReg(Dest1.getReg(), RegState::Define)
7241 .add(Src0)
7242 .add(Src1)
7243 .addImm(0); // clamp bit
7244
7245 legalizeOperands(*NewInstr, MDT);
7246 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7247 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7248 Worklist);
7249 Inst.eraseFromParent();
7250 }
7251 return;
7252
7253 case AMDGPU::S_CSELECT_B32:
7254 case AMDGPU::S_CSELECT_B64:
7255 lowerSelect(Worklist, Inst, MDT);
7256 Inst.eraseFromParent();
7257 return;
7258 case AMDGPU::S_CMP_EQ_I32:
7259 case AMDGPU::S_CMP_LG_I32:
7260 case AMDGPU::S_CMP_GT_I32:
7261 case AMDGPU::S_CMP_GE_I32:
7262 case AMDGPU::S_CMP_LT_I32:
7263 case AMDGPU::S_CMP_LE_I32:
7264 case AMDGPU::S_CMP_EQ_U32:
7265 case AMDGPU::S_CMP_LG_U32:
7266 case AMDGPU::S_CMP_GT_U32:
7267 case AMDGPU::S_CMP_GE_U32:
7268 case AMDGPU::S_CMP_LT_U32:
7269 case AMDGPU::S_CMP_LE_U32:
7270 case AMDGPU::S_CMP_EQ_U64:
7271 case AMDGPU::S_CMP_LG_U64:
7272 case AMDGPU::S_CMP_LT_F32:
7273 case AMDGPU::S_CMP_EQ_F32:
7274 case AMDGPU::S_CMP_LE_F32:
7275 case AMDGPU::S_CMP_GT_F32:
7276 case AMDGPU::S_CMP_LG_F32:
7277 case AMDGPU::S_CMP_GE_F32:
7278 case AMDGPU::S_CMP_O_F32:
7279 case AMDGPU::S_CMP_U_F32:
7280 case AMDGPU::S_CMP_NGE_F32:
7281 case AMDGPU::S_CMP_NLG_F32:
7282 case AMDGPU::S_CMP_NGT_F32:
7283 case AMDGPU::S_CMP_NLE_F32:
7284 case AMDGPU::S_CMP_NEQ_F32:
7285 case AMDGPU::S_CMP_NLT_F32:
7286 case AMDGPU::S_CMP_LT_F16:
7287 case AMDGPU::S_CMP_EQ_F16:
7288 case AMDGPU::S_CMP_LE_F16:
7289 case AMDGPU::S_CMP_GT_F16:
7290 case AMDGPU::S_CMP_LG_F16:
7291 case AMDGPU::S_CMP_GE_F16:
7292 case AMDGPU::S_CMP_O_F16:
7293 case AMDGPU::S_CMP_U_F16:
7294 case AMDGPU::S_CMP_NGE_F16:
7295 case AMDGPU::S_CMP_NLG_F16:
7296 case AMDGPU::S_CMP_NGT_F16:
7297 case AMDGPU::S_CMP_NLE_F16:
7298 case AMDGPU::S_CMP_NEQ_F16:
7299 case AMDGPU::S_CMP_NLT_F16: {
7300 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7301 auto NewInstr =
7302 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7303 .setMIFlags(Inst.getFlags());
7304 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7305 AMDGPU::OpName::src0_modifiers) >= 0) {
7306 NewInstr
7307 .addImm(0) // src0_modifiers
7308 .add(Inst.getOperand(0)) // src0
7309 .addImm(0) // src1_modifiers
7310 .add(Inst.getOperand(1)) // src1
7311 .addImm(0); // clamp
7312 } else {
7313 NewInstr
7314 .add(Inst.getOperand(0))
7315 .add(Inst.getOperand(1));
7316 }
7317 legalizeOperands(*NewInstr, MDT);
7318 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7319 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7320 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7321 Inst.eraseFromParent();
7322 return;
7323 }
7324 case AMDGPU::S_CVT_HI_F32_F16: {
7325 const DebugLoc &DL = Inst.getDebugLoc();
7326 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7327 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7328 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7329 .addImm(16)
7330 .add(Inst.getOperand(1));
7331 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7332 .addImm(0) // src0_modifiers
7333 .addReg(TmpReg)
7334 .addImm(0) // clamp
7335 .addImm(0); // omod
7336
7337 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7338 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7339 Inst.eraseFromParent();
7340 return;
7341 }
7342 case AMDGPU::S_MINIMUM_F32:
7343 case AMDGPU::S_MAXIMUM_F32:
7344 case AMDGPU::S_MINIMUM_F16:
7345 case AMDGPU::S_MAXIMUM_F16: {
7346 const DebugLoc &DL = Inst.getDebugLoc();
7347 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7348 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7349 .addImm(0) // src0_modifiers
7350 .add(Inst.getOperand(1))
7351 .addImm(0) // src1_modifiers
7352 .add(Inst.getOperand(2))
7353 .addImm(0) // clamp
7354 .addImm(0); // omod
7355 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7356
7357 legalizeOperands(*NewInstr, MDT);
7358 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7359 Inst.eraseFromParent();
7360 return;
7361 }
7362 }
7363
7364 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7365 // We cannot move this instruction to the VALU, so we should try to
7366 // legalize its operands instead.
7367 legalizeOperands(Inst, MDT);
7368 return;
7369 }
7370 // Handle converting generic instructions like COPY-to-SGPR into
7371 // COPY-to-VGPR.
7372 if (NewOpcode == Opcode) {
7373 Register DstReg = Inst.getOperand(0).getReg();
7374 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7375
7376 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7377 // hope for the best.
7378 if (Inst.isCopy() && DstReg.isPhysical() &&
7379 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7380 // TODO: Only works for 32 bit registers.
7381 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7382 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7383 .add(Inst.getOperand(1));
7384 Inst.eraseFromParent();
7385 return;
7386 }
7387
7388 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7389 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7390 // Instead of creating a copy where src and dst are the same register
7391 // class, we just replace all uses of dst with src. These kinds of
7392 // copies interfere with the heuristics MachineSink uses to decide
7393 // whether or not to split a critical edge, since that pass assumes
7394 // that copies will end up as machine instructions and not be
7395 // eliminated.
7396 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7397 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7398 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7399 Inst.getOperand(0).setReg(DstReg);
7400 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7401 // these are deleted later, but at -O0 it would leave a suspicious
7402 // looking illegal copy of an undef register.
7403 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7404 Inst.removeOperand(I);
7405 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7406 return;
7407 }
7408 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7409 MRI.replaceRegWith(DstReg, NewDstReg);
7410 legalizeOperands(Inst, MDT);
7411 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7412 return;
7413 }
7414
7415 // Use the new VALU Opcode.
7416 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7417 .setMIFlags(Inst.getFlags());
7418 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7419 // Intersperse VOP3 modifiers among the SALU operands.
7420 NewInstr->addOperand(Inst.getOperand(0));
7421 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7422 AMDGPU::OpName::src0_modifiers) >= 0)
7423 NewInstr.addImm(0);
7424 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7425 MachineOperand Src = Inst.getOperand(1);
7426 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7427 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7428 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7429 else
7430 NewInstr->addOperand(Src);
7431 }
7432
7433 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7434 // We are converting these to a BFE, so we need to add the missing
7435 // operands for the size and offset.
7436 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7437 NewInstr.addImm(0);
7438 NewInstr.addImm(Size);
7439 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7440 // The VALU version adds the second operand to the result, so insert an
7441 // extra 0 operand.
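// (V_BCNT_U32_B32 computes popcount(src0) + src1, so a zero src1 yields the
// plain population count.)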
7442 NewInstr.addImm(0);
7443 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7444 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7445 // If we need to move this to VGPRs, we need to unpack the second
7446 // operand back into the 2 separate ones for bit offset and width.
7447 assert(OffsetWidthOp.isImm() &&
7448 "Scalar BFE is only implemented for constant width and offset");
7449 uint32_t Imm = OffsetWidthOp.getImm();
7450
7451 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7452 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
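// For example, Imm = 0x00100008 decodes to Offset = 8 and BitWidth = 16,
// i.e. the field occupying bits [23:8] of the source.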
7453 NewInstr.addImm(Offset);
7454 NewInstr.addImm(BitWidth);
7455 } else {
7456 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7457 AMDGPU::OpName::src1_modifiers) >= 0)
7458 NewInstr.addImm(0);
7459 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7460 NewInstr->addOperand(Inst.getOperand(2));
7461 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7462 AMDGPU::OpName::src2_modifiers) >= 0)
7463 NewInstr.addImm(0);
7464 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7465 NewInstr->addOperand(Inst.getOperand(3));
7466 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7467 NewInstr.addImm(0);
7468 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7469 NewInstr.addImm(0);
7470 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7471 NewInstr.addImm(0);
7472 }
7473 } else {
7474 // Just copy the SALU operands.
7475 for (const MachineOperand &Op : Inst.explicit_operands())
7476 NewInstr->addOperand(Op);
7477 }
7478
7479 // Remove any references to SCC. Vector instructions can't read from it, and
7480 // we're just about to add the implicit use / defs of VCC, and we don't want
7481 // both.
7482 for (MachineOperand &Op : Inst.implicit_operands()) {
7483 if (Op.getReg() == AMDGPU::SCC) {
7484 // Only propagate through live-def of SCC.
7485 if (Op.isDef() && !Op.isDead())
7486 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7487 if (Op.isUse())
7488 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7489 }
7490 }
7491 Inst.eraseFromParent();
7492 Register NewDstReg;
7493 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7494 Register DstReg = NewInstr->getOperand(0).getReg();
7495 assert(DstReg.isVirtual());
7496 // Update the destination register class.
7497 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7498 assert(NewDstRC);
7499 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7500 MRI.replaceRegWith(DstReg, NewDstReg);
7501 }
7502 fixImplicitOperands(*NewInstr);
7503 // Legalize the operands
7504 legalizeOperands(*NewInstr, MDT);
7505 if (NewDstReg)
7506 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7507}
7508
7509// Add/sub require special handling to deal with carry outs.
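// On subtargets with carry-less VALU adds (hasAddNoCarry), the scalar op is
// rewritten in place to V_ADD_U32_e64/V_SUB_U32_e64 below; otherwise the
// caller falls through to the default expansion, which currently selects the
// carry-writing variants (see the FIXME in moveToVALUImpl).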
7510std::pair<bool, MachineBasicBlock *>
7511SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7512 MachineDominatorTree *MDT) const {
7513 if (ST.hasAddNoCarry()) {
7514 // Assume there is no user of scc since we don't select this in that case.
7515 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7516 // is used.
7517
7518 MachineBasicBlock &MBB = *Inst.getParent();
7519 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7521 Register OldDstReg = Inst.getOperand(0).getReg();
7521 Register OldDstReg = Inst.getOperand(0).getReg();
7522 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7523
7524 unsigned Opc = Inst.getOpcode();
7525 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7526
7527 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7528 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7529
7530 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7531 Inst.removeOperand(3);
7532
7533 Inst.setDesc(get(NewOpc));
7534 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7535 Inst.addImplicitDefUseOperands(*MBB.getParent());
7536 MRI.replaceRegWith(OldDstReg, ResultReg);
7537 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7538
7539 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7540 return std::pair(true, NewBB);
7541 }
7542
7543 return std::pair(false, nullptr);
7544}
7545
7546void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7547 MachineDominatorTree *MDT) const {
7548
7549 MachineBasicBlock &MBB = *Inst.getParent();
7550 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7551 MachineBasicBlock::iterator MII = Inst;
7552 DebugLoc DL = Inst.getDebugLoc();
7553
7554 MachineOperand &Dest = Inst.getOperand(0);
7555 MachineOperand &Src0 = Inst.getOperand(1);
7556 MachineOperand &Src1 = Inst.getOperand(2);
7557 MachineOperand &Cond = Inst.getOperand(3);
7558
7559 Register CondReg = Cond.getReg();
7560 bool IsSCC = (CondReg == AMDGPU::SCC);
7561
7562 // If this is a trivial select where the condition is effectively not SCC
7563 // (CondReg is a source of copy to SCC), then the select is semantically
7564 // equivalent to copying CondReg. Hence, there is no need to create a
7565 // V_CNDMASK; we can just use CondReg and bail out.
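// i.e. an S_CSELECT of the form (cond ? -1 : 0) with a non-SCC condition is
// already the condition mask itself.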
7566 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7567 (Src1.getImm() == 0)) {
7568 MRI.replaceRegWith(Dest.getReg(), CondReg);
7569 return;
7570 }
7571
7572 Register NewCondReg = CondReg;
7573 if (IsSCC) {
7574 const TargetRegisterClass *TC =
7575 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7576 NewCondReg = MRI.createVirtualRegister(TC);
7577
7578 // Now look for the closest SCC def; if it is a copy,
7579 // replace CondReg with the COPY's source register.
7580 bool CopyFound = false;
7581 for (MachineInstr &CandI :
7582 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7583 Inst.getParent()->rend())) {
7584 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7585 -1) {
7586 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7587 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7588 .addReg(CandI.getOperand(1).getReg());
7589 CopyFound = true;
7590 }
7591 break;
7592 }
7593 }
7594 if (!CopyFound) {
7595 // SCC def is not a copy
7596 // Insert a trivial select instead of creating a copy, because a copy from
7597 // SCC would semantically mean just copying a single bit, but we may need
7598 // the result to be a vector condition mask that needs preserving.
7599 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
7600 : AMDGPU::S_CSELECT_B32;
7601 auto NewSelect =
7602 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7603 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7604 }
7605 }
7606
7607 Register NewDestReg = MRI.createVirtualRegister(
7608 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7609 MachineInstr *NewInst;
7610 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7611 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7612 .addImm(0)
7613 .add(Src1) // False
7614 .addImm(0)
7615 .add(Src0) // True
7616 .addReg(NewCondReg);
7617 } else {
7618 NewInst =
7619 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7620 .add(Src1) // False
7621 .add(Src0) // True
7622 .addReg(NewCondReg);
7623 }
7624 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7625 legalizeOperands(*NewInst, MDT);
7626 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7627}
7628
7629void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7630 MachineInstr &Inst) const {
7631 MachineBasicBlock &MBB = *Inst.getParent();
7632 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7633 MachineBasicBlock::iterator MII = Inst;
7634 DebugLoc DL = Inst.getDebugLoc();
7635
7636 MachineOperand &Dest = Inst.getOperand(0);
7637 MachineOperand &Src = Inst.getOperand(1);
7638 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7639 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7640
7641 unsigned SubOp = ST.hasAddNoCarry() ?
7642 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7643
7644 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7645 .addImm(0)
7646 .addReg(Src.getReg());
7647
7648 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7649 .addReg(Src.getReg())
7650 .addReg(TmpReg);
7651
7652 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7653 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7654}
7655
7656void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7657 MachineInstr &Inst) const {
7658 MachineBasicBlock &MBB = *Inst.getParent();
7659 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7660 MachineBasicBlock::iterator MII = Inst;
7661 const DebugLoc &DL = Inst.getDebugLoc();
7662
7663 MachineOperand &Dest = Inst.getOperand(0);
7664 MachineOperand &Src0 = Inst.getOperand(1);
7665 MachineOperand &Src1 = Inst.getOperand(2);
7666
7667 if (ST.hasDLInsts()) {
7668 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7669 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7670 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7671
7672 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7673 .add(Src0)
7674 .add(Src1);
7675
7676 MRI.replaceRegWith(Dest.getReg(), NewDest);
7677 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7678 } else {
7679 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7680 // invert either source and then perform the XOR. If either source is a
7681 // scalar register, then we can leave the inversion on the scalar unit to
7682 // achieve a better distribution of scalar and vector instructions.
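// e.g. xnor(s0, v1) becomes xor(not(s0), v1), with the NOT kept on the SALU.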
7683 bool Src0IsSGPR = Src0.isReg() &&
7684 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7685 bool Src1IsSGPR = Src1.isReg() &&
7686 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7687 MachineInstr *Xor;
7688 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7689 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7690
7691 // Build a pair of scalar instructions and add them to the work list.
7692 // The next iteration over the work list will lower these to the vector
7693 // unit as necessary.
7694 if (Src0IsSGPR) {
7695 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7696 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7697 .addReg(Temp)
7698 .add(Src1);
7699 } else if (Src1IsSGPR) {
7700 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7701 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7702 .add(Src0)
7703 .addReg(Temp);
7704 } else {
7705 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7706 .add(Src0)
7707 .add(Src1);
7708 MachineInstr *Not =
7709 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7710 Worklist.insert(Not);
7711 }
7712
7713 MRI.replaceRegWith(Dest.getReg(), NewDest);
7714
7715 Worklist.insert(Xor);
7716
7717 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7718 }
7719}
7720
7721void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7722 MachineInstr &Inst,
7723 unsigned Opcode) const {
7724 MachineBasicBlock &MBB = *Inst.getParent();
7725 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7726 MachineBasicBlock::iterator MII = Inst;
7727 const DebugLoc &DL = Inst.getDebugLoc();
7728
7729 MachineOperand &Dest = Inst.getOperand(0);
7730 MachineOperand &Src0 = Inst.getOperand(1);
7731 MachineOperand &Src1 = Inst.getOperand(2);
7732
7733 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7734 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7735
7736 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7737 .add(Src0)
7738 .add(Src1);
7739
7740 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7741 .addReg(Interm);
7742
7743 Worklist.insert(&Op);
7744 Worklist.insert(&Not);
7745
7746 MRI.replaceRegWith(Dest.getReg(), NewDest);
7747 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7748}
7749
7750void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7751 MachineInstr &Inst,
7752 unsigned Opcode) const {
7753 MachineBasicBlock &MBB = *Inst.getParent();
7754 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7755 MachineBasicBlock::iterator MII = Inst;
7756 const DebugLoc &DL = Inst.getDebugLoc();
7757
7758 MachineOperand &Dest = Inst.getOperand(0);
7759 MachineOperand &Src0 = Inst.getOperand(1);
7760 MachineOperand &Src1 = Inst.getOperand(2);
7761
7762 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7763 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7764
7765 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7766 .add(Src1);
7767
7768 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7769 .add(Src0)
7770 .addReg(Interm);
7771
7772 Worklist.insert(&Not);
7773 Worklist.insert(&Op);
7774
7775 MRI.replaceRegWith(Dest.getReg(), NewDest);
7776 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7777}
7778
7779void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7780 MachineInstr &Inst, unsigned Opcode,
7781 bool Swap) const {
7782 MachineBasicBlock &MBB = *Inst.getParent();
7783 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7784
7785 MachineOperand &Dest = Inst.getOperand(0);
7786 MachineOperand &Src0 = Inst.getOperand(1);
7787 DebugLoc DL = Inst.getDebugLoc();
7788
7789 MachineBasicBlock::iterator MII = Inst;
7790
7791 const MCInstrDesc &InstDesc = get(Opcode);
7792 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7793 MRI.getRegClass(Src0.getReg()) :
7794 &AMDGPU::SGPR_32RegClass;
7795
7796 const TargetRegisterClass *Src0SubRC =
7797 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7798
7799 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7800 AMDGPU::sub0, Src0SubRC);
7801
7802 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7803 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7804 const TargetRegisterClass *NewDestSubRC =
7805 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7806
7807 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7808 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7809
7810 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7811 AMDGPU::sub1, Src0SubRC);
7812
7813 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7814 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7815
7816 if (Swap)
7817 std::swap(DestSub0, DestSub1);
7818
7819 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7820 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7821 .addReg(DestSub0)
7822 .addImm(AMDGPU::sub0)
7823 .addReg(DestSub1)
7824 .addImm(AMDGPU::sub1);
7825
7826 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7827
7828 Worklist.insert(&LoHalf);
7829 Worklist.insert(&HiHalf);
7830
7831 // We don't need to legalizeOperands here because for a single operand, src0
7832 // will support any kind of input.
7833
7834 // Move all users of this moved value.
7835 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7836}
7837
7838// There is not a vector equivalent of s_mul_u64. For this reason, we need to
7839 // split the s_mul_u64 into 32-bit vector multiplications.
7840void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7841 MachineInstr &Inst,
7842 MachineDominatorTree *MDT) const {
7843 MachineBasicBlock &MBB = *Inst.getParent();
7844 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7845
7846 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7847 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7848 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7849
7850 MachineOperand &Dest = Inst.getOperand(0);
7851 MachineOperand &Src0 = Inst.getOperand(1);
7852 MachineOperand &Src1 = Inst.getOperand(2);
7853 const DebugLoc &DL = Inst.getDebugLoc();
7854 MachineBasicBlock::iterator MII = Inst;
7855
7856 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7857 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7858 const TargetRegisterClass *Src0SubRC =
7859 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7860 if (RI.isSGPRClass(Src0SubRC))
7861 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7862 const TargetRegisterClass *Src1SubRC =
7863 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7864 if (RI.isSGPRClass(Src1SubRC))
7865 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7866
7867 // First, we extract the low 32-bit and high 32-bit values from each of the
7868 // operands.
7869 MachineOperand Op0L =
7870 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7871 MachineOperand Op1L =
7872 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7873 MachineOperand Op0H =
7874 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7875 MachineOperand Op1H =
7876 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7877
7878 // The multiplication is done as follows:
7879 //
7880 // Op1H Op1L
7881 // * Op0H Op0L
7882 // --------------------
7883 // Op1H*Op0L Op1L*Op0L
7884 // + Op1H*Op0H Op1L*Op0H
7885 // -----------------------------------------
7886 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7887 //
7888 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7889 // value and that would overflow.
7890 // The low 32-bit value is Op1L*Op0L.
7891 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
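// For example, Src0 = 0x00000002'00000003 and Src1 = 0x00000004'00000005:
//   Lo = 3*5 = 15, Hi = 2*5 + 3*4 + mulhi(3,5) = 22,
// giving the 64-bit product 0x00000016'0000000F.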
7892
7893 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7894 MachineInstr *Op1L_Op0H =
7895 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
7896 .add(Op1L)
7897 .add(Op0H);
7898
7899 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7900 MachineInstr *Op1H_Op0L =
7901 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
7902 .add(Op1H)
7903 .add(Op0L);
7904
7905 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7906 MachineInstr *Carry =
7907 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
7908 .add(Op1L)
7909 .add(Op0L);
7910
7911 MachineInstr *LoHalf =
7912 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7913 .add(Op1L)
7914 .add(Op0L);
7915
7916 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7917 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
7918 .addReg(Op1L_Op0H_Reg)
7919 .addReg(Op1H_Op0L_Reg);
7920
7921 MachineInstr *HiHalf =
7922 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
7923 .addReg(AddReg)
7924 .addReg(CarryReg);
7925
7926 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7927 .addReg(DestSub0)
7928 .addImm(AMDGPU::sub0)
7929 .addReg(DestSub1)
7930 .addImm(AMDGPU::sub1);
7931
7932 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7933
7934 // Try to legalize the operands in case we need to swap the order to keep it
7935 // valid.
7936 legalizeOperands(*Op1L_Op0H, MDT);
7937 legalizeOperands(*Op1H_Op0L, MDT);
7938 legalizeOperands(*Carry, MDT);
7939 legalizeOperands(*LoHalf, MDT);
7940 legalizeOperands(*Add, MDT);
7941 legalizeOperands(*HiHalf, MDT);
7942
7943 // Move all users of this moved value.
7944 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7945}
7946
7947 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
7948// multiplications.
7949void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
7950 MachineInstr &Inst,
7951 MachineDominatorTree *MDT) const {
7952 MachineBasicBlock &MBB = *Inst.getParent();
7953 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7954
7955 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7956 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7957 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7958
7959 MachineOperand &Dest = Inst.getOperand(0);
7960 MachineOperand &Src0 = Inst.getOperand(1);
7961 MachineOperand &Src1 = Inst.getOperand(2);
7962 const DebugLoc &DL = Inst.getDebugLoc();
7963 MachineBasicBlock::iterator MII = Inst;
7964
7965 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7966 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7967 const TargetRegisterClass *Src0SubRC =
7968 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7969 if (RI.isSGPRClass(Src0SubRC))
7970 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7971 const TargetRegisterClass *Src1SubRC =
7972 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7973 if (RI.isSGPRClass(Src1SubRC))
7974 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7975
7976 // First, we extract the low 32-bit and high 32-bit values from each of the
7977 // operands.
7978 MachineOperand Op0L =
7979 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7980 MachineOperand Op1L =
7981 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7982
7983 unsigned Opc = Inst.getOpcode();
7984 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
7985 ? AMDGPU::V_MUL_HI_U32_e64
7986 : AMDGPU::V_MUL_HI_I32_e64;
7987 MachineInstr *HiHalf =
7988 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
7989
7990 MachineInstr *LoHalf =
7991 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7992 .add(Op1L)
7993 .add(Op0L);
7994
7995 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7996 .addReg(DestSub0)
7997 .addImm(AMDGPU::sub0)
7998 .addReg(DestSub1)
7999 .addImm(AMDGPU::sub1);
8000
8001 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8002
8003 // Try to legalize the operands in case we need to swap the order to keep it
8004 // valid.
8005 legalizeOperands(*HiHalf, MDT);
8006 legalizeOperands(*LoHalf, MDT);
8007
8008 // Move all users of this moved value.
8009 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8010}
8011
8012void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8013 MachineInstr &Inst, unsigned Opcode,
8014 MachineDominatorTree *MDT) const {
8015 MachineBasicBlock &MBB = *Inst.getParent();
8016 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8017
8018 MachineOperand &Dest = Inst.getOperand(0);
8019 MachineOperand &Src0 = Inst.getOperand(1);
8020 MachineOperand &Src1 = Inst.getOperand(2);
8021 DebugLoc DL = Inst.getDebugLoc();
8022
8023 MachineBasicBlock::iterator MII = Inst;
8024
8025 const MCInstrDesc &InstDesc = get(Opcode);
8026 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8027 MRI.getRegClass(Src0.getReg()) :
8028 &AMDGPU::SGPR_32RegClass;
8029
8030 const TargetRegisterClass *Src0SubRC =
8031 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8032 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8033 MRI.getRegClass(Src1.getReg()) :
8034 &AMDGPU::SGPR_32RegClass;
8035
8036 const TargetRegisterClass *Src1SubRC =
8037 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8038
8039 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8040 AMDGPU::sub0, Src0SubRC);
8041 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8042 AMDGPU::sub0, Src1SubRC);
8043 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8044 AMDGPU::sub1, Src0SubRC);
8045 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8046 AMDGPU::sub1, Src1SubRC);
8047
8048 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8049 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8050 const TargetRegisterClass *NewDestSubRC =
8051 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8052
8053 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8054 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8055 .add(SrcReg0Sub0)
8056 .add(SrcReg1Sub0);
8057
8058 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8059 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8060 .add(SrcReg0Sub1)
8061 .add(SrcReg1Sub1);
8062
8063 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8064 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8065 .addReg(DestSub0)
8066 .addImm(AMDGPU::sub0)
8067 .addReg(DestSub1)
8068 .addImm(AMDGPU::sub1);
8069
8070 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8071
8072 Worklist.insert(&LoHalf);
8073 Worklist.insert(&HiHalf);
8074
8075 // Move all users of this moved value.
8076 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8077}
8078
8079void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8080 MachineInstr &Inst,
8081 MachineDominatorTree *MDT) const {
8082 MachineBasicBlock &MBB = *Inst.getParent();
8083 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8084
8085 MachineOperand &Dest = Inst.getOperand(0);
8086 MachineOperand &Src0 = Inst.getOperand(1);
8087 MachineOperand &Src1 = Inst.getOperand(2);
8088 const DebugLoc &DL = Inst.getDebugLoc();
8089
8090 MachineBasicBlock::iterator MII = Inst;
8091
8092 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8093
8094 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8095
8096 MachineOperand* Op0;
8097 MachineOperand* Op1;
8098
8099 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8100 Op0 = &Src0;
8101 Op1 = &Src1;
8102 } else {
8103 Op0 = &Src1;
8104 Op1 = &Src0;
8105 }
8106
8107 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8108 .add(*Op0);
8109
8110 Register NewDest = MRI.createVirtualRegister(DestRC);
8111
8112 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8113 .addReg(Interm)
8114 .add(*Op1);
8115
8116 MRI.replaceRegWith(Dest.getReg(), NewDest);
8117
8118 Worklist.insert(&Xor);
8119}
8120
8121void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8122 MachineInstr &Inst) const {
8123 MachineBasicBlock &MBB = *Inst.getParent();
8124 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8125
8126 MachineBasicBlock::iterator MII = Inst;
8127 const DebugLoc &DL = Inst.getDebugLoc();
8128
8129 MachineOperand &Dest = Inst.getOperand(0);
8130 MachineOperand &Src = Inst.getOperand(1);
8131
8132 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8133 const TargetRegisterClass *SrcRC = Src.isReg() ?
8134 MRI.getRegClass(Src.getReg()) :
8135 &AMDGPU::SGPR_32RegClass;
8136
8137 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8138 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8139
8140 const TargetRegisterClass *SrcSubRC =
8141 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8142
8143 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8144 AMDGPU::sub0, SrcSubRC);
8145 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8146 AMDGPU::sub1, SrcSubRC);
8147
8148 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8149
8150 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8151
8152 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8153
8154 // We don't need to legalize operands here. src0 for either instruction can be
8155 // an SGPR, and the second input is unused or determined here.
8156 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8157}
8158
8159void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8160 MachineInstr &Inst) const {
8161 MachineBasicBlock &MBB = *Inst.getParent();
8162 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8163 MachineBasicBlock::iterator MII = Inst;
8164 const DebugLoc &DL = Inst.getDebugLoc();
8165
8166 MachineOperand &Dest = Inst.getOperand(0);
8167 uint32_t Imm = Inst.getOperand(2).getImm();
8168 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8169 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8170
8171 (void) Offset;
8172
8173 // Only sext_inreg cases handled.
8174 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8175 Offset == 0 && "Not implemented");
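// e.g. S_BFE_I64 with Imm = (16 << 16) sign-extends bits [15:0] of the low
// half into the full 64-bit result.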
8176
8177 if (BitWidth < 32) {
8178 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8179 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8180 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8181
8182 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8183 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8184 .addImm(0)
8185 .addImm(BitWidth);
8186
8187 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8188 .addImm(31)
8189 .addReg(MidRegLo);
8190
8191 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8192 .addReg(MidRegLo)
8193 .addImm(AMDGPU::sub0)
8194 .addReg(MidRegHi)
8195 .addImm(AMDGPU::sub1);
8196
8197 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8198 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8199 return;
8200 }
8201
8202 MachineOperand &Src = Inst.getOperand(1);
8203 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8204 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8205
8206 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8207 .addImm(31)
8208 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8209
8210 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8211 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8212 .addImm(AMDGPU::sub0)
8213 .addReg(TmpReg)
8214 .addImm(AMDGPU::sub1);
8215
8216 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8217 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8218}
8219
8220void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8221 MachineInstr &Inst, unsigned Opcode,
8222 MachineDominatorTree *MDT) const {
8223 // (S_FLBIT_I32_B64 hi:lo) ->
8224 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8225 // (S_FF1_I32_B64 hi:lo) ->
8226 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
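// Worked example for the ctlz form with hi:lo = 0x00000000:0x00010000:
// V_FFBH_U32(hi) = 0xffffffff (no bits set), uaddsat(V_FFBH_U32(lo) = 15, 32)
// = 47, and umin picks 47, the number of leading zeros of the 64-bit value.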
8227
8228 MachineBasicBlock &MBB = *Inst.getParent();
8229 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8230 MachineBasicBlock::iterator MII = Inst;
8231 const DebugLoc &DL = Inst.getDebugLoc();
8232
8233 MachineOperand &Dest = Inst.getOperand(0);
8234 MachineOperand &Src = Inst.getOperand(1);
8235
8236 const MCInstrDesc &InstDesc = get(Opcode);
8237
8238 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8239 unsigned OpcodeAdd =
8240 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8241
8242 const TargetRegisterClass *SrcRC =
8243 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8244 const TargetRegisterClass *SrcSubRC =
8245 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8246
8247 MachineOperand SrcRegSub0 =
8248 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8249 MachineOperand SrcRegSub1 =
8250 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8251
8252 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8253 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8254 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8255 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8256
8257 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8258
8259 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8260
8261 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8262 .addReg(IsCtlz ? MidReg1 : MidReg2)
8263 .addImm(32)
8264 .addImm(1); // enable clamp
8265
8266 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8267 .addReg(MidReg3)
8268 .addReg(IsCtlz ? MidReg2 : MidReg1);
8269
8270 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8271
8272 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8273}
8274
8275 void SIInstrInfo::addUsersToMoveToVALUWorklist(
8276 Register DstReg, MachineRegisterInfo &MRI,
8277 SIInstrWorklist &Worklist) const {
8278 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8279 E = MRI.use_end(); I != E;) {
8280 MachineInstr &UseMI = *I->getParent();
8281
8282 unsigned OpNo = 0;
8283
8284 switch (UseMI.getOpcode()) {
8285 case AMDGPU::COPY:
8286 case AMDGPU::WQM:
8287 case AMDGPU::SOFT_WQM:
8288 case AMDGPU::STRICT_WWM:
8289 case AMDGPU::STRICT_WQM:
8290 case AMDGPU::REG_SEQUENCE:
8291 case AMDGPU::PHI:
8292 case AMDGPU::INSERT_SUBREG:
8293 break;
8294 default:
8295 OpNo = I.getOperandNo();
8296 break;
8297 }
8298
8299 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8300 Worklist.insert(&UseMI);
8301
8302 do {
8303 ++I;
8304 } while (I != E && I->getParent() == &UseMI);
8305 } else {
8306 ++I;
8307 }
8308 }
8309}
8310
8311void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8312 MachineRegisterInfo &MRI,
8313 MachineInstr &Inst) const {
8314 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8315 MachineBasicBlock *MBB = Inst.getParent();
8316 MachineOperand &Src0 = Inst.getOperand(1);
8317 MachineOperand &Src1 = Inst.getOperand(2);
8318 const DebugLoc &DL = Inst.getDebugLoc();
8319
8320 switch (Inst.getOpcode()) {
8321 case AMDGPU::S_PACK_LL_B32_B16: {
8322 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8323 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8324
8325 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8326 // 0.
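// S_PACK_LL_B32_B16 packs the two low halves: Dst = (Src1[15:0] << 16) |
// Src0[15:0]; the AND keeps Src0's low 16 bits and V_LSHL_OR_B32 merges in
// Src1 shifted left by 16.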
8327 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8328 .addImm(0xffff);
8329
8330 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8331 .addReg(ImmReg, RegState::Kill)
8332 .add(Src0);
8333
8334 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8335 .add(Src1)
8336 .addImm(16)
8337 .addReg(TmpReg, RegState::Kill);
8338 break;
8339 }
8340 case AMDGPU::S_PACK_LH_B32_B16: {
8341 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8342 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8343 .addImm(0xffff);
8344 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8345 .addReg(ImmReg, RegState::Kill)
8346 .add(Src0)
8347 .add(Src1);
8348 break;
8349 }
8350 case AMDGPU::S_PACK_HL_B32_B16: {
8351 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8352 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8353 .addImm(16)
8354 .add(Src0);
8355 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8356 .add(Src1)
8357 .addImm(16)
8358 .addReg(TmpReg, RegState::Kill);
8359 break;
8360 }
8361 case AMDGPU::S_PACK_HH_B32_B16: {
8362 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8363 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8364 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8365 .addImm(16)
8366 .add(Src0);
8367 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8368 .addImm(0xffff0000);
8369 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8370 .add(Src1)
8371 .addReg(ImmReg, RegState::Kill)
8372 .addReg(TmpReg, RegState::Kill);
8373 break;
8374 }
8375 default:
8376 llvm_unreachable("unhandled s_pack_* instruction");
8377 }
8378
8379 MachineOperand &Dest = Inst.getOperand(0);
8380 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8381 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8382}
8383
8384void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8385 MachineInstr &SCCDefInst,
8386 SIInstrWorklist &Worklist,
8387 Register NewCond) const {
8388
8389 // Ensure that def inst defines SCC, which is still live.
8390 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8391 !Op.isDead() && Op.getParent() == &SCCDefInst);
8392 SmallVector<MachineInstr *, 4> CopyToDelete;
8393 // This assumes that all the users of SCC are in the same block
8394 // as the SCC def.
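// A COPY from SCC is simply replaced by NewCond; any other SCC user is
// rewired to NewCond (when one is supplied) and queued for VALU conversion.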
8395 for (MachineInstr &MI : // Skip the def inst itself.
8396 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8397 SCCDefInst.getParent()->end())) {
8398 // Check if SCC is used first.
8399 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8400 if (SCCIdx != -1) {
8401 if (MI.isCopy()) {
8402 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8403 Register DestReg = MI.getOperand(0).getReg();
8404
8405 MRI.replaceRegWith(DestReg, NewCond);
8406 CopyToDelete.push_back(&MI);
8407 } else {
8408
8409 if (NewCond.isValid())
8410 MI.getOperand(SCCIdx).setReg(NewCond);
8411
8412 Worklist.insert(&MI);
8413 }
8414 }
8415 // Exit if we find another SCC def.
8416 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8417 break;
8418 }
8419 for (auto &Copy : CopyToDelete)
8420 Copy->eraseFromParent();
8421}
8422
8423// Instructions that use SCC may be converted to VALU instructions. When that
8424// happens, the SCC register is changed to VCC_LO. The instruction that defines
8425// SCC must be changed to an instruction that defines VCC. This function makes
8426// sure that the instruction that defines SCC is added to the moveToVALU
8427// worklist.
8428void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8429 SIInstrWorklist &Worklist) const {
8430 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8431 // then there is nothing to do because the defining instruction has been
8432 // converted to a VALU already. If SCC then that instruction needs to be
8433 // converted to a VALU.
8434 for (MachineInstr &MI :
8435 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8436 SCCUseInst->getParent()->rend())) {
8437 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8438 break;
8439 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8440 Worklist.insert(&MI);
8441 break;
8442 }
8443 }
8444}
8445
8446const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8447 const MachineInstr &Inst) const {
8448 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8449
8450 switch (Inst.getOpcode()) {
8451 // For target instructions, getOpRegClass just returns the virtual register
8452 // class associated with the operand, so we need to find an equivalent VGPR
8453 // register class in order to move the instruction to the VALU.
8454 case AMDGPU::COPY:
8455 case AMDGPU::PHI:
8456 case AMDGPU::REG_SEQUENCE:
8457 case AMDGPU::INSERT_SUBREG:
8458 case AMDGPU::WQM:
8459 case AMDGPU::SOFT_WQM:
8460 case AMDGPU::STRICT_WWM:
8461 case AMDGPU::STRICT_WQM: {
8462 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8463 if (RI.isAGPRClass(SrcRC)) {
8464 if (RI.isAGPRClass(NewDstRC))
8465 return nullptr;
8466
8467 switch (Inst.getOpcode()) {
8468 case AMDGPU::PHI:
8469 case AMDGPU::REG_SEQUENCE:
8470 case AMDGPU::INSERT_SUBREG:
8471 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8472 break;
8473 default:
8474 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8475 }
8476
8477 if (!NewDstRC)
8478 return nullptr;
8479 } else {
8480 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8481 return nullptr;
8482
8483 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8484 if (!NewDstRC)
8485 return nullptr;
8486 }
8487
8488 return NewDstRC;
8489 }
8490 default:
8491 return NewDstRC;
8492 }
8493}
8494
8495// Find the one SGPR operand we are allowed to use.
8496Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8497 int OpIndices[3]) const {
8498 const MCInstrDesc &Desc = MI.getDesc();
8499
8500 // Find the one SGPR operand we are allowed to use.
8501 //
8502 // First we need to consider the instruction's operand requirements before
8503 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8504 // of VCC, but we are still bound by the constant bus requirement to only use
8505 // one.
8506 //
8507 // If the operand's class is an SGPR, we can never move it.
8508
8509 Register SGPRReg = findImplicitSGPRRead(MI);
8510 if (SGPRReg)
8511 return SGPRReg;
8512
8513 Register UsedSGPRs[3] = {Register()};
8514 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8515
8516 for (unsigned i = 0; i < 3; ++i) {
8517 int Idx = OpIndices[i];
8518 if (Idx == -1)
8519 break;
8520
8521 const MachineOperand &MO = MI.getOperand(Idx);
8522 if (!MO.isReg())
8523 continue;
8524
8525 // Is this operand statically required to be an SGPR based on the operand
8526 // constraints?
8527 const TargetRegisterClass *OpRC =
8528 RI.getRegClass(Desc.operands()[Idx].RegClass);
8529 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8530 if (IsRequiredSGPR)
8531 return MO.getReg();
8532
8533 // If this could be a VGPR or an SGPR, check the dynamic register class.
8534 Register Reg = MO.getReg();
8535 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8536 if (RI.isSGPRClass(RegRC))
8537 UsedSGPRs[i] = Reg;
8538 }
8539
8540 // We don't have a required SGPR operand, so we have a bit more freedom in
8541 // selecting operands to move.
8542
8543 // Try to select the most used SGPR. If an SGPR is equal to one of the
8544 // others, we choose that.
8545 //
8546 // e.g.
8547 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8548 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8549
8550 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8551 // prefer those.
8552
8553 if (UsedSGPRs[0]) {
8554 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8555 SGPRReg = UsedSGPRs[0];
8556 }
8557
8558 if (!SGPRReg && UsedSGPRs[1]) {
8559 if (UsedSGPRs[1] == UsedSGPRs[2])
8560 SGPRReg = UsedSGPRs[1];
8561 }
8562
8563 return SGPRReg;
8564}
8565
8566MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
8567 unsigned OperandName) const {
8568 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8569 if (Idx == -1)
8570 return nullptr;
8571
8572 return &MI.getOperand(Idx);
8573}
8574
8580 return (Format << 44) |
8581 (1ULL << 56) | // RESOURCE_LEVEL = 1
8582 (3ULL << 60); // OOB_SELECT = 3
8583 }
8584
8585 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8586 if (ST.isAmdHsaOS()) {
8587 // Set ATC = 1. GFX9 doesn't have this bit.
8589 RsrcDataFormat |= (1ULL << 56);
8590
8591 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8592 // BTW, it disables TC L2 and therefore decreases performance.
8594 RsrcDataFormat |= (2ULL << 59);
8595 }
8596
8597 return RsrcDataFormat;
8598}
8599
8603 0xffffffff; // Size;
8604
8605 // GFX9 doesn't have ELEMENT_SIZE.
8607 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8608 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8609 }
8610
8611 // IndexStride = 64 or 32, depending on the wavefront size.
8612 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
8613 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8614
8615 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8616 // Clear them unless we want a huge stride.
8619 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8620
8621 return Rsrc23;
8622}
8623
8624bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
8625 unsigned Opc = MI.getOpcode();
8626
8627 return isSMRD(Opc);
8628}
8629
8630bool SIInstrInfo::isHighLatencyDef(int Opc) const {
8631 return get(Opc).mayLoad() &&
8632 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8633}
8634
8635Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
8636 int &FrameIndex) const {
8637 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8638 if (!Addr || !Addr->isFI())
8639 return Register();
8640
8641 assert(!MI.memoperands_empty() &&
8642 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8643
8644 FrameIndex = Addr->getIndex();
8645 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8646}
8647
8648Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
8649 int &FrameIndex) const {
8650 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8651 assert(Addr && Addr->isFI());
8652 FrameIndex = Addr->getIndex();
8653 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8654}
8655
8656Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
8657 int &FrameIndex) const {
8658 if (!MI.mayLoad())
8659 return Register();
8660
8661 if (isMUBUF(MI) || isVGPRSpill(MI))
8662 return isStackAccess(MI, FrameIndex);
8663
8664 if (isSGPRSpill(MI))
8665 return isSGPRStackAccess(MI, FrameIndex);
8666
8667 return Register();
8668}
8669
8670Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
8671 int &FrameIndex) const {
8672 if (!MI.mayStore())
8673 return Register();
8674
8675 if (isMUBUF(MI) || isVGPRSpill(MI))
8676 return isStackAccess(MI, FrameIndex);
8677
8678 if (isSGPRSpill(MI))
8679 return isSGPRStackAccess(MI, FrameIndex);
8680
8681 return Register();
8682}
8683
8685 unsigned Size = 0;
8687 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8688 while (++I != E && I->isInsideBundle()) {
8689 assert(!I->isBundle() && "No nested bundle!");
8691 }
8692
8693 return Size;
8694}
8695
8697 unsigned Opc = MI.getOpcode();
8699 unsigned DescSize = Desc.getSize();
8700
8701 // If we have a definitive size, we can use it. Otherwise we need to inspect
8702 // the operands to know the size.
8703 if (isFixedSize(MI)) {
8704 unsigned Size = DescSize;
8705
8706 // If we hit the buggy offset, an extra nop will be inserted in MC, so
8707 // estimate the worst case.
8708 if (MI.isBranch() && ST.hasOffset3fBug())
8709 Size += 4;
8710
8711 return Size;
8712 }
8713
8714 // Instructions may have a 32-bit literal encoded after them. Check
8715 // operands that could ever be literals.
8716 if (isVALU(MI) || isSALU(MI)) {
8717 if (isDPP(MI))
8718 return DescSize;
8719 bool HasLiteral = false;
8720 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8721 const MachineOperand &Op = MI.getOperand(I);
8722 const MCOperandInfo &OpInfo = Desc.operands()[I];
8723 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8724 HasLiteral = true;
8725 break;
8726 }
8727 }
8728 return HasLiteral ? DescSize + 4 : DescSize;
8729 }
8730
8731 // Check whether we have extra NSA words.
8732 if (isMIMG(MI)) {
8733 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8734 if (VAddr0Idx < 0)
8735 return 8;
8736
8737 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8738 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8739 }
8740
8741 switch (Opc) {
8742 case TargetOpcode::BUNDLE:
8743 return getInstBundleSize(MI);
8744 case TargetOpcode::INLINEASM:
8745 case TargetOpcode::INLINEASM_BR: {
8746 const MachineFunction *MF = MI.getParent()->getParent();
8747 const char *AsmStr = MI.getOperand(0).getSymbolName();
8748 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8749 }
8750 default:
8751 if (MI.isMetaInstruction())
8752 return 0;
8753 return DescSize;
8754 }
8755}
8756
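Two worked examples of the size computation above (the descriptor size and operand indices are hypothetical, chosen only for illustration):

  // A 4-byte VALU/SALU encoding that carries a 32-bit literal is reported as
  //   DescSize + 4 = 4 + 4 = 8 bytes.
  // For an NSA MIMG where vaddr0 is operand index 1 and srsrc is index 5:
  //   8 + 4 * ((5 - 1 + 2) / 4) = 8 + 4 * 1 = 12 bytes.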
8757bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
8758 if (!isFLAT(MI))
8759 return false;
8760
8761 if (MI.memoperands_empty())
8762 return true;
8763
8764 for (const MachineMemOperand *MMO : MI.memoperands()) {
8765 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8766 return true;
8767 }
8768 return false;
8769}
8770
8772 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
8773}
8774
8776 MachineBasicBlock *IfEnd) const {
8778 assert(TI != IfEntry->end());
8779
8780 MachineInstr *Branch = &(*TI);
8781 MachineFunction *MF = IfEntry->getParent();
8783
8784 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8785 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8786 MachineInstr *SIIF =
8787 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8788 .add(Branch->getOperand(0))
8789 .add(Branch->getOperand(1));
8790 MachineInstr *SIEND =
8791 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8792 .addReg(DstReg);
8793
8794 IfEntry->erase(TI);
8795 IfEntry->insert(IfEntry->end(), SIIF);
8796 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8797 }
8798}
8799
8801 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
8803 // We expect 2 terminators, one conditional and one unconditional.
8804 assert(TI != LoopEnd->end());
8805
8806 MachineInstr *Branch = &(*TI);
8807 MachineFunction *MF = LoopEnd->getParent();
8809
8810 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8811
8812 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8813 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
8814 MachineInstrBuilder HeaderPHIBuilder =
8815 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8816 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8817 if (PMBB == LoopEnd) {
8818 HeaderPHIBuilder.addReg(BackEdgeReg);
8819 } else {
8820 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
8821 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8822 ZeroReg, 0);
8823 HeaderPHIBuilder.addReg(ZeroReg);
8824 }
8825 HeaderPHIBuilder.addMBB(PMBB);
8826 }
8827 MachineInstr *HeaderPhi = HeaderPHIBuilder;
8828 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8829 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
8830 .addReg(DstReg)
8831 .add(Branch->getOperand(0));
8832 MachineInstr *SILOOP =
8833 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8834 .addReg(BackEdgeReg)
8835 .addMBB(LoopEntry);
8836
8837 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8838 LoopEnd->erase(TI);
8839 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8840 LoopEnd->insert(LoopEnd->end(), SILOOP);
8841 }
8842}
8843
8846 static const std::pair<int, const char *> TargetIndices[] = {
8847 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8848 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8849 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8850 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8851 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8852 return ArrayRef(TargetIndices);
8853}
8854
8855/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
8856/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8859 const ScheduleDAG *DAG) const {
8860 return new GCNHazardRecognizer(DAG->MF);
8861}
8862
8863/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8864/// pass.
8867 return new GCNHazardRecognizer(MF);
8868}
8869
8870// Called during:
8871// - pre-RA scheduling and post-RA scheduling
8874 const ScheduleDAGMI *DAG) const {
8875 // Borrowed from Arm Target
8876 // We would like to restrict this hazard recognizer to only
8877 // post-RA scheduling; we can tell that we're post-RA because we don't
8878 // track VRegLiveness.
8879 if (!DAG->hasVRegLiveness())
8880 return new GCNHazardRecognizer(DAG->MF);
8882}
8883
8884std::pair<unsigned, unsigned>
8886 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8887}
8888
8891 static const std::pair<unsigned, const char *> TargetFlags[] = {
8892 { MO_GOTPCREL, "amdgpu-gotprel" },
8893 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8894 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8895 { MO_REL32_LO, "amdgpu-rel32-lo" },
8896 { MO_REL32_HI, "amdgpu-rel32-hi" },
8897 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8898 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8899 };
8900
8901 return ArrayRef(TargetFlags);
8902}
8903
8906 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8907 {
8908 {MONoClobber, "amdgpu-noclobber"},
8909 {MOLastUse, "amdgpu-last-use"},
8910 };
8911
8912 return ArrayRef(TargetFlags);
8913}
8914
8916 const MachineFunction &MF) const {
8918 assert(SrcReg.isVirtual());
8919 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8920 return AMDGPU::WWM_COPY;
8921
8922 return AMDGPU::COPY;
8923}
8924
8926 Register Reg) const {
8927 // We need to handle instructions which may be inserted during register
8928 // allocation to handle the prolog. The initial prolog instruction may have
8929 // been separated from the start of the block by spills and copies that were
8930 // inserted for the prolog. However, the insertions for scalar registers can
8931 // always be placed at the BB top as they are independent of the exec mask
8932 // value.
8933 bool IsNullOrVectorRegister = true;
8934 if (Reg) {
8935 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8936 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8937 }
8938
8939 uint16_t Opcode = MI.getOpcode();
8940 // FIXME: Copies inserted in the block prolog for live-range split should also
8941 // be included.
8942 return IsNullOrVectorRegister &&
8943 (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8944 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8945}
8946
8950 const DebugLoc &DL,
8951 Register DestReg) const {
8952 if (ST.hasAddNoCarry())
8953 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8954
8956 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8957 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8958
8959 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8960 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8961}
8962
8965 const DebugLoc &DL,
8966 Register DestReg,
8967 RegScavenger &RS) const {
8968 if (ST.hasAddNoCarry())
8969 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
8970
8971 // If available, prefer to use vcc.
8972 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
8973 ? Register(RI.getVCC())
8975 *RI.getBoolRC(), I, /* RestoreAfter */ false,
8976 0, /* AllowSpill */ false);
8977
8978 // TODO: Users need to deal with this.
8979 if (!UnusedCarry.isValid())
8980 return MachineInstrBuilder();
8981
8982 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8983 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8984}
8985
8986bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
8987 switch (Opcode) {
8988 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
8989 case AMDGPU::SI_KILL_I1_TERMINATOR:
8990 return true;
8991 default:
8992 return false;
8993 }
8994}
8995
8997 switch (Opcode) {
8998 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
8999 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9000 case AMDGPU::SI_KILL_I1_PSEUDO:
9001 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9002 default:
9003 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9004 }
9005}
9006
9007bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9008 return Imm <= getMaxMUBUFImmOffset(ST);
9009}
9010
9011unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9012 // GFX12: the field is a 24-bit signed byte offset, but only non-negative values (23 usable bits) are accepted here.
9013 const unsigned OffsetBits =
9014 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9015 return (1 << OffsetBits) - 1;
9016}
9017
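The concrete limits implied by the code above (worked values only, not additional API):

  // pre-GFX12: (1 << 12) - 1 = 4095
  // GFX12+:    (1 << 23) - 1 = 8388607 (the 24-bit signed field only accepts
  //            non-negative values here, hence 23 usable bits)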
9019 if (!ST.isWave32())
9020 return;
9021
9022 if (MI.isInlineAsm())
9023 return;
9024
9025 for (auto &Op : MI.implicit_operands()) {
9026 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9027 Op.setReg(AMDGPU::VCC_LO);
9028 }
9029}
9030
9032 if (!isSMRD(MI))
9033 return false;
9034
9035 // Check that it is using a buffer resource.
9036 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9037 if (Idx == -1) // e.g. s_memtime
9038 return false;
9039
9040 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9041 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9042}
9043
9044// Given Imm, split it into the values to put into the SOffset and ImmOffset
9045// fields in an MUBUF instruction. Return false if it is not possible (due to a
9046// hardware bug needing a workaround).
9047//
9048// The required alignment ensures that individual address components remain
9049// aligned if they are aligned to begin with. It also ensures that additional
9050// offsets within the given alignment can be added to the resulting ImmOffset.
9052 uint32_t &ImmOffset, Align Alignment) const {
9053 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9054 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9055 uint32_t Overflow = 0;
9056
9057 if (Imm > MaxImm) {
9058 if (Imm <= MaxImm + 64) {
9059 // Use an SOffset inline constant for 4..64
9060 Overflow = Imm - MaxImm;
9061 Imm = MaxImm;
9062 } else {
9063 // Try to keep the same value in SOffset for adjacent loads, so that
9064 // the corresponding register contents can be re-used.
9065 //
9066 // Load values with all low-bits (except for alignment bits) set into
9067 // SOffset, so that a larger range of values can be covered using
9068 // s_movk_i32.
9069 //
9070 // Atomic operations fail to work correctly when individual address
9071 // components are unaligned, even if their sum is aligned.
9072 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9073 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9074 Imm = Low;
9075 Overflow = High - Alignment.value();
9076 }
9077 }
9078
9079 if (Overflow > 0) {
9080 // There is a hardware bug in SI and CI which prevents address clamping in
9081 // MUBUF instructions from working correctly with SOffsets. The immediate
9082 // offset is unaffected.
9084 return false;
9085
9086 // It is not possible to set an immediate in the SOffset field on some targets.
9087 if (ST.hasRestrictedSOffset())
9088 return false;
9089 }
9090
9091 ImmOffset = Imm;
9092 SOffset = Overflow;
9093 return true;
9094}
9095
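Two worked examples of the split above, assuming a pre-GFX12 target (maximum immediate 4095), Alignment = 4, and a target without the SI/CI SOffset bug (assumptions for illustration only):

  // Imm = 4100:  MaxImm = alignDown(4095, 4) = 4092; 4100 <= 4092 + 64, so
  //              ImmOffset = 4092 and SOffset = 8 (an inline constant).
  // Imm = 10000: High = (10000 + 4) & ~4095 = 8192, Low = (10000 + 4) & 4095 = 1812,
  //              so ImmOffset = 1812 and SOffset = 8192 - 4 = 8188. Note that
  //              8188 + 1812 = 10000 and the low bits of SOffset are all set
  //              except for the alignment bits.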
9096// Depending on the used address space and instructions, some immediate offsets
9097// are allowed and some are not.
9098// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9099// scratch instruction offsets can also be negative. On GFX12, offsets can be
9100// negative for all variants.
9101//
9102// There are several bugs related to these offsets:
9103// On gfx10.1, flat instructions that go into the global address space cannot
9104// use an offset.
9105//
9106// For scratch instructions, the address can be either an SGPR or a VGPR.
9107// The following offsets can be used, depending on the architecture (x means
9108// cannot be used):
9109// +----------------------------+------+------+
9110// | Address-Mode | SGPR | VGPR |
9111// +----------------------------+------+------+
9112// | gfx9 | | |
9113// | negative, 4-aligned offset | x | ok |
9114// | negative, unaligned offset | x | ok |
9115// +----------------------------+------+------+
9116// | gfx10 | | |
9117// | negative, 4-aligned offset | ok | ok |
9118// | negative, unaligned offset | ok | x |
9119// +----------------------------+------+------+
9120// | gfx10.3 | | |
9121// | negative, 4-aligned offset | ok | ok |
9122// | negative, unaligned offset | ok | ok |
9123// +----------------------------+------+------+
9124//
9125// This function ignores the addressing mode, so if an offset cannot be used in
9126// one addressing mode, it is considered illegal.
9127bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9128 uint64_t FlatVariant) const {
9129 // TODO: Should 0 be special cased?
9130 if (!ST.hasFlatInstOffsets())
9131 return false;
9132
9133 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9134 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9135 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9136 return false;
9137
9139 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9140 (Offset % 4) != 0) {
9141 return false;
9142 }
9143
9144 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9145 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9146 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9147}
9148
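A minimal usage sketch (an assumed caller, not code from this file): a pass that wants to fold a byte offset into a scratch access would query legality for the address space and FLAT variant it is emitting.

  if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS,
                             SIInstrFlags::FlatScratch)) {
    // Offset fits the immediate field for scratch; fold it into the instruction.
  }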
9149// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9150std::pair<int64_t, int64_t>
9151SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9152 uint64_t FlatVariant) const {
9153 int64_t RemainderOffset = COffsetVal;
9154 int64_t ImmField = 0;
9155
9156 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9157 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9158
9159 if (AllowNegative) {
9160 // Use signed division by a power of two to truncate towards 0.
9161 int64_t D = 1LL << NumBits;
9162 RemainderOffset = (COffsetVal / D) * D;
9163 ImmField = COffsetVal - RemainderOffset;
9164
9166 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9167 (ImmField % 4) != 0) {
9168 // Make ImmField a multiple of 4
9169 RemainderOffset += ImmField % 4;
9170 ImmField -= ImmField % 4;
9171 }
9172 } else if (COffsetVal >= 0) {
9173 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9174 RemainderOffset = COffsetVal - ImmField;
9175 }
9176
9177 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9178 assert(RemainderOffset + ImmField == COffsetVal);
9179 return {ImmField, RemainderOffset};
9180}
9181
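A worked example of the split above, assuming a target with a 13-bit signed offset field and a variant that allows negative offsets (both assumptions for illustration only): NumBits = 12, so the divisor is 4096.

  // COffsetVal =  9000:  RemainderOffset = ( 9000 / 4096) * 4096 =  8192, ImmField =  808
  // COffsetVal = -9000:  RemainderOffset = (-9000 / 4096) * 4096 = -8192, ImmField = -808
  // In both cases RemainderOffset + ImmField == COffsetVal, and ImmField fits
  // the signed immediate field.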
9183 if (ST.hasNegativeScratchOffsetBug() &&
9184 FlatVariant == SIInstrFlags::FlatScratch)
9185 return false;
9186
9187 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9188}
9189
9190static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9191 switch (ST.getGeneration()) {
9192 default:
9193 break;
9196 return SIEncodingFamily::SI;
9199 return SIEncodingFamily::VI;
9206 }
9207 llvm_unreachable("Unknown subtarget generation!");
9208}
9209
9210bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9211 switch(MCOp) {
9212 // These opcodes use indirect register addressing so
9213 // they need special handling by codegen (currently missing).
9214 // Therefore it is too risky to allow these opcodes
9215 // to be selected by the DPP combiner or the SDWA peephole pass.
9216 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9217 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9218 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9219 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9220 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9221 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9222 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9223 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9224 return true;
9225 default:
9226 return false;
9227 }
9228}
9229
9230int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9231 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9232
9233 unsigned Gen = subtargetEncodingFamily(ST);
9234
9235 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
9238
9239 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9240 // subtarget has the UnpackedD16VMem feature.
9241 // TODO: remove this when we discard GFX80 encoding.
9242 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9244
9245 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9246 switch (ST.getGeneration()) {
9247 default:
9249 break;
9252 break;
9255 break;
9256 }
9257 }
9258
9259 if (isMAI(Opcode)) {
9260 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9261 if (MFMAOp != -1)
9262 Opcode = MFMAOp;
9263 }
9264
9265 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9266
9267 // -1 means that Opcode is already a native instruction.
9268 if (MCOp == -1)
9269 return Opcode;
9270
9271 if (ST.hasGFX90AInsts()) {
9272 uint16_t NMCOp = (uint16_t)-1;
9273 if (ST.hasGFX940Insts())
9275 if (NMCOp == (uint16_t)-1)
9277 if (NMCOp == (uint16_t)-1)
9279 if (NMCOp != (uint16_t)-1)
9280 MCOp = NMCOp;
9281 }
9282
9283 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9284 // no encoding in the given subtarget generation.
9285 if (MCOp == (uint16_t)-1)
9286 return -1;
9287
9288 if (isAsmOnlyOpcode(MCOp))
9289 return -1;
9290
9291 return MCOp;
9292}
9293
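A minimal usage sketch (an assumed caller, not code from this file): emission code asks for the subtarget-specific MC opcode and treats -1 as "cannot be encoded on this generation".

  int MCOp = TII->pseudoToMCOpcode(MI.getOpcode());
  if (MCOp == -1) {
    // No encoding exists for this pseudo on the current subtarget; it must be
    // expanded or rejected before instruction emission.
  }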
9294static
9296 assert(RegOpnd.isReg());
9297 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9298 getRegSubRegPair(RegOpnd);
9299}
9300
9303 assert(MI.isRegSequence());
9304 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9305 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9306 auto &RegOp = MI.getOperand(1 + 2 * I);
9307 return getRegOrUndef(RegOp);
9308 }
9310}
9311
9312// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9313// Following a subreg of reg:subreg isn't supported
9316 if (!RSR.SubReg)
9317 return false;
9318 switch (MI.getOpcode()) {
9319 default: break;
9320 case AMDGPU::REG_SEQUENCE:
9321 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9322 return true;
9323 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg.
9324 case AMDGPU::INSERT_SUBREG:
9325 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9326 // inserted the subreg we're looking for
9327 RSR = getRegOrUndef(MI.getOperand(2));
9328 else { // the subreg in the rest of the reg
9329 auto R1 = getRegOrUndef(MI.getOperand(1));
9330 if (R1.SubReg) // subreg of subreg isn't supported
9331 return false;
9332 RSR.Reg = R1.Reg;
9333 }
9334 return true;
9335 }
9336 return false;
9337}
9338
9341 assert(MRI.isSSA());
9342 if (!P.Reg.isVirtual())
9343 return nullptr;
9344
9345 auto RSR = P;
9346 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9347 while (auto *MI = DefInst) {
9348 DefInst = nullptr;
9349 switch (MI->getOpcode()) {
9350 case AMDGPU::COPY:
9351 case AMDGPU::V_MOV_B32_e32: {
9352 auto &Op1 = MI->getOperand(1);
9353 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9354 if (Op1.isUndef())
9355 return nullptr;
9356 RSR = getRegSubRegPair(Op1);
9357 DefInst = MRI.getVRegDef(RSR.Reg);
9358 }
9359 break;
9360 }
9361 default:
9362 if (followSubRegDef(*MI, RSR)) {
9363 if (!RSR.Reg)
9364 return nullptr;
9365 DefInst = MRI.getVRegDef(RSR.Reg);
9366 }
9367 }
9368 if (!DefInst)
9369 return MI;
9370 }
9371 return nullptr;
9372}
9373
9375 Register VReg,
9376 const MachineInstr &DefMI,
9377 const MachineInstr &UseMI) {
9378 assert(MRI.isSSA() && "Must be run on SSA");
9379
9380 auto *TRI = MRI.getTargetRegisterInfo();
9381 auto *DefBB = DefMI.getParent();
9382
9383 // Don't bother searching between blocks, although it is possible this block
9384 // doesn't modify exec.
9385 if (UseMI.getParent() != DefBB)
9386 return true;
9387
9388 const int MaxInstScan = 20;
9389 int NumInst = 0;
9390
9391 // Stop scan at the use.
9392 auto E = UseMI.getIterator();
9393 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9394 if (I->isDebugInstr())
9395 continue;
9396
9397 if (++NumInst > MaxInstScan)
9398 return true;
9399
9400 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9401 return true;
9402 }
9403
9404 return false;
9405}
9406
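A hedged sketch of the intended use (assumed caller context, not code from this file): folding the value defined by DefMI into UseMI is only safe when EXEC cannot have changed in between, because a changed exec mask alters which lanes the VALU result is valid for.

  if (!execMayBeModifiedBeforeUse(MRI, VReg, DefMI, UseMI)) {
    // EXEC is provably unchanged between def and use; the fold is safe.
  }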
9408 Register VReg,
9409 const MachineInstr &DefMI) {
9410 assert(MRI.isSSA() && "Must be run on SSA");
9411
9412 auto *TRI = MRI.getTargetRegisterInfo();
9413 auto *DefBB = DefMI.getParent();
9414
9415 const int MaxUseScan = 10;
9416 int NumUse = 0;
9417
9418 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9419 auto &UseInst = *Use.getParent();
9420 // Don't bother searching between blocks, although it is possible this block
9421 // doesn't modify exec.
9422 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9423 return true;
9424
9425 if (++NumUse > MaxUseScan)
9426 return true;
9427 }
9428
9429 if (NumUse == 0)
9430 return false;
9431
9432 const int MaxInstScan = 20;
9433 int NumInst = 0;
9434
9435 // Stop scan when we have seen all the uses.
9436 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9437 assert(I != DefBB->end());
9438
9439 if (I->isDebugInstr())
9440 continue;
9441
9442 if (++NumInst > MaxInstScan)
9443 return true;
9444
9445 for (const MachineOperand &Op : I->operands()) {
9446 // We don't check reg masks here as they're used only on calls:
9447 // 1. EXEC is only considered const within one BB
9448 // 2. Call should be a terminator instruction if present in a BB
9449
9450 if (!Op.isReg())
9451 continue;
9452
9453 Register Reg = Op.getReg();
9454 if (Op.isUse()) {
9455 if (Reg == VReg && --NumUse == 0)
9456 return false;
9457 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9458 return true;
9459 }
9460 }
9461}
9462
9465 const DebugLoc &DL, Register Src, Register Dst) const {
9466 auto Cur = MBB.begin();
9467 if (Cur != MBB.end())
9468 do {
9469 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9470 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9471 ++Cur;
9472 } while (Cur != MBB.end() && Cur != LastPHIIt);
9473
9474 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9475 Dst);
9476}
9477
9480 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9481 if (InsPt != MBB.end() &&
9482 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9483 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9484 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9485 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9486 InsPt++;
9487 return BuildMI(MBB, InsPt, DL,
9488 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9489 : AMDGPU::S_MOV_B64_term),
9490 Dst)
9491 .addReg(Src, 0, SrcSubReg)
9492 .addReg(AMDGPU::EXEC, RegState::Implicit);
9493 }
9494 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9495 Dst);
9496}
9497
9498bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9499
9502 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9503 VirtRegMap *VRM) const {
9504 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9505 //
9506 // %0:sreg_32 = COPY $m0
9507 //
9508 // We explicitly chose SReg_32 for the virtual register so such a copy might
9509 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9510 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9511 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9512 // TargetInstrInfo::foldMemoryOperand() is going to try.
9513 // A similar issue also exists with spilling and reloading $exec registers.
9514 //
9515 // To prevent that, constrain the %0 register class here.
9516 if (isFullCopyInstr(MI)) {
9517 Register DstReg = MI.getOperand(0).getReg();
9518 Register SrcReg = MI.getOperand(1).getReg();
9519 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9520 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9522 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9523 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9524 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9525 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9526 return nullptr;
9527 }
9528 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9529 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9530 return nullptr;
9531 }
9532 }
9533 }
9534
9535 return nullptr;
9536}
9537
9539 const MachineInstr &MI,
9540 unsigned *PredCost) const {
9541 if (MI.isBundle()) {
9543 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9544 unsigned Lat = 0, Count = 0;
9545 for (++I; I != E && I->isBundledWithPred(); ++I) {
9546 ++Count;
9547 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9548 }
9549 return Lat + Count - 1;
9550 }
9551
9552 return SchedModel.computeInstrLatency(&MI);
9553}
9554
9557 unsigned opcode = MI.getOpcode();
9558 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9559 auto IID = GI->getIntrinsicID();
9564
9565 switch (IID) {
9566 case Intrinsic::amdgcn_if:
9567 case Intrinsic::amdgcn_else:
9568 // FIXME: Uniform if second result
9569 break;
9570 }
9571
9573 }
9574
9575 // Loads from the private and flat address spaces are divergent, because
9576 // threads can execute the load instruction with the same inputs and get
9577 // different results.
9578 //
9579 // All other loads are not divergent, because if threads issue loads with the
9580 // same arguments, they will always get the same result.
9581 if (opcode == AMDGPU::G_LOAD) {
9582 if (MI.memoperands_empty())
9583 return InstructionUniformity::NeverUniform; // conservative assumption
9584
9585 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9586 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9587 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9588 })) {
9589 // At least one MMO in a non-global address space.
9591 }
9593 }
9594
9595 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9596 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9597 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9598 AMDGPU::isGenericAtomic(opcode)) {
9600 }
9602}
9603
9606
9607 if (isNeverUniform(MI))
9609
9610 unsigned opcode = MI.getOpcode();
9611 if (opcode == AMDGPU::V_READLANE_B32 ||
9612 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9613 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9615
9616 if (isCopyInstr(MI)) {
9617 const MachineOperand &srcOp = MI.getOperand(1);
9618 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9619 const TargetRegisterClass *regClass =
9620 RI.getPhysRegBaseClass(srcOp.getReg());
9623 }
9625 }
9626
9627 // GMIR handling
9628 if (MI.isPreISelOpcode())
9630
9631 // Atomics are divergent because they are executed sequentially: when an
9632 // atomic operation refers to the same address in each thread, then each
9633 // thread after the first sees the value written by the previous thread as
9634 // the original value.
9635
9636 if (isAtomic(MI))
9638
9639 // Loads from the private and flat address spaces are divergent, because
9640 // threads can execute the load instruction with the same inputs and get
9641 // different results.
9642 if (isFLAT(MI) && MI.mayLoad()) {
9643 if (MI.memoperands_empty())
9644 return InstructionUniformity::NeverUniform; // conservative assumption
9645
9646 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9647 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9648 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9649 })) {
9650 // At least one MMO in a non-global address space.
9652 }
9653
9655 }
9656
9657 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9658 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9659
9660 // FIXME: It's conceptually broken to report this for an instruction, and not
9661 // a specific def operand. For inline asm in particular, there could be mixed
9662 // uniform and divergent results.
9663 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9664 const MachineOperand &SrcOp = MI.getOperand(I);
9665 if (!SrcOp.isReg())
9666 continue;
9667
9668 Register Reg = SrcOp.getReg();
9669 if (!Reg || !SrcOp.readsReg())
9670 continue;
9671
9672 // If RegBank is null, this is unassigned or an unallocatable special
9673 // register, which are all scalars.
9674 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9675 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9677 }
9678
9679 // TODO: The uniformity check conditions above can be rearranged for more
9680 // readability.
9681
9682 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9683 // currently turned into no-op COPYs by SelectionDAG ISel and are
9684 // therefore no longer recognizable.
9685
9687}
9688
9690 switch (MF.getFunction().getCallingConv()) {
9692 return 1;
9694 return 2;
9696 return 3;
9700 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9703 case CallingConv::C:
9704 case CallingConv::Fast:
9705 default:
9706 // Assume other calling conventions are various compute callable functions
9707 return 0;
9708 }
9709}
9710
9712 Register &SrcReg2, int64_t &CmpMask,
9713 int64_t &CmpValue) const {
9714 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9715 return false;
9716
9717 switch (MI.getOpcode()) {
9718 default:
9719 break;
9720 case AMDGPU::S_CMP_EQ_U32:
9721 case AMDGPU::S_CMP_EQ_I32:
9722 case AMDGPU::S_CMP_LG_U32:
9723 case AMDGPU::S_CMP_LG_I32:
9724 case AMDGPU::S_CMP_LT_U32:
9725 case AMDGPU::S_CMP_LT_I32:
9726 case AMDGPU::S_CMP_GT_U32:
9727 case AMDGPU::S_CMP_GT_I32:
9728 case AMDGPU::S_CMP_LE_U32:
9729 case AMDGPU::S_CMP_LE_I32:
9730 case AMDGPU::S_CMP_GE_U32:
9731 case AMDGPU::S_CMP_GE_I32:
9732 case AMDGPU::S_CMP_EQ_U64:
9733 case AMDGPU::S_CMP_LG_U64:
9734 SrcReg = MI.getOperand(0).getReg();
9735 if (MI.getOperand(1).isReg()) {
9736 if (MI.getOperand(1).getSubReg())
9737 return false;
9738 SrcReg2 = MI.getOperand(1).getReg();
9739 CmpValue = 0;
9740 } else if (MI.getOperand(1).isImm()) {
9741 SrcReg2 = Register();
9742 CmpValue = MI.getOperand(1).getImm();
9743 } else {
9744 return false;
9745 }
9746 CmpMask = ~0;
9747 return true;
9748 case AMDGPU::S_CMPK_EQ_U32:
9749 case AMDGPU::S_CMPK_EQ_I32:
9750 case AMDGPU::S_CMPK_LG_U32:
9751 case AMDGPU::S_CMPK_LG_I32:
9752 case AMDGPU::S_CMPK_LT_U32:
9753 case AMDGPU::S_CMPK_LT_I32:
9754 case AMDGPU::S_CMPK_GT_U32:
9755 case AMDGPU::S_CMPK_GT_I32:
9756 case AMDGPU::S_CMPK_LE_U32:
9757 case AMDGPU::S_CMPK_LE_I32:
9758 case AMDGPU::S_CMPK_GE_U32:
9759 case AMDGPU::S_CMPK_GE_I32:
9760 SrcReg = MI.getOperand(0).getReg();
9761 SrcReg2 = Register();
9762 CmpValue = MI.getOperand(1).getImm();
9763 CmpMask = ~0;
9764 return true;
9765 }
9766
9767 return false;
9768}
9769
9771 Register SrcReg2, int64_t CmpMask,
9772 int64_t CmpValue,
9773 const MachineRegisterInfo *MRI) const {
9774 if (!SrcReg || SrcReg.isPhysical())
9775 return false;
9776
9777 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9778 return false;
9779
9780 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9781 this](int64_t ExpectedValue, unsigned SrcSize,
9782 bool IsReversible, bool IsSigned) -> bool {
9783 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9784 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9785 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9786 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9787 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9788 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9789 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9790 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9791 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9792 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9793 //
9794 // Signed ge/gt are not used for the sign bit.
9795 //
9796 // If result of the AND is unused except in the compare:
9797 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9798 //
9799 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9800 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9801 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9802 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9803 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9804 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9805
9806 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9807 if (!Def || Def->getParent() != CmpInstr.getParent())
9808 return false;
9809
9810 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9811 Def->getOpcode() != AMDGPU::S_AND_B64)
9812 return false;
9813
9814 int64_t Mask;
9815 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9816 if (MO->isImm())
9817 Mask = MO->getImm();
9818 else if (!getFoldableImm(MO, Mask))
9819 return false;
9820 Mask &= maxUIntN(SrcSize);
9821 return isPowerOf2_64(Mask);
9822 };
9823
9824 MachineOperand *SrcOp = &Def->getOperand(1);
9825 if (isMask(SrcOp))
9826 SrcOp = &Def->getOperand(2);
9827 else if (isMask(&Def->getOperand(2)))
9828 SrcOp = &Def->getOperand(1);
9829 else
9830 return false;
9831
9832 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9833 if (IsSigned && BitNo == SrcSize - 1)
9834 return false;
9835
9836 ExpectedValue <<= BitNo;
9837
9838 bool IsReversedCC = false;
9839 if (CmpValue != ExpectedValue) {
9840 if (!IsReversible)
9841 return false;
9842 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9843 if (!IsReversedCC)
9844 return false;
9845 }
9846
9847 Register DefReg = Def->getOperand(0).getReg();
9848 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9849 return false;
9850
9851 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9852 I != E; ++I) {
9853 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9854 I->killsRegister(AMDGPU::SCC, &RI))
9855 return false;
9856 }
9857
9858 MachineOperand *SccDef =
9859 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9860 SccDef->setIsDead(false);
9861 CmpInstr.eraseFromParent();
9862
9863 if (!MRI->use_nodbg_empty(DefReg)) {
9864 assert(!IsReversedCC);
9865 return true;
9866 }
9867
9868 // Replace AND with unused result with a S_BITCMP.
9869 MachineBasicBlock *MBB = Def->getParent();
9870
9871 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9872 : AMDGPU::S_BITCMP1_B32
9873 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9874 : AMDGPU::S_BITCMP1_B64;
9875
9876 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9877 .add(*SrcOp)
9878 .addImm(BitNo);
9879 Def->eraseFromParent();
9880
9881 return true;
9882 };
9883
9884 switch (CmpInstr.getOpcode()) {
9885 default:
9886 break;
9887 case AMDGPU::S_CMP_EQ_U32:
9888 case AMDGPU::S_CMP_EQ_I32:
9889 case AMDGPU::S_CMPK_EQ_U32:
9890 case AMDGPU::S_CMPK_EQ_I32:
9891 return optimizeCmpAnd(1, 32, true, false);
9892 case AMDGPU::S_CMP_GE_U32:
9893 case AMDGPU::S_CMPK_GE_U32:
9894 return optimizeCmpAnd(1, 32, false, false);
9895 case AMDGPU::S_CMP_GE_I32:
9896 case AMDGPU::S_CMPK_GE_I32:
9897 return optimizeCmpAnd(1, 32, false, true);
9898 case AMDGPU::S_CMP_EQ_U64:
9899 return optimizeCmpAnd(1, 64, true, false);
9900 case AMDGPU::S_CMP_LG_U32:
9901 case AMDGPU::S_CMP_LG_I32:
9902 case AMDGPU::S_CMPK_LG_U32:
9903 case AMDGPU::S_CMPK_LG_I32:
9904 return optimizeCmpAnd(0, 32, true, false);
9905 case AMDGPU::S_CMP_GT_U32:
9906 case AMDGPU::S_CMPK_GT_U32:
9907 return optimizeCmpAnd(0, 32, false, false);
9908 case AMDGPU::S_CMP_GT_I32:
9909 case AMDGPU::S_CMPK_GT_I32:
9910 return optimizeCmpAnd(0, 32, false, true);
9911 case AMDGPU::S_CMP_LG_U64:
9912 return optimizeCmpAnd(0, 64, true, false);
9913 }
9914
9915 return false;
9916}
9917
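A concrete instance of one pattern from the list above (a simplified sketch, not an actual compiler dump):

  // before: compare of an AND with the single-bit mask 1 << 2
  //   %m = S_AND_B32 %src, 4, implicit-def $scc
  //   S_CMP_LG_U32 %m, 0, implicit-def $scc
  // after, when %m has no use other than the compare: the compare is erased and
  // the AND becomes a bit test that sets SCC directly
  //   S_BITCMP1_B32 %src, 2, implicit-def $scc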
9919 unsigned OpName) const {
9920 if (!ST.needsAlignedVGPRs())
9921 return;
9922
9923 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
9924 if (OpNo < 0)
9925 return;
9926 MachineOperand &Op = MI.getOperand(OpNo);
9927 if (getOpSize(MI, OpNo) > 4)
9928 return;
9929
9930 // Add implicit aligned super-reg to force alignment on the data operand.
9931 const DebugLoc &DL = MI.getDebugLoc();
9932 MachineBasicBlock *BB = MI.getParent();
9934 Register DataReg = Op.getReg();
9935 bool IsAGPR = RI.isAGPR(MRI, DataReg);
9936 Register Undef = MRI.createVirtualRegister(
9937 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
9938 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
9939 Register NewVR =
9940 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
9941 : &AMDGPU::VReg_64_Align2RegClass);
9942 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
9943 .addReg(DataReg, 0, Op.getSubReg())
9944 .addImm(AMDGPU::sub0)
9945 .addReg(Undef)
9946 .addImm(AMDGPU::sub1);
9947 Op.setReg(NewVR);
9948 Op.setSubReg(AMDGPU::sub0);
9949 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
9950}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
amdgpu AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:82
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:73
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:78
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:807
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:281
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:380
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:745
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:749
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:999
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:391
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:627
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:765
bool hasMAIInsts() const
Definition: GCNSubtarget.h:815
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:278
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:298
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:761
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:680
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:753
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:344
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
Generation getGeneration() const
Definition: GCNSubtarget.h:317
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:924
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:732
bool hasAddr64() const
Definition: GCNSubtarget.h:381
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:724
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:537
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:607
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:617
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:193
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:393
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
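The MCExpr and MCSymbol entries above are the building blocks for describing a branch distance symbolically, roughly the shape used when an offset cannot be computed until layout. A hedged sketch; Ctx, DestSym and PostBranchSym are assumed to be provided by the caller:
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
using namespace llvm;

// Bind a temporary symbol to (DestSym - PostBranchSym) >> 2, i.e. the signed
// distance in 32-bit words between a branch target and the point just after
// the branch.
static MCSymbol *makeOffsetSymbol(MCContext &Ctx, MCSymbol *DestSym,
                                  MCSymbol *PostBranchSym) {
  const MCExpr *Dest = MCSymbolRefExpr::create(DestSym, Ctx);
  const MCExpr *Post = MCSymbolRefExpr::create(PostBranchSym, Ctx);
  const MCExpr *Diff = MCBinaryExpr::createSub(Dest, Post, Ctx);
  const MCExpr *Words =
      MCBinaryExpr::createAShr(Diff, MCConstantExpr::create(2, Ctx), Ctx);
  MCSymbol *OffsetSym = Ctx.createTempSymbol();
  OffsetSym->setVariableValue(Words);
  return OffsetSym;
}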
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
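MIBundleBuilder glues already-created instructions into a single bundle at an insertion point. A minimal sketch, assuming OpcA and OpcB are opcodes whose descriptors need no explicit operands (hypothetical parameters):
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Build two (operand-less) instructions and emit them as one bundle at
// InsertPt.
static void emitBundle(MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator InsertPt,
                       const TargetInstrInfo &TII, const DebugLoc &DL,
                       unsigned OpcA, unsigned OpcB) {
  MachineFunction &MF = *MBB.getParent();
  MIBundleBuilder Bundle(MBB, InsertPt);
  Bundle.append(BuildMI(MF, DL, TII.get(OpcA)).getInstr());
  Bundle.append(BuildMI(MF, DL, TII.get(OpcB)).getInstr());
}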
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
iterator getFirstNonPHI()
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split this basic block into two pieces at SplitInst.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB)
addNewBlock - Add a new node to the dominator tree information.
bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
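getMachineMemOperand is how a pass materializes memory-operand metadata for loads and stores it creates, e.g. for spill code. A small sketch for a 4-byte load from a fixed stack slot; MF and FrameIndex are assumed to come from the caller, and the LLT include path reflects recent LLVM layouts:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Describe a 4-byte, 4-byte-aligned load from the stack slot FrameIndex.
static MachineMemOperand *makeStackLoadMMO(MachineFunction &MF,
                                           int FrameIndex) {
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getFixedStack(MF, FrameIndex);
  return MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                 LLT::scalar(32), Align(4));
}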
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
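The add* helpers above are normally chained off BuildMI. A minimal sketch that emits an S_ADD_U32 of a register and an immediate; the AMDGPU opcode enum is assumed to be available as it is inside the backend, and all parameters are hypothetical:
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Dst = Src + Imm. The implicit SCC def comes from the opcode's descriptor,
// so only the explicit operands are added here.
static void emitAddImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                       const DebugLoc &DL, const TargetInstrInfo &TII,
                       Register Dst, Register Src, int64_t Imm) {
  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ADD_U32), Dst)
      .addReg(Src)
      .addImm(Imm);
}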
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:572
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:691
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:815
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:800
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:782
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:498
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:699
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:391
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound=false)
We have determined MI defined a register without a use.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
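The ChangeTo* mutators are what immediate-folding style rewrites use to edit an operand in place; whether the resulting instruction is still encodable must be checked separately (see isImmOperandLegal / isOperandLegal below). A small sketch with hypothetical parameters:
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Rewrite operand OpIdx of MI to the known constant Val, in place.
static void foldKnownConstant(MachineInstr &MI, unsigned OpIdx, int64_t Val) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  if (MO.isReg() && !MO.isDef())
    MO.ChangeToImmediate(Val);   // register use becomes an immediate
  else if (MO.isImm())
    MO.setImm(Val);              // already an immediate; just update it
}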
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
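RegScavenger is driven backwards over a block to find a register that is free near an insertion point; the long-branch expansion in this file uses this pattern. A minimal sketch with caller-supplied parameters:
#include "llvm/CodeGen/RegisterScavenging.h"
using namespace llvm;

// Find a register of class RC that is free just before InsertPt, walking
// liveness backwards from the end of MBB. Returns an invalid Register if
// nothing is free and spilling is not allowed.
static Register findFreeReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator InsertPt,
                            const TargetRegisterClass &RC, RegScavenger &RS) {
  RS.enterBasicBlockEnd(MBB);
  return RS.scavengeRegisterBackwards(RC, InsertPt, /*RestoreAfter=*/false,
                                      /*SPAdj=*/0, /*AllowSpill=*/false);
}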
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
bool isNonUniformBranchInstr(MachineInstr &Instr) const
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:924
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo of the given instruction opcode.
Definition: SIInstrInfo.h:1144
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1272
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIdx operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
bool isSpill(uint16_t Opcode) const
Definition: SIInstrInfo.h:740
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand with the given name, or nullptr if the instruction does not have it.
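getNamedOperand, together with getNamedOperandIdx further down, is the usual way to inspect AMDGPU operands by name rather than by position. A small sketch; the AMDGPU-internal headers (SIInstrInfo.h and the generated operand-name table) are assumed to be on the include path, and the 'offset' operand is assumed to be an immediate:
using namespace llvm;

// Return the byte offset encoded in MI, or 0 if MI has no 'offset' operand.
static int64_t getEncodedOffset(const SIInstrInfo &TII, MachineInstr &MI) {
  if (const MachineOperand *Off =
          TII.getNamedOperand(MI, AMDGPU::OpName::offset))
    return Off->getImm();
  return 0;
}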
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:957
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specified machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by the assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:716
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT-encoded instruction.
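isLegalFLATOffset is usually paired with splitFlatOffset (listed earlier): check whether an addressing offset fits the FLAT encoding, and if not, split it into an encodable field plus a remainder to fold into the address register. A hedged sketch; AMDGPUAS::GLOBAL_ADDRESS and SIInstrFlags::FlatGlobal name the address space and FLAT variant in the in-tree AMDGPU headers, which are assumed to be included:
#include <utility>
using namespace llvm;

// Return {encodable immediate, leftover to fold into the address register}
// for a global FLAT access with byte offset Offset.
static std::pair<int64_t, int64_t>
splitForFlatGlobal(const SIInstrInfo &TII, int64_t Offset) {
  if (TII.isLegalFLATOffset(Offset, AMDGPUAS::GLOBAL_ADDRESS,
                            SIInstrFlags::FlatGlobal))
    return {Offset, 0};
  return TII.splitFlatOffset(Offset, AMDGPUAS::GLOBAL_ADDRESS,
                             SIInstrFlags::FlatGlobal);
}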
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:1003
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineBasicBlock *IfEnd) const
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT instruction to its VADDR form if the saddr operand was moved to a VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:681
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:863
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:728
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
bool isBarrier(unsigned Opcode) const
Definition: SIInstrInfo.h:939
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1285
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:880
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:63
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:65
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:237
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:587
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1563
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1564
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU-specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1566
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point operands.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:452
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:454
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:451
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:453
@ TI_CONSTDATA_START
Definition: AMDGPU.h:450
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1565
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
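The isInlinableLiteral* helpers answer whether a constant can be encoded for free as an inline operand instead of consuming the literal slot. A small sketch; hasInv2PiInlineImm() is the GCNSubtarget query for whether 1/(2*pi) counts as an inline value, and the AMDGPU-internal headers (AMDGPUBaseInfo.h, GCNSubtarget.h) are assumed to be included:
using namespace llvm;

// True if Imm can be encoded as an inline constant on this subtarget.
static bool canInline32(const GCNSubtarget &ST, int32_t Imm) {
  return AMDGPU::isInlinableLiteral32(Imm, ST.hasInv2PiInlineImm());
}
static bool canInline64(const GCNSubtarget &ST, int64_t Imm) {
  return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
}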
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1454
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
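Besides the block-level forms, BuildMI has a MachineFunction-level form that creates an instruction not yet attached to any block, which pairs with MachineBasicBlock::insert shown earlier. A minimal sketch using S_NOP (which takes a single immediate) as a placeholder opcode; the AMDGPU opcode enum is assumed available as inside the backend:
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Create an instruction detached from any block, then place it before I.
static MachineInstr *buildAndInsertNop(MachineFunction &MF,
                                       MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       const TargetInstrInfo &TII,
                                       const DebugLoc &DL) {
  MachineInstr *NewMI =
      BuildMI(MF, DL, TII.get(AMDGPU::S_NOP)).addImm(0).getInstr();
  MBB.insert(I, NewMI);
  return NewMI;
}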
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:547
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair, skipping copy-like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:260
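Several of the MathExtras helpers above show up together when packing byte offsets into fixed-width encoding fields. A standalone sketch of the generic shape, with no target assumptions:
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <utility>
using namespace llvm;

// Split Imm into {aligned base, residual that fits an NumBits-wide unsigned
// field}; this is the generic shape of the offset splitting done in this file.
static std::pair<uint64_t, uint64_t> splitImm(uint64_t Imm, unsigned NumBits) {
  const uint64_t MaxField = maxUIntN(NumBits);  // largest NumBits-bit value
  uint64_t Field = Imm & MaxField;              // low NumBits bits
  uint64_t Base = alignDown(Imm, MaxField + 1); // == Imm - Field
  return {Base, Field};
}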
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:219
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:238
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is live all the way through.
Definition: LiveVariables.h:83
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility for storing a worklist of machine instructions.
Definition: SIInstrInfo.h:49
MachineInstr * top() const
Definition: SIInstrInfo.h:54
bool empty() const
Definition: SIInstrInfo.h:64
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:73
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.