1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm::AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48} // namespace llvm::AMDGPU
49
50// Must be at least 4 to be able to branch over minimum unconditional branch
51// code. This is only for making it possible to write reasonably small tests for
52// long branches.
53static cl::opt<unsigned>
54BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
56
57static cl::opt<bool> Fix16BitCopies(
58 "amdgpu-fix-16-bit-physreg-copies",
59 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60 cl::init(true),
61 cl::ReallyHidden);
62
63SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(&ST);
67}
68
69//===----------------------------------------------------------------------===//
70// TargetInstrInfo callbacks
71//===----------------------------------------------------------------------===//
72
73static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
78}
79
80/// Returns true if both nodes have the same value for the given
81/// operand \p Op, or if both nodes do not have this operand.
82static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
85
86 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
87 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
88
89 if (Op0Idx == -1 && Op1Idx == -1)
90 return true;
91
92
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
95 return false;
96
97 // getNamedOperandIdx returns the index for the MachineInstr's operands,
98 // which includes the result as the first operand. We are indexing into the
99 // MachineSDNode's operands, so we need to skip the result operand to get
100 // the real index.
101 --Op0Idx;
102 --Op1Idx;
103
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
105}
106
107static bool canRemat(const MachineInstr &MI) {
108
109 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
110 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
111 SIInstrInfo::isSALU(MI))
112 return true;
113
114 if (SIInstrInfo::isSMRD(MI)) {
115 return !MI.memoperands_empty() &&
116 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
117 return MMO->isLoad() && MMO->isInvariant();
118 });
119 }
120
121 return false;
122}
123
124bool SIInstrInfo::isReallyTriviallyReMaterializable(
125 const MachineInstr &MI) const {
126
127 if (canRemat(MI)) {
128 // Normally a VALU use of exec would block rematerialization, but an
129 // implicit exec read is OK in this case since all VALU instructions have one.
130 // We really want all of the generic logic for this except for this.
131
132 // Another potential implicit use is the mode register. The core logic of
133 // the RA will not attempt rematerialization if the mode is set anywhere
134 // in the function; otherwise it is safe since the mode is not changed.
135
136 // This differs from the generic method, which does not allow
137 // rematerialization if there are virtual register uses. We allow it here,
138 // and therefore this method covers SOP instructions as well.
139 if (!MI.hasImplicitDef() &&
140 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
141 !MI.mayRaiseFPException())
142 return true;
143 }
144
145 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
146}
147
148// Returns true if the scalar result of a VALU instruction depends on exec.
149static bool resultDependsOnExec(const MachineInstr &MI) {
150 // Ignore comparisons which are only used masked with exec.
151 // This allows some hoisting/sinking of VALU comparisons.
152 if (MI.isCompare()) {
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
154 Register DstReg = MI.getOperand(0).getReg();
155 if (!DstReg.isVirtual())
156 return true;
157 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
158 switch (Use.getOpcode()) {
159 case AMDGPU::S_AND_SAVEEXEC_B32:
160 case AMDGPU::S_AND_SAVEEXEC_B64:
161 break;
162 case AMDGPU::S_AND_B32:
163 case AMDGPU::S_AND_B64:
164 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
165 return true;
166 break;
167 default:
168 return true;
169 }
170 }
171 return false;
172 }
173
174 switch (MI.getOpcode()) {
175 default:
176 break;
177 case AMDGPU::V_READFIRSTLANE_B32:
178 return true;
179 }
180
181 return false;
182}
183
184bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185 // Any implicit use of exec by VALU is not a real register read.
186 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
188}
189
190bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
191 MachineBasicBlock *SuccToSinkTo,
192 MachineCycleInfo *CI) const {
193 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
194 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
195 return true;
196
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
198 // Check if sinking of MI would create temporal divergent use.
199 for (auto Op : MI.uses()) {
200 if (Op.isReg() && Op.getReg().isVirtual() &&
201 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
202 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
203
204 // SgprDef defined inside cycle
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
206 if (FromCycle == nullptr)
207 continue;
208
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
210 // Check if there is a FromCycle that contains SgprDef's basic block but
211 // does not contain SuccToSinkTo and also has divergent exit condition.
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
213 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
214 FromCycle->getExitingBlocks(ExitingBlocks);
215
216 // FromCycle has divergent exit condition.
217 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
218 if (hasDivergentBranch(ExitingBlock))
219 return false;
220 }
221
222 FromCycle = FromCycle->getParentCycle();
223 }
224 }
225 }
226
227 return true;
228}
229
230bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
231 int64_t &Offset0,
232 int64_t &Offset1) const {
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
234 return false;
235
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
238
239 // Make sure both are actually loads.
240 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
241 return false;
242
243 // A mayLoad instruction without a def is not a load. Likely a prefetch.
244 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
245 return false;
246
247 if (isDS(Opc0) && isDS(Opc1)) {
248
249 // FIXME: Handle this case:
250 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
251 return false;
252
253 // Check base reg.
254 if (Load0->getOperand(0) != Load1->getOperand(0))
255 return false;
256
257 // Skip read2 / write2 variants for simplicity.
258 // TODO: We should report true if the used offsets are adjacent (excluding
259 // the st64 versions).
260 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
261 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
262 if (Offset0Idx == -1 || Offset1Idx == -1)
263 return false;
264
265 // XXX - be careful of dataless loads
266 // getNamedOperandIdx returns the index for MachineInstrs. Since they
267 // include the output in the operand list, but SDNodes don't, we need to
268 // subtract the index by one.
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
273 return true;
274 }
275
276 if (isSMRD(Opc0) && isSMRD(Opc1)) {
277 // Skip time and cache invalidation instructions.
278 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
279 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
280 return false;
281
282 unsigned NumOps = getNumOperandsNoGlue(Load0);
283 if (NumOps != getNumOperandsNoGlue(Load1))
284 return false;
285
286 // Check base reg.
287 if (Load0->getOperand(0) != Load1->getOperand(0))
288 return false;
289
290 // Match register offsets, if both register and immediate offsets present.
291 assert(NumOps == 4 || NumOps == 5);
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
293 return false;
294
295 const ConstantSDNode *Load0Offset =
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
297 const ConstantSDNode *Load1Offset =
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
299
300 if (!Load0Offset || !Load1Offset)
301 return false;
302
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
305 return true;
306 }
307
308 // MUBUF and MTBUF can access the same addresses.
309 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
310
311 // MUBUF and MTBUF have vaddr at different indices.
312 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
313 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
314 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
315 return false;
316
317 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
318 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
319
320 if (OffIdx0 == -1 || OffIdx1 == -1)
321 return false;
322
323 // getNamedOperandIdx returns the index for MachineInstrs. Since they
324 // include the output in the operand list, but SDNodes don't, we need to
325 // subtract the index by one.
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
328
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
331
332 // The offset might be a FrameIndexSDNode.
333 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
334 return false;
335
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
338 return true;
339 }
340
341 return false;
342}
343
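// Returns true for the ST64 variants of DS read2/write2, whose offset
// operands are expressed in units of 64 elements.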
344static bool isStride64(unsigned Opc) {
345 switch (Opc) {
346 case AMDGPU::DS_READ2ST64_B32:
347 case AMDGPU::DS_READ2ST64_B64:
348 case AMDGPU::DS_WRITE2ST64_B32:
349 case AMDGPU::DS_WRITE2ST64_B64:
350 return true;
351 default:
352 return false;
353 }
354}
355
356bool SIInstrInfo::getMemOperandsWithOffsetWidth(
357 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
358 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
359 const TargetRegisterInfo *TRI) const {
360 if (!LdSt.mayLoadOrStore())
361 return false;
362
363 unsigned Opc = LdSt.getOpcode();
364 OffsetIsScalable = false;
365 const MachineOperand *BaseOp, *OffsetOp;
366 int DataOpIdx;
367
368 if (isDS(LdSt)) {
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
370 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
371 if (OffsetOp) {
372 // Normal, single offset LDS instruction.
373 if (!BaseOp) {
374 // DS_CONSUME/DS_APPEND use M0 for the base address.
375 // TODO: find the implicit use operand for M0 and use that as BaseOp?
376 return false;
377 }
378 BaseOps.push_back(BaseOp);
379 Offset = OffsetOp->getImm();
380 // Get appropriate operand, and compute width accordingly.
381 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
382 if (DataOpIdx == -1)
383 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
384 Width = getOpSize(LdSt, DataOpIdx);
385 } else {
386 // The 2 offset instructions use offset0 and offset1 instead. We can treat
387 // these as a load with a single offset if the 2 offsets are consecutive.
388 // We will use this for some partially aligned loads.
389 const MachineOperand *Offset0Op =
390 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
391 const MachineOperand *Offset1Op =
392 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
393
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
396 if (Offset0 + 1 != Offset1)
397 return false;
398
399 // Each of these offsets is in element sized units, so we need to convert
400 // to bytes of the individual reads.
401
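 // For example, a ds_read2_b32 has a 64-bit destination register, so EltSize
 // is 64 / 16 = 4 bytes and offset0 = 4 corresponds to a byte offset of 16.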
402 unsigned EltSize;
403 if (LdSt.mayLoad())
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
405 else {
406 assert(LdSt.mayStore());
407 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
409 }
410
411 if (isStride64(Opc))
412 EltSize *= 64;
413
414 BaseOps.push_back(BaseOp);
415 Offset = EltSize * Offset0;
416 // Get appropriate operand(s), and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
418 if (DataOpIdx == -1) {
419 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 Width = getOpSize(LdSt, DataOpIdx);
421 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
422 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
423 } else {
424 Width = getOpSize(LdSt, DataOpIdx);
425 }
426 }
427 return true;
428 }
429
430 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
431 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
432 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
433 return false;
434 BaseOps.push_back(RSrc);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
436 if (BaseOp && !BaseOp->isFI())
437 BaseOps.push_back(BaseOp);
438 const MachineOperand *OffsetImm =
439 getNamedOperand(LdSt, AMDGPU::OpName::offset);
440 Offset = OffsetImm->getImm();
441 const MachineOperand *SOffset =
442 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
443 if (SOffset) {
444 if (SOffset->isReg())
445 BaseOps.push_back(SOffset);
446 else
447 Offset += SOffset->getImm();
448 }
449 // Get appropriate operand, and compute width accordingly.
450 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
451 if (DataOpIdx == -1)
452 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
453 if (DataOpIdx == -1) // LDS DMA
454 return false;
455 Width = getOpSize(LdSt, DataOpIdx);
456 return true;
457 }
458
459 if (isImage(LdSt)) {
460 auto RsrcOpName =
461 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
462 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
463 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
464 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
465 if (VAddr0Idx >= 0) {
466 // GFX10 possible NSA encoding.
467 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
468 BaseOps.push_back(&LdSt.getOperand(I));
469 } else {
470 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
471 }
472 Offset = 0;
473 // Get appropriate operand, and compute width accordingly.
474 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
475 if (DataOpIdx == -1)
476 return false; // no return sampler
477 Width = getOpSize(LdSt, DataOpIdx);
478 return true;
479 }
480
481 if (isSMRD(LdSt)) {
482 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
483 if (!BaseOp) // e.g. S_MEMTIME
484 return false;
485 BaseOps.push_back(BaseOp);
486 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
487 Offset = OffsetOp ? OffsetOp->getImm() : 0;
488 // Get appropriate operand, and compute width accordingly.
489 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
490 if (DataOpIdx == -1)
491 return false;
492 Width = getOpSize(LdSt, DataOpIdx);
493 return true;
494 }
495
496 if (isFLAT(LdSt)) {
497 // Instructions have either vaddr or saddr or both or none.
498 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
499 if (BaseOp)
500 BaseOps.push_back(BaseOp);
501 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
502 if (BaseOp)
503 BaseOps.push_back(BaseOp);
504 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
507 if (DataOpIdx == -1)
508 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
509 if (DataOpIdx == -1) // LDS DMA
510 return false;
511 Width = getOpSize(LdSt, DataOpIdx);
512 return true;
513 }
514
515 return false;
516}
517
518static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
519 ArrayRef<const MachineOperand *> BaseOps1,
520 const MachineInstr &MI2,
521 ArrayRef<const MachineOperand *> BaseOps2) {
522 // Only examine the first "base" operand of each instruction, on the
523 // assumption that it represents the real base address of the memory access.
524 // Other operands are typically offsets or indices from this base address.
525 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
526 return true;
527
528 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
529 return false;
530
531 auto *MO1 = *MI1.memoperands_begin();
532 auto *MO2 = *MI2.memoperands_begin();
533 if (MO1->getAddrSpace() != MO2->getAddrSpace())
534 return false;
535
536 const auto *Base1 = MO1->getValue();
537 const auto *Base2 = MO2->getValue();
538 if (!Base1 || !Base2)
539 return false;
540 Base1 = getUnderlyingObject(Base1);
541 Base2 = getUnderlyingObject(Base2);
542
543 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
544 return false;
545
546 return Base1 == Base2;
547}
548
549bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
550 int64_t Offset1, bool OffsetIsScalable1,
551 ArrayRef<const MachineOperand *> BaseOps2,
552 int64_t Offset2, bool OffsetIsScalable2,
553 unsigned ClusterSize,
554 unsigned NumBytes) const {
555 // If the mem ops (to be clustered) do not have the same base ptr, then they
556 // should not be clustered
557 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
558 if (!BaseOps1.empty() && !BaseOps2.empty()) {
559 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
560 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
561 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
562 return false;
563
564 const SIMachineFunctionInfo *MFI =
565 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
566 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
567 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
568 // If only one base op is empty, they do not have the same base ptr
569 return false;
570 }
571
572 // In order to avoid register pressure, on average, the number of DWORDs
573 // loaded together by all clustered mem ops should not exceed
574 // MaxMemoryClusterDWords. This is an empirical value based on certain
575 // observations and performance-related experiments.
576 // The good thing about this heuristic is that it avoids clustering too many
577 // sub-word loads as well as clustering of wide loads. Below is a brief
578 // summary of how the heuristic behaves for various `LoadSize` values when
579 // MaxMemoryClusterDWords is 8.
580 //
581 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
582 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
583 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
584 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
585 // (5) LoadSize >= 17: do not cluster
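 // For example, with MaxMemoryClusterDWords = 8, four 12-byte loads
 // (ClusterSize = 4, NumBytes = 48) give LoadSize = 12 and
 // NumDWords = ((12 + 3) / 4) * 4 = 12, which exceeds the limit, so the
 // ops are not clustered.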
586 const unsigned LoadSize = NumBytes / ClusterSize;
587 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
588 return NumDWords <= MaxMemoryClusterDWords;
589}
590
591// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
592// the first 16 loads will be interleaved with the stores, and the next 16 will
593// be clustered as expected. It should really split into two batches of 16 stores.
594//
595// Loads are clustered until this returns false, rather than trying to schedule
596// groups of stores. This also means we have to deal with saying different
597// address space loads should be clustered, and ones which might cause bank
598// conflicts.
599//
600// This might be deprecated so it might not be worth that much effort to fix.
601bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
602 int64_t Offset0, int64_t Offset1,
603 unsigned NumLoads) const {
604 assert(Offset1 > Offset0 &&
605 "Second offset should be larger than first offset!");
606 // If we have less than 16 loads in a row, and the offsets are within 64
607 // bytes, then schedule together.
608
609 // A cacheline is 64 bytes (for global memory).
610 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
611}
612
613static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
614 MachineBasicBlock::iterator MI,
615 const DebugLoc &DL, MCRegister DestReg,
616 MCRegister SrcReg, bool KillSrc,
617 const char *Msg = "illegal VGPR to SGPR copy") {
618 MachineFunction *MF = MBB.getParent();
619 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
620 LLVMContext &C = MF->getFunction().getContext();
621 C.diagnose(IllegalCopy);
622
623 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
624 .addReg(SrcReg, getKillRegState(KillSrc));
625}
626
627/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
628/// possible to have a direct copy in these cases on GFX908, so an intermediate
629/// VGPR copy is required.
630static void indirectCopyToAGPR(const SIInstrInfo &TII,
631 MachineBasicBlock &MBB,
632 MachineBasicBlock::iterator MI,
633 const DebugLoc &DL, MCRegister DestReg,
634 MCRegister SrcReg, bool KillSrc,
635 RegScavenger &RS, bool RegsOverlap,
636 Register ImpDefSuperReg = Register(),
637 Register ImpUseSuperReg = Register()) {
638 assert((TII.getSubtarget().hasMAIInsts() &&
639 !TII.getSubtarget().hasGFX90AInsts()) &&
640 "Expected GFX908 subtarget.");
641
642 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
643 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
644 "Source register of the copy should be either an SGPR or an AGPR.");
645
646 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
647 "Destination register of the copy should be an AGPR.");
648
649 const SIRegisterInfo &RI = TII.getRegisterInfo();
650
651 // First try to find defining accvgpr_write to avoid temporary registers.
652 // In the case of copies of overlapping AGPRs, we conservatively do not
653 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
654 // an accvgpr_write used for this same copy due to implicit-defs
655 if (!RegsOverlap) {
656 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
657 --Def;
658
659 if (!Def->modifiesRegister(SrcReg, &RI))
660 continue;
661
662 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
663 Def->getOperand(0).getReg() != SrcReg)
664 break;
665
666 MachineOperand &DefOp = Def->getOperand(1);
667 assert(DefOp.isReg() || DefOp.isImm());
668
669 if (DefOp.isReg()) {
670 bool SafeToPropagate = true;
671 // Check that register source operand is not clobbered before MI.
672 // Immediate operands are always safe to propagate.
673 for (auto I = Def; I != MI && SafeToPropagate; ++I)
674 if (I->modifiesRegister(DefOp.getReg(), &RI))
675 SafeToPropagate = false;
676
677 if (!SafeToPropagate)
678 break;
679
680 DefOp.setIsKill(false);
681 }
682
683 MachineInstrBuilder Builder =
684 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
685 .add(DefOp);
686 if (ImpDefSuperReg)
687 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
688
689 if (ImpUseSuperReg) {
690 Builder.addReg(ImpUseSuperReg,
691 getKillRegState(KillSrc) | RegState::Implicit);
692 }
693
694 return;
695 }
696 }
697
698 RS.enterBasicBlockEnd(MBB);
699 RS.backward(std::next(MI));
700
701 // Ideally we want to have three registers for a long reg_sequence copy
702 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
703 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
704 *MBB.getParent());
705
706 // Registers in the sequence are allocated contiguously so we can just
707 // use register number to pick one of three round-robin temps.
708 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
709 Register Tmp =
710 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
711 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
712 "VGPR used for an intermediate copy should have been reserved.");
713
714 // Only loop through if there are any free registers left. We don't want to
715 // spill.
716 while (RegNo--) {
717 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
718 /* RestoreAfter */ false, 0,
719 /* AllowSpill */ false);
720 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
721 break;
722 Tmp = Tmp2;
723 RS.setRegUsed(Tmp);
724 }
725
726 // Insert copy to temporary VGPR.
727 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
728 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
729 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
730 } else {
731 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
732 }
733
734 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
735 .addReg(SrcReg, getKillRegState(KillSrc));
736 if (ImpUseSuperReg) {
737 UseBuilder.addReg(ImpUseSuperReg,
738 getKillRegState(KillSrc) | RegState::Implicit);
739 }
740
741 MachineInstrBuilder DefBuilder
742 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
743 .addReg(Tmp, RegState::Kill);
744
745 if (ImpDefSuperReg)
746 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
747}
748
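// Expand a copy between SGPR tuples into a sequence of S_MOV_B32 / S_MOV_B64
// moves of the individual subregisters, merging adjacent even-aligned
// subregisters into 64-bit moves where possible.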
749static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
750 MachineBasicBlock::iterator MI, const DebugLoc &DL,
751 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
752 const TargetRegisterClass *RC, bool Forward) {
753 const SIRegisterInfo &RI = TII.getRegisterInfo();
754 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
755 MachineBasicBlock::iterator I = MI;
756 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
757
758 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
759 int16_t SubIdx = BaseIndices[Idx];
760 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
761 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
762 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
763 unsigned Opcode = AMDGPU::S_MOV_B32;
764
765 // Is SGPR aligned? If so try to combine with next.
766 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
767 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
768 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
769 // Can use SGPR64 copy
770 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
771 SubIdx = RI.getSubRegFromChannel(Channel, 2);
772 DestSubReg = RI.getSubReg(DestReg, SubIdx);
773 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
774 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
775 Opcode = AMDGPU::S_MOV_B64;
776 Idx++;
777 }
778
779 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
780 .addReg(SrcSubReg)
781 .addReg(SrcReg, RegState::Implicit);
782
783 if (!FirstMI)
784 FirstMI = LastMI;
785
786 if (!Forward)
787 I--;
788 }
789
790 assert(FirstMI && LastMI);
791 if (!Forward)
792 std::swap(FirstMI, LastMI);
793
794 FirstMI->addOperand(
795 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
796
797 if (KillSrc)
798 LastMI->addRegisterKilled(SrcReg, &RI);
799}
800
801void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
802 MachineBasicBlock::iterator MI,
803 const DebugLoc &DL, MCRegister DestReg,
804 MCRegister SrcReg, bool KillSrc,
805 bool RenamableDest, bool RenamableSrc) const {
806 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
807 unsigned Size = RI.getRegSizeInBits(*RC);
808 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
809 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
810
811 // The rest of copyPhysReg assumes Src and Dst are the same size.
812 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
813 // we remove Fix16BitCopies and this code block?
814 if (Fix16BitCopies) {
815 if (((Size == 16) != (SrcSize == 16))) {
816 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
817 assert(ST.useRealTrue16Insts());
818 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
819 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
820 RegToFix = SubReg;
821
822 if (DestReg == SrcReg) {
823 // Identity copy. Insert empty bundle since ExpandPostRA expects an
824 // instruction here.
825 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
826 return;
827 }
828 RC = RI.getPhysRegBaseClass(DestReg);
829 Size = RI.getRegSizeInBits(*RC);
830 SrcRC = RI.getPhysRegBaseClass(SrcReg);
831 SrcSize = RI.getRegSizeInBits(*SrcRC);
832 }
833 }
834
835 if (RC == &AMDGPU::VGPR_32RegClass) {
836 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
837 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
838 AMDGPU::AGPR_32RegClass.contains(SrcReg));
839 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
840 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
841 BuildMI(MBB, MI, DL, get(Opc), DestReg)
842 .addReg(SrcReg, getKillRegState(KillSrc));
843 return;
844 }
845
846 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
847 RC == &AMDGPU::SReg_32RegClass) {
848 if (SrcReg == AMDGPU::SCC) {
849 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
850 .addImm(1)
851 .addImm(0);
852 return;
853 }
854
855 if (DestReg == AMDGPU::VCC_LO) {
856 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
857 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
858 .addReg(SrcReg, getKillRegState(KillSrc));
859 } else {
860 // FIXME: Hack until VReg_1 removed.
861 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
862 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
863 .addImm(0)
864 .addReg(SrcReg, getKillRegState(KillSrc));
865 }
866
867 return;
868 }
869
870 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
871 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
872 return;
873 }
874
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
876 .addReg(SrcReg, getKillRegState(KillSrc));
877 return;
878 }
879
880 if (RC == &AMDGPU::SReg_64RegClass) {
881 if (SrcReg == AMDGPU::SCC) {
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
883 .addImm(1)
884 .addImm(0);
885 return;
886 }
887
888 if (DestReg == AMDGPU::VCC) {
889 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
891 .addReg(SrcReg, getKillRegState(KillSrc));
892 } else {
893 // FIXME: Hack until VReg_1 removed.
894 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
895 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
896 .addImm(0)
897 .addReg(SrcReg, getKillRegState(KillSrc));
898 }
899
900 return;
901 }
902
903 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
904 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
905 return;
906 }
907
908 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
909 .addReg(SrcReg, getKillRegState(KillSrc));
910 return;
911 }
912
913 if (DestReg == AMDGPU::SCC) {
914 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
915 // but SelectionDAG emits such copies for i1 sources.
916 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
917 // This copy can only be produced by patterns
918 // with explicit SCC, which are known to be enabled
919 // only for subtargets with S_CMP_LG_U64 present.
920 assert(ST.hasScalarCompareEq64());
921 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
922 .addReg(SrcReg, getKillRegState(KillSrc))
923 .addImm(0);
924 } else {
925 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
926 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
927 .addReg(SrcReg, getKillRegState(KillSrc))
928 .addImm(0);
929 }
930
931 return;
932 }
933
934 if (RC == &AMDGPU::AGPR_32RegClass) {
935 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
936 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
937 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
938 .addReg(SrcReg, getKillRegState(KillSrc));
939 return;
940 }
941
942 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
943 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
944 .addReg(SrcReg, getKillRegState(KillSrc));
945 return;
946 }
947
948 // FIXME: The pass should maintain a scavenger to avoid scanning through the
949 // block on every AGPR spill.
950 RegScavenger RS;
951 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
952 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
953 return;
954 }
955
956 if (Size == 16) {
957 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
958 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
959 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
960
961 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
962 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
963 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
964 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
965 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
966 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
967 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
968 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
969
970 if (IsSGPRDst) {
971 if (!IsSGPRSrc) {
972 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
973 return;
974 }
975
976 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
977 .addReg(NewSrcReg, getKillRegState(KillSrc));
978 return;
979 }
980
981 if (IsAGPRDst || IsAGPRSrc) {
982 if (!DstLow || !SrcLow) {
983 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
984 "Cannot use hi16 subreg with an AGPR!");
985 }
986
987 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
988 return;
989 }
990
991 if (ST.hasTrue16BitInsts()) {
992 if (IsSGPRSrc) {
993 assert(SrcLow);
994 SrcReg = NewSrcReg;
995 }
996 // Use the smaller instruction encoding if possible.
997 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
998 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
999 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1000 .addReg(SrcReg);
1001 } else {
1002 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1003 .addImm(0) // src0_modifiers
1004 .addReg(SrcReg)
1005 .addImm(0); // op_sel
1006 }
1007 return;
1008 }
1009
1010 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1011 if (!DstLow || !SrcLow) {
1012 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1013 "Cannot use hi16 subreg on VI!");
1014 }
1015
1016 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1017 .addReg(NewSrcReg, getKillRegState(KillSrc));
1018 return;
1019 }
1020
1021 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1022 .addImm(0) // src0_modifiers
1023 .addReg(NewSrcReg)
1024 .addImm(0) // clamp
1025 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1026 : AMDGPU::SDWA::SdwaSel::WORD_1)
1027 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1028 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1029 : AMDGPU::SDWA::SdwaSel::WORD_1)
1030 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1031 // First implicit operand is $exec.
1032 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1033 return;
1034 }
1035
1036 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1037 if (ST.hasMovB64()) {
1038 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1039 .addReg(SrcReg, getKillRegState(KillSrc));
1040 return;
1041 }
1042 if (ST.hasPkMovB32()) {
1043 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1044 .addImm(SISrcMods::OP_SEL_1)
1045 .addReg(SrcReg)
1046 .addImm(SISrcMods::OP_SEL_1)
1047 .addReg(SrcReg)
1048 .addImm(0) // op_sel_lo
1049 .addImm(0) // op_sel_hi
1050 .addImm(0) // neg_lo
1051 .addImm(0) // neg_hi
1052 .addImm(0) // clamp
1053 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1054 return;
1055 }
1056 }
1057
1058 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1059 if (RI.isSGPRClass(RC)) {
1060 if (!RI.isSGPRClass(SrcRC)) {
1061 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1062 return;
1063 }
1064 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1065 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1066 Forward);
1067 return;
1068 }
1069
1070 unsigned EltSize = 4;
1071 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1072 if (RI.isAGPRClass(RC)) {
1073 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1074 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1075 else if (RI.hasVGPRs(SrcRC) ||
1076 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1077 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1078 else
1079 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1080 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1081 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1082 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1083 (RI.isProperlyAlignedRC(*RC) &&
1084 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1085 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1086 if (ST.hasMovB64()) {
1087 Opcode = AMDGPU::V_MOV_B64_e32;
1088 EltSize = 8;
1089 } else if (ST.hasPkMovB32()) {
1090 Opcode = AMDGPU::V_PK_MOV_B32;
1091 EltSize = 8;
1092 }
1093 }
1094
1095 // For the cases where we need an intermediate instruction/temporary register
1096 // (destination is an AGPR), we need a scavenger.
1097 //
1098 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1099 // whole block for every handled copy.
1100 std::unique_ptr<RegScavenger> RS;
1101 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1102 RS = std::make_unique<RegScavenger>();
1103
1104 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1105
1106 // If there is an overlap, we can't kill the super-register on the last
1107 // instruction, since it will also kill the components made live by this def.
1108 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1109 const bool CanKillSuperReg = KillSrc && !Overlap;
1110
1111 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1112 unsigned SubIdx;
1113 if (Forward)
1114 SubIdx = SubIndices[Idx];
1115 else
1116 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1117 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1118 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1119 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1120
1121 bool IsFirstSubreg = Idx == 0;
1122 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1123
1124 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1125 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1126 Register ImpUseSuper = SrcReg;
1127 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1128 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1129 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1130 MachineInstrBuilder MIB =
1131 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1132 .addImm(SISrcMods::OP_SEL_1)
1133 .addReg(SrcSubReg)
1134 .addImm(SISrcMods::OP_SEL_1)
1135 .addReg(SrcSubReg)
1136 .addImm(0) // op_sel_lo
1137 .addImm(0) // op_sel_hi
1138 .addImm(0) // neg_lo
1139 .addImm(0) // neg_hi
1140 .addImm(0) // clamp
1141 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1142 if (IsFirstSubreg)
1143 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1144 } else {
1145 MachineInstrBuilder Builder =
1146 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1147 if (IsFirstSubreg)
1148 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1149
1150 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1151 }
1152 }
1153}
1154
1155int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1156 int NewOpc;
1157
1158 // Try to map original to commuted opcode
1159 NewOpc = AMDGPU::getCommuteRev(Opcode);
1160 if (NewOpc != -1)
1161 // Check if the commuted (REV) opcode exists on the target.
1162 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1163
1164 // Try to map commuted to original opcode
1165 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1166 if (NewOpc != -1)
1167 // Check if the original (non-REV) opcode exists on the target.
1168 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1169
1170 return Opcode;
1171}
1172
1173void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1174 MachineBasicBlock::iterator MI,
1175 const DebugLoc &DL, Register DestReg,
1176 int64_t Value) const {
1177 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1178 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1179 if (RegClass == &AMDGPU::SReg_32RegClass ||
1180 RegClass == &AMDGPU::SGPR_32RegClass ||
1181 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1182 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1183 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1184 .addImm(Value);
1185 return;
1186 }
1187
1188 if (RegClass == &AMDGPU::SReg_64RegClass ||
1189 RegClass == &AMDGPU::SGPR_64RegClass ||
1190 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1191 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1192 .addImm(Value);
1193 return;
1194 }
1195
1196 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1197 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1198 .addImm(Value);
1199 return;
1200 }
1201 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1202 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1203 .addImm(Value);
1204 return;
1205 }
1206
1207 unsigned EltSize = 4;
1208 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1209 if (RI.isSGPRClass(RegClass)) {
1210 if (RI.getRegSizeInBits(*RegClass) > 32) {
1211 Opcode = AMDGPU::S_MOV_B64;
1212 EltSize = 8;
1213 } else {
1214 Opcode = AMDGPU::S_MOV_B32;
1215 EltSize = 4;
1216 }
1217 }
1218
1219 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1220 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1221 int64_t IdxValue = Idx == 0 ? Value : 0;
1222
1223 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1224 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1225 Builder.addImm(IdxValue);
1226 }
1227}
1228
1229const TargetRegisterClass *
1230SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1231 return &AMDGPU::VGPR_32RegClass;
1232}
1233
1234void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1235 MachineBasicBlock::iterator I,
1236 const DebugLoc &DL, Register DstReg,
1237 ArrayRef<MachineOperand> Cond,
1238 Register TrueReg,
1239 Register FalseReg) const {
1240 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1241 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1242 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1243 "Not a VGPR32 reg");
1244
1245 if (Cond.size() == 1) {
1246 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1247 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1248 .add(Cond[0]);
1249 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addImm(0)
1253 .addReg(TrueReg)
1254 .addReg(SReg);
1255 } else if (Cond.size() == 2) {
1256 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1257 switch (Cond[0].getImm()) {
1258 case SIInstrInfo::SCC_TRUE: {
1259 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1260 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1261 : AMDGPU::S_CSELECT_B64), SReg)
1262 .addImm(1)
1263 .addImm(0);
1264 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1265 .addImm(0)
1266 .addReg(FalseReg)
1267 .addImm(0)
1268 .addReg(TrueReg)
1269 .addReg(SReg);
1270 break;
1271 }
1272 case SIInstrInfo::SCC_FALSE: {
1273 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1274 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1275 : AMDGPU::S_CSELECT_B64), SReg)
1276 .addImm(0)
1277 .addImm(1);
1278 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1279 .addImm(0)
1280 .addReg(FalseReg)
1281 .addImm(0)
1282 .addReg(TrueReg)
1283 .addReg(SReg);
1284 break;
1285 }
1286 case SIInstrInfo::VCCNZ: {
1287 MachineOperand RegOp = Cond[1];
1288 RegOp.setImplicit(false);
1289 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1290 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1291 .add(RegOp);
1292 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1293 .addImm(0)
1294 .addReg(FalseReg)
1295 .addImm(0)
1296 .addReg(TrueReg)
1297 .addReg(SReg);
1298 break;
1299 }
1300 case SIInstrInfo::VCCZ: {
1301 MachineOperand RegOp = Cond[1];
1302 RegOp.setImplicit(false);
1303 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1304 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1305 .add(RegOp);
1306 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1307 .addImm(0)
1308 .addReg(TrueReg)
1309 .addImm(0)
1310 .addReg(FalseReg)
1311 .addReg(SReg);
1312 break;
1313 }
1314 case SIInstrInfo::EXECNZ: {
1315 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1316 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1317 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1318 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1319 .addImm(0);
1320 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1321 : AMDGPU::S_CSELECT_B64), SReg)
1322 .addImm(1)
1323 .addImm(0);
1324 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1325 .addImm(0)
1326 .addReg(FalseReg)
1327 .addImm(0)
1328 .addReg(TrueReg)
1329 .addReg(SReg);
1330 break;
1331 }
1332 case SIInstrInfo::EXECZ: {
1333 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1334 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1335 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1336 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1337 .addImm(0);
1338 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1339 : AMDGPU::S_CSELECT_B64), SReg)
1340 .addImm(0)
1341 .addImm(1);
1342 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1343 .addImm(0)
1344 .addReg(FalseReg)
1345 .addImm(0)
1346 .addReg(TrueReg)
1347 .addReg(SReg);
1348 llvm_unreachable("Unhandled branch predicate EXECZ");
1349 break;
1350 }
1351 default:
1352 llvm_unreachable("invalid branch predicate");
1353 }
1354 } else {
1355 llvm_unreachable("Can only handle Cond size 1 or 2");
1356 }
1357}
1358
1359Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1360 MachineBasicBlock::iterator I,
1361 const DebugLoc &DL,
1362 Register SrcReg, int Value) const {
1363 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1364 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1365 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1366 .addImm(Value)
1367 .addReg(SrcReg);
1368
1369 return Reg;
1370}
1371
1372Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1373 MachineBasicBlock::iterator I,
1374 const DebugLoc &DL,
1375 Register SrcReg, int Value) const {
1376 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1377 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1378 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1379 .addImm(Value)
1380 .addReg(SrcReg);
1381
1382 return Reg;
1383}
1384
1385unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1386
1387 if (RI.isAGPRClass(DstRC))
1388 return AMDGPU::COPY;
1389 if (RI.getRegSizeInBits(*DstRC) == 16) {
1390 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1391 // before RA.
1392 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1393 }
1394 if (RI.getRegSizeInBits(*DstRC) == 32)
1395 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1396 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1397 return AMDGPU::S_MOV_B64;
1398 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1399 return AMDGPU::V_MOV_B64_PSEUDO;
1400 return AMDGPU::COPY;
1401}
1402
1403const MCInstrDesc &
1404SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1405 bool IsIndirectSrc) const {
1406 if (IsIndirectSrc) {
1407 if (VecSize <= 32) // 4 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1409 if (VecSize <= 64) // 8 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1411 if (VecSize <= 96) // 12 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1413 if (VecSize <= 128) // 16 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1415 if (VecSize <= 160) // 20 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1417 if (VecSize <= 256) // 32 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1419 if (VecSize <= 288) // 36 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1421 if (VecSize <= 320) // 40 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1423 if (VecSize <= 352) // 44 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1425 if (VecSize <= 384) // 48 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1427 if (VecSize <= 512) // 64 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1429 if (VecSize <= 1024) // 128 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1431
1432 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1433 }
1434
1435 if (VecSize <= 32) // 4 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1437 if (VecSize <= 64) // 8 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1439 if (VecSize <= 96) // 12 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1441 if (VecSize <= 128) // 16 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1443 if (VecSize <= 160) // 20 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1445 if (VecSize <= 256) // 32 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1447 if (VecSize <= 288) // 36 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1449 if (VecSize <= 320) // 40 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1451 if (VecSize <= 352) // 44 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1453 if (VecSize <= 384) // 48 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1455 if (VecSize <= 512) // 64 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1457 if (VecSize <= 1024) // 128 bytes
1458 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1459
1460 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1461}
1462
1463static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1464 if (VecSize <= 32) // 4 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1466 if (VecSize <= 64) // 8 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1468 if (VecSize <= 96) // 12 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1470 if (VecSize <= 128) // 16 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1472 if (VecSize <= 160) // 20 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1474 if (VecSize <= 256) // 32 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1476 if (VecSize <= 288) // 36 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1478 if (VecSize <= 320) // 40 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1480 if (VecSize <= 352) // 44 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1482 if (VecSize <= 384) // 48 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1484 if (VecSize <= 512) // 64 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1486 if (VecSize <= 1024) // 128 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1488
1489 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1490}
1491
1492static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1493 if (VecSize <= 32) // 4 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1495 if (VecSize <= 64) // 8 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1497 if (VecSize <= 96) // 12 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1499 if (VecSize <= 128) // 16 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1501 if (VecSize <= 160) // 20 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1503 if (VecSize <= 256) // 32 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1505 if (VecSize <= 288) // 36 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1507 if (VecSize <= 320) // 40 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1509 if (VecSize <= 352) // 44 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1511 if (VecSize <= 384) // 48 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1513 if (VecSize <= 512) // 64 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1515 if (VecSize <= 1024) // 128 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1517
1518 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1519}
1520
1521static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1522 if (VecSize <= 64) // 8 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1524 if (VecSize <= 128) // 16 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1526 if (VecSize <= 256) // 32 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1528 if (VecSize <= 512) // 64 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1530 if (VecSize <= 1024) // 128 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1532
1533 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1534}
1535
1536const MCInstrDesc &
1537SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1538 bool IsSGPR) const {
1539 if (IsSGPR) {
1540 switch (EltSize) {
1541 case 32:
1542 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1543 case 64:
1544 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1545 default:
1546 llvm_unreachable("invalid reg indexing elt size");
1547 }
1548 }
1549
1550 assert(EltSize == 32 && "invalid reg indexing elt size");
1551 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1552}
1553
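// Map a spill size in bytes to the matching SGPR spill-save pseudo opcode.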
1554static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1555 switch (Size) {
1556 case 4:
1557 return AMDGPU::SI_SPILL_S32_SAVE;
1558 case 8:
1559 return AMDGPU::SI_SPILL_S64_SAVE;
1560 case 12:
1561 return AMDGPU::SI_SPILL_S96_SAVE;
1562 case 16:
1563 return AMDGPU::SI_SPILL_S128_SAVE;
1564 case 20:
1565 return AMDGPU::SI_SPILL_S160_SAVE;
1566 case 24:
1567 return AMDGPU::SI_SPILL_S192_SAVE;
1568 case 28:
1569 return AMDGPU::SI_SPILL_S224_SAVE;
1570 case 32:
1571 return AMDGPU::SI_SPILL_S256_SAVE;
1572 case 36:
1573 return AMDGPU::SI_SPILL_S288_SAVE;
1574 case 40:
1575 return AMDGPU::SI_SPILL_S320_SAVE;
1576 case 44:
1577 return AMDGPU::SI_SPILL_S352_SAVE;
1578 case 48:
1579 return AMDGPU::SI_SPILL_S384_SAVE;
1580 case 64:
1581 return AMDGPU::SI_SPILL_S512_SAVE;
1582 case 128:
1583 return AMDGPU::SI_SPILL_S1024_SAVE;
1584 default:
1585 llvm_unreachable("unknown register size");
1586 }
1587}
1588
1589static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1590 switch (Size) {
1591 case 4:
1592 return AMDGPU::SI_SPILL_V32_SAVE;
1593 case 8:
1594 return AMDGPU::SI_SPILL_V64_SAVE;
1595 case 12:
1596 return AMDGPU::SI_SPILL_V96_SAVE;
1597 case 16:
1598 return AMDGPU::SI_SPILL_V128_SAVE;
1599 case 20:
1600 return AMDGPU::SI_SPILL_V160_SAVE;
1601 case 24:
1602 return AMDGPU::SI_SPILL_V192_SAVE;
1603 case 28:
1604 return AMDGPU::SI_SPILL_V224_SAVE;
1605 case 32:
1606 return AMDGPU::SI_SPILL_V256_SAVE;
1607 case 36:
1608 return AMDGPU::SI_SPILL_V288_SAVE;
1609 case 40:
1610 return AMDGPU::SI_SPILL_V320_SAVE;
1611 case 44:
1612 return AMDGPU::SI_SPILL_V352_SAVE;
1613 case 48:
1614 return AMDGPU::SI_SPILL_V384_SAVE;
1615 case 64:
1616 return AMDGPU::SI_SPILL_V512_SAVE;
1617 case 128:
1618 return AMDGPU::SI_SPILL_V1024_SAVE;
1619 default:
1620 llvm_unreachable("unknown register size");
1621 }
1622}
1623
1624static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1625 switch (Size) {
1626 case 4:
1627 return AMDGPU::SI_SPILL_A32_SAVE;
1628 case 8:
1629 return AMDGPU::SI_SPILL_A64_SAVE;
1630 case 12:
1631 return AMDGPU::SI_SPILL_A96_SAVE;
1632 case 16:
1633 return AMDGPU::SI_SPILL_A128_SAVE;
1634 case 20:
1635 return AMDGPU::SI_SPILL_A160_SAVE;
1636 case 24:
1637 return AMDGPU::SI_SPILL_A192_SAVE;
1638 case 28:
1639 return AMDGPU::SI_SPILL_A224_SAVE;
1640 case 32:
1641 return AMDGPU::SI_SPILL_A256_SAVE;
1642 case 36:
1643 return AMDGPU::SI_SPILL_A288_SAVE;
1644 case 40:
1645 return AMDGPU::SI_SPILL_A320_SAVE;
1646 case 44:
1647 return AMDGPU::SI_SPILL_A352_SAVE;
1648 case 48:
1649 return AMDGPU::SI_SPILL_A384_SAVE;
1650 case 64:
1651 return AMDGPU::SI_SPILL_A512_SAVE;
1652 case 128:
1653 return AMDGPU::SI_SPILL_A1024_SAVE;
1654 default:
1655 llvm_unreachable("unknown register size");
1656 }
1657}
1658
1659static unsigned getAVSpillSaveOpcode(unsigned Size) {
1660 switch (Size) {
1661 case 4:
1662 return AMDGPU::SI_SPILL_AV32_SAVE;
1663 case 8:
1664 return AMDGPU::SI_SPILL_AV64_SAVE;
1665 case 12:
1666 return AMDGPU::SI_SPILL_AV96_SAVE;
1667 case 16:
1668 return AMDGPU::SI_SPILL_AV128_SAVE;
1669 case 20:
1670 return AMDGPU::SI_SPILL_AV160_SAVE;
1671 case 24:
1672 return AMDGPU::SI_SPILL_AV192_SAVE;
1673 case 28:
1674 return AMDGPU::SI_SPILL_AV224_SAVE;
1675 case 32:
1676 return AMDGPU::SI_SPILL_AV256_SAVE;
1677 case 36:
1678 return AMDGPU::SI_SPILL_AV288_SAVE;
1679 case 40:
1680 return AMDGPU::SI_SPILL_AV320_SAVE;
1681 case 44:
1682 return AMDGPU::SI_SPILL_AV352_SAVE;
1683 case 48:
1684 return AMDGPU::SI_SPILL_AV384_SAVE;
1685 case 64:
1686 return AMDGPU::SI_SPILL_AV512_SAVE;
1687 case 128:
1688 return AMDGPU::SI_SPILL_AV1024_SAVE;
1689 default:
1690 llvm_unreachable("unknown register size");
1691 }
1692}
1693
1694static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1695 bool IsVectorSuperClass) {
1696 // Currently, only 32-bit WWM register spills are needed.
1697 if (Size != 4)
1698 llvm_unreachable("unknown wwm register spill size");
1699
1700 if (IsVectorSuperClass)
1701 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1702
1703 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1704}
1705
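// Select the spill-save pseudo for a vector register: WWM registers, AV
// (VGPR/AGPR super class) registers, AGPRs, and plain VGPRs each use their
// own family of spill pseudos.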
1706static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1707 const TargetRegisterClass *RC,
1708 unsigned Size,
1709 const SIRegisterInfo &TRI,
1710 const SIMachineFunctionInfo &MFI) {
1711 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1712
1713 // Choose the right opcode if spilling a WWM register.
1714 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1715 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1716
1717 if (IsVectorSuperClass)
1718 return getAVSpillSaveOpcode(Size);
1719
1720 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1722}
1723
1724void SIInstrInfo::storeRegToStackSlot(
1725 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1726 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1727 const TargetRegisterInfo *TRI, Register VReg) const {
1728 MachineFunction *MF = MBB.getParent();
1729 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1730 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1731 const DebugLoc &DL = MBB.findDebugLoc(MI);
1732
1733 MachinePointerInfo PtrInfo
1734 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1735 MachineMemOperand *MMO = MF->getMachineMemOperand(
1736 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1737 FrameInfo.getObjectAlign(FrameIndex));
1738 unsigned SpillSize = TRI->getSpillSize(*RC);
1739
1740 MachineRegisterInfo &MRI = MF->getRegInfo();
1741 if (RI.isSGPRClass(RC)) {
1742 MFI->setHasSpilledSGPRs();
1743 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1744 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1745 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1746
1747 // We are only allowed to create one new instruction when spilling
1748    // registers, so we need to use a pseudo instruction for spilling SGPRs.
1749 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1750
1751    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1752    // need to make sure we are using the correct register class.
1753 if (SrcReg.isVirtual() && SpillSize == 4) {
1754 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1755 }
1756
1757 BuildMI(MBB, MI, DL, OpDesc)
1758 .addReg(SrcReg, getKillRegState(isKill)) // data
1759 .addFrameIndex(FrameIndex) // addr
1760        .addMemOperand(MMO)
1761        .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1762
1763 if (RI.spillSGPRToVGPR())
1764 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1765 return;
1766 }
1767
1768 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1769 SpillSize, RI, *MFI);
1770 MFI->setHasSpilledVGPRs();
1771
1772 BuildMI(MBB, MI, DL, get(Opcode))
1773 .addReg(SrcReg, getKillRegState(isKill)) // data
1774 .addFrameIndex(FrameIndex) // addr
1775 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1776 .addImm(0) // offset
1777 .addMemOperand(MMO);
1778}
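// Illustrative only: for a 4-byte SGPR spill the code above builds roughly
//   SI_SPILL_S32_SAVE killed $sgpr5, %stack.0, implicit $sgpr32
// (later lowered to VGPR-lane writes by the SGPR spill machinery), while a
// 16-byte VGPR spill becomes roughly
//   SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0,
//       implicit $exec :: (store (s128) into %stack.1)
// Register names, frame indexes and the exact memory-operand printing are
// schematic, not verifier-exact output.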
1779
1780static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1781 switch (Size) {
1782 case 4:
1783 return AMDGPU::SI_SPILL_S32_RESTORE;
1784 case 8:
1785 return AMDGPU::SI_SPILL_S64_RESTORE;
1786 case 12:
1787 return AMDGPU::SI_SPILL_S96_RESTORE;
1788 case 16:
1789 return AMDGPU::SI_SPILL_S128_RESTORE;
1790 case 20:
1791 return AMDGPU::SI_SPILL_S160_RESTORE;
1792 case 24:
1793 return AMDGPU::SI_SPILL_S192_RESTORE;
1794 case 28:
1795 return AMDGPU::SI_SPILL_S224_RESTORE;
1796 case 32:
1797 return AMDGPU::SI_SPILL_S256_RESTORE;
1798 case 36:
1799 return AMDGPU::SI_SPILL_S288_RESTORE;
1800 case 40:
1801 return AMDGPU::SI_SPILL_S320_RESTORE;
1802 case 44:
1803 return AMDGPU::SI_SPILL_S352_RESTORE;
1804 case 48:
1805 return AMDGPU::SI_SPILL_S384_RESTORE;
1806 case 64:
1807 return AMDGPU::SI_SPILL_S512_RESTORE;
1808 case 128:
1809 return AMDGPU::SI_SPILL_S1024_RESTORE;
1810 default:
1811 llvm_unreachable("unknown register size");
1812 }
1813}
1814
1815static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1816 switch (Size) {
1817 case 4:
1818 return AMDGPU::SI_SPILL_V32_RESTORE;
1819 case 8:
1820 return AMDGPU::SI_SPILL_V64_RESTORE;
1821 case 12:
1822 return AMDGPU::SI_SPILL_V96_RESTORE;
1823 case 16:
1824 return AMDGPU::SI_SPILL_V128_RESTORE;
1825 case 20:
1826 return AMDGPU::SI_SPILL_V160_RESTORE;
1827 case 24:
1828 return AMDGPU::SI_SPILL_V192_RESTORE;
1829 case 28:
1830 return AMDGPU::SI_SPILL_V224_RESTORE;
1831 case 32:
1832 return AMDGPU::SI_SPILL_V256_RESTORE;
1833 case 36:
1834 return AMDGPU::SI_SPILL_V288_RESTORE;
1835 case 40:
1836 return AMDGPU::SI_SPILL_V320_RESTORE;
1837 case 44:
1838 return AMDGPU::SI_SPILL_V352_RESTORE;
1839 case 48:
1840 return AMDGPU::SI_SPILL_V384_RESTORE;
1841 case 64:
1842 return AMDGPU::SI_SPILL_V512_RESTORE;
1843 case 128:
1844 return AMDGPU::SI_SPILL_V1024_RESTORE;
1845 default:
1846 llvm_unreachable("unknown register size");
1847 }
1848}
1849
1850static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1851 switch (Size) {
1852 case 4:
1853 return AMDGPU::SI_SPILL_A32_RESTORE;
1854 case 8:
1855 return AMDGPU::SI_SPILL_A64_RESTORE;
1856 case 12:
1857 return AMDGPU::SI_SPILL_A96_RESTORE;
1858 case 16:
1859 return AMDGPU::SI_SPILL_A128_RESTORE;
1860 case 20:
1861 return AMDGPU::SI_SPILL_A160_RESTORE;
1862 case 24:
1863 return AMDGPU::SI_SPILL_A192_RESTORE;
1864 case 28:
1865 return AMDGPU::SI_SPILL_A224_RESTORE;
1866 case 32:
1867 return AMDGPU::SI_SPILL_A256_RESTORE;
1868 case 36:
1869 return AMDGPU::SI_SPILL_A288_RESTORE;
1870 case 40:
1871 return AMDGPU::SI_SPILL_A320_RESTORE;
1872 case 44:
1873 return AMDGPU::SI_SPILL_A352_RESTORE;
1874 case 48:
1875 return AMDGPU::SI_SPILL_A384_RESTORE;
1876 case 64:
1877 return AMDGPU::SI_SPILL_A512_RESTORE;
1878 case 128:
1879 return AMDGPU::SI_SPILL_A1024_RESTORE;
1880 default:
1881 llvm_unreachable("unknown register size");
1882 }
1883}
1884
1885static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1886 switch (Size) {
1887 case 4:
1888 return AMDGPU::SI_SPILL_AV32_RESTORE;
1889 case 8:
1890 return AMDGPU::SI_SPILL_AV64_RESTORE;
1891 case 12:
1892 return AMDGPU::SI_SPILL_AV96_RESTORE;
1893 case 16:
1894 return AMDGPU::SI_SPILL_AV128_RESTORE;
1895 case 20:
1896 return AMDGPU::SI_SPILL_AV160_RESTORE;
1897 case 24:
1898 return AMDGPU::SI_SPILL_AV192_RESTORE;
1899 case 28:
1900 return AMDGPU::SI_SPILL_AV224_RESTORE;
1901 case 32:
1902 return AMDGPU::SI_SPILL_AV256_RESTORE;
1903 case 36:
1904 return AMDGPU::SI_SPILL_AV288_RESTORE;
1905 case 40:
1906 return AMDGPU::SI_SPILL_AV320_RESTORE;
1907 case 44:
1908 return AMDGPU::SI_SPILL_AV352_RESTORE;
1909 case 48:
1910 return AMDGPU::SI_SPILL_AV384_RESTORE;
1911 case 64:
1912 return AMDGPU::SI_SPILL_AV512_RESTORE;
1913 case 128:
1914 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1915 default:
1916 llvm_unreachable("unknown register size");
1917 }
1918}
1919
1920static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1921 bool IsVectorSuperClass) {
1922  // Currently, only 32-bit WWM register spills are needed.
1923 if (Size != 4)
1924 llvm_unreachable("unknown wwm register spill size");
1925
1926 if (IsVectorSuperClass)
1927 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1928
1929 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1930}
1931
1932static unsigned
1933getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1934 unsigned Size, const SIRegisterInfo &TRI,
1935 const SIMachineFunctionInfo &MFI) {
1936 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1937
1938 // Choose the right opcode if restoring a WWM register.
1939  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1940    return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1941
1942 if (IsVectorSuperClass)
1943    return getAVSpillRestoreOpcode(Size);
1944
1945  return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1946                             : getVGPRSpillRestoreOpcode(Size);
1947}
1948
1951 Register DestReg, int FrameIndex,
1952 const TargetRegisterClass *RC,
1953 const TargetRegisterInfo *TRI,
1954 Register VReg) const {
1955  MachineFunction *MF = MBB.getParent();
1956  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1957  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1958 const DebugLoc &DL = MBB.findDebugLoc(MI);
1959 unsigned SpillSize = TRI->getSpillSize(*RC);
1960
1961 MachinePointerInfo PtrInfo
1962 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1963
1964  MachineMemOperand *MMO = MF->getMachineMemOperand(
1965      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1966 FrameInfo.getObjectAlign(FrameIndex));
1967
1968 if (RI.isSGPRClass(RC)) {
1969 MFI->setHasSpilledSGPRs();
1970 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1971 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1972 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1973
1974 // FIXME: Maybe this should not include a memoperand because it will be
1975 // lowered to non-memory instructions.
1976 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1977 if (DestReg.isVirtual() && SpillSize == 4) {
1978      MachineRegisterInfo &MRI = MF->getRegInfo();
1979      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1980 }
1981
1982 if (RI.spillSGPRToVGPR())
1983 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1984 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1985 .addFrameIndex(FrameIndex) // addr
1986        .addMemOperand(MMO)
1987        .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1988
1989 return;
1990 }
1991
1992 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1993 SpillSize, RI, *MFI);
1994 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1995 .addFrameIndex(FrameIndex) // vaddr
1996 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1997 .addImm(0) // offset
1998 .addMemOperand(MMO);
1999}
2000
2003 insertNoops(MBB, MI, 1);
2004}
2005
2008 unsigned Quantity) const {
2010 while (Quantity > 0) {
2011 unsigned Arg = std::min(Quantity, 8u);
2012 Quantity -= Arg;
2013 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2014 }
2015}
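// Worked example: insertNoops(MBB, MI, 10) emits "s_nop 7" followed by
// "s_nop 1", since each S_NOP covers its immediate plus one wait states and
// the loop caps each chunk at 8.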
2016
2018 auto *MF = MBB.getParent();
2020
2021 assert(Info->isEntryFunction());
2022
2023 if (MBB.succ_empty()) {
2024 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2025 if (HasNoTerminator) {
2026 if (Info->returnsVoid()) {
2027 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2028 } else {
2029 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2030 }
2031 }
2032 }
2033}
2034
2038 const DebugLoc &DL) const {
2040 constexpr unsigned DoorbellIDMask = 0x3ff;
2041 constexpr unsigned ECQueueWaveAbort = 0x400;
2042
2043 MachineBasicBlock *TrapBB = &MBB;
2044 MachineBasicBlock *ContBB = &MBB;
2045 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2046
2047 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2048 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2049 TrapBB = MF->CreateMachineBasicBlock();
2050 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2051 MF->push_back(TrapBB);
2052 MBB.addSuccessor(TrapBB);
2053 }
2054
2055  // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround, this
2056  // will be a nop.
2057 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2058 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2059 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2060 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2061 DoorbellReg)
2063 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2064 .addUse(AMDGPU::M0);
2065 Register DoorbellRegMasked =
2066 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2067 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2068 .addUse(DoorbellReg)
2069 .addImm(DoorbellIDMask);
2070 Register SetWaveAbortBit =
2071 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2072 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2073 .addUse(DoorbellRegMasked)
2074 .addImm(ECQueueWaveAbort);
2075 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2076 .addUse(SetWaveAbortBit);
2077 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2079 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2080 .addUse(AMDGPU::TTMP2);
2081 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2082 TrapBB->addSuccessor(HaltLoopBB);
2083
2084 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2085 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2086 .addMBB(HaltLoopBB);
2087 MF->push_back(HaltLoopBB);
2088 HaltLoopBB->addSuccessor(HaltLoopBB);
2089
2090 return ContBB;
2091}
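// Schematic shape of what insertSimulatedTrap builds (register names and the
// precise sendmsg operands are illustrative):
//
//   ...                        ; original block, now ending in
//   s_cbranch_execnz trap_bb   ; (only if the block had code or successors after MI)
// trap_bb:
//   s_trap 2
//   s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
//   s_mov_b32 ttmp2, m0
//   s_and_b32 s0, s0, 0x3ff    ; DoorbellIDMask
//   s_or_b32  s0, s0, 0x400    ; ECQueueWaveAbort
//   s_mov_b32 m0, s0
//   s_sendmsg ...              ; notify the host via the doorbell id in m0
//   s_mov_b32 m0, ttmp2
//   s_branch halt_loop_bb
// halt_loop_bb:
//   s_sethalt 5
//   s_branch halt_loop_bb
// cont_bb:                     ; rest of the original block (the return value)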
2092
2094 switch (MI.getOpcode()) {
2095 default:
2096 if (MI.isMetaInstruction())
2097 return 0;
2098 return 1; // FIXME: Do wait states equal cycles?
2099
2100 case AMDGPU::S_NOP:
2101 return MI.getOperand(0).getImm() + 1;
2102 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2103  // hazard, even if one exists, won't really be visible. Should we handle it?
2104 }
2105}
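// For example, "s_nop 3" counts as 4 wait states, a meta instruction such as
// a DBG_VALUE counts as 0, and every other instruction currently counts as 1.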
2106
2108 MachineBasicBlock &MBB = *MI.getParent();
2110 switch (MI.getOpcode()) {
2111 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2112 case AMDGPU::S_MOV_B64_term:
2113 // This is only a terminator to get the correct spill code placement during
2114 // register allocation.
2115 MI.setDesc(get(AMDGPU::S_MOV_B64));
2116 break;
2117
2118 case AMDGPU::S_MOV_B32_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(AMDGPU::S_MOV_B32));
2122 break;
2123
2124 case AMDGPU::S_XOR_B64_term:
2125 // This is only a terminator to get the correct spill code placement during
2126 // register allocation.
2127 MI.setDesc(get(AMDGPU::S_XOR_B64));
2128 break;
2129
2130 case AMDGPU::S_XOR_B32_term:
2131 // This is only a terminator to get the correct spill code placement during
2132 // register allocation.
2133 MI.setDesc(get(AMDGPU::S_XOR_B32));
2134 break;
2135 case AMDGPU::S_OR_B64_term:
2136 // This is only a terminator to get the correct spill code placement during
2137 // register allocation.
2138 MI.setDesc(get(AMDGPU::S_OR_B64));
2139 break;
2140 case AMDGPU::S_OR_B32_term:
2141 // This is only a terminator to get the correct spill code placement during
2142 // register allocation.
2143 MI.setDesc(get(AMDGPU::S_OR_B32));
2144 break;
2145
2146 case AMDGPU::S_ANDN2_B64_term:
2147 // This is only a terminator to get the correct spill code placement during
2148 // register allocation.
2149 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2150 break;
2151
2152 case AMDGPU::S_ANDN2_B32_term:
2153 // This is only a terminator to get the correct spill code placement during
2154 // register allocation.
2155 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2156 break;
2157
2158 case AMDGPU::S_AND_B64_term:
2159 // This is only a terminator to get the correct spill code placement during
2160 // register allocation.
2161 MI.setDesc(get(AMDGPU::S_AND_B64));
2162 break;
2163
2164 case AMDGPU::S_AND_B32_term:
2165 // This is only a terminator to get the correct spill code placement during
2166 // register allocation.
2167 MI.setDesc(get(AMDGPU::S_AND_B32));
2168 break;
2169
2170 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2171 // This is only a terminator to get the correct spill code placement during
2172 // register allocation.
2173 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2174 break;
2175
2176 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2177 // This is only a terminator to get the correct spill code placement during
2178 // register allocation.
2179 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2180 break;
2181
2182 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2183 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2184 break;
2185
2186 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2187 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2188 break;
2189
2190 case AMDGPU::V_MOV_B64_PSEUDO: {
2191 Register Dst = MI.getOperand(0).getReg();
2192 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2193 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2194
2195 const MachineOperand &SrcOp = MI.getOperand(1);
2196 // FIXME: Will this work for 64-bit floating point immediates?
2197 assert(!SrcOp.isFPImm());
2198 if (ST.hasMovB64()) {
2199 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2200 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2201 isUInt<32>(SrcOp.getImm()))
2202 break;
2203 }
2204 if (SrcOp.isImm()) {
2205 APInt Imm(64, SrcOp.getImm());
2206 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2207 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2208 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2209 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2210            .addImm(SISrcMods::OP_SEL_1)
2211            .addImm(Lo.getSExtValue())
2212            .addImm(SISrcMods::OP_SEL_1)
2213            .addImm(Lo.getSExtValue())
2214 .addImm(0) // op_sel_lo
2215 .addImm(0) // op_sel_hi
2216 .addImm(0) // neg_lo
2217 .addImm(0) // neg_hi
2218 .addImm(0); // clamp
2219 } else {
2220 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2221 .addImm(Lo.getSExtValue())
2223 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2224 .addImm(Hi.getSExtValue())
2226 }
2227 } else {
2228 assert(SrcOp.isReg());
2229 if (ST.hasPkMovB32() &&
2230 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2231 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2232 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2233 .addReg(SrcOp.getReg())
2234            .addImm(SISrcMods::OP_SEL_1) // src1_mod
2235            .addReg(SrcOp.getReg())
2236 .addImm(0) // op_sel_lo
2237 .addImm(0) // op_sel_hi
2238 .addImm(0) // neg_lo
2239 .addImm(0) // neg_hi
2240 .addImm(0); // clamp
2241 } else {
2242 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2243 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2245 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2246 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2248 }
2249 }
2250 MI.eraseFromParent();
2251 break;
2252 }
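  // Illustrative expansions of V_MOV_B64_PSEUDO (registers schematic):
  //  * with v_mov_b64 available, a register source, an inline constant or a
  //    32-bit unsigned immediate is rewritten in place to v_mov_b64_e32;
  //  * an immediate whose two halves are the same inline constant can use
  //      v_pk_mov_b32 v[0:1], 42, 42
  //  * otherwise the value is split per 32-bit half:
  //      v_mov_b32 v0, lo32(imm)
  //      v_mov_b32 v1, hi32(imm)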
2253 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2255 break;
2256 }
2257 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2258 const MachineOperand &SrcOp = MI.getOperand(1);
2259 assert(!SrcOp.isFPImm());
2260 APInt Imm(64, SrcOp.getImm());
2261 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2262 MI.setDesc(get(AMDGPU::S_MOV_B64));
2263 break;
2264 }
2265
2266 Register Dst = MI.getOperand(0).getReg();
2267 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2268 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2269
2270 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2271 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2272 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2273 .addImm(Lo.getSExtValue())
2275 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2276 .addImm(Hi.getSExtValue())
2278 MI.eraseFromParent();
2279 break;
2280 }
2281 case AMDGPU::V_SET_INACTIVE_B32: {
2282 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2283 Register DstReg = MI.getOperand(0).getReg();
2284 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2285 .add(MI.getOperand(3))
2286 .add(MI.getOperand(4))
2287 .add(MI.getOperand(1))
2288 .add(MI.getOperand(2))
2289 .add(MI.getOperand(5));
2290 MI.eraseFromParent();
2291 break;
2292 }
2293 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2294 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2305 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2306 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2322 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2323
2324 unsigned Opc;
2325 if (RI.hasVGPRs(EltRC)) {
2326 Opc = AMDGPU::V_MOVRELD_B32_e32;
2327 } else {
2328 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2329 : AMDGPU::S_MOVRELD_B32;
2330 }
2331
2332 const MCInstrDesc &OpDesc = get(Opc);
2333 Register VecReg = MI.getOperand(0).getReg();
2334 bool IsUndef = MI.getOperand(1).isUndef();
2335 unsigned SubReg = MI.getOperand(3).getImm();
2336 assert(VecReg == MI.getOperand(1).getReg());
2337
2339 BuildMI(MBB, MI, DL, OpDesc)
2340 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2341 .add(MI.getOperand(2))
2343 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2344
2345 const int ImpDefIdx =
2346 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2347 const int ImpUseIdx = ImpDefIdx + 1;
2348 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2349 MI.eraseFromParent();
2350 break;
2351 }
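  // Schematic example: a V_INDIRECT_REG_WRITE_MOVREL_B32_V4 on v[0:3] becomes
  //   v_movreld_b32 v0, <src>   ; element selected by m0
  // with v[0:3] tied as an implicit def/use so the whole vector stays live
  // across the partial write.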
2352 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2353 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2365 Register VecReg = MI.getOperand(0).getReg();
2366 bool IsUndef = MI.getOperand(1).isUndef();
2367 Register Idx = MI.getOperand(3).getReg();
2368 Register SubReg = MI.getOperand(4).getImm();
2369
2370 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2371 .addReg(Idx)
2373 SetOn->getOperand(3).setIsUndef();
2374
2375 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2377 BuildMI(MBB, MI, DL, OpDesc)
2378 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2379 .add(MI.getOperand(2))
2381 .addReg(VecReg,
2382 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2383
2384 const int ImpDefIdx =
2385 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2386 const int ImpUseIdx = ImpDefIdx + 1;
2387 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2388
2389 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2390
2391 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2392
2393 MI.eraseFromParent();
2394 break;
2395 }
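  // Schematic example of the GPR-index form; the three instructions are
  // bundled so nothing is scheduled between the mode switch and the write:
  //   s_set_gpr_idx_on <idx>, gpr_idx(DST)
  //   v_mov_b32 v0, <src>       ; v0 stands for the first vector element
  //   s_set_gpr_idx_off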
2396 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2397 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2409 Register Dst = MI.getOperand(0).getReg();
2410 Register VecReg = MI.getOperand(1).getReg();
2411 bool IsUndef = MI.getOperand(1).isUndef();
2412 Register Idx = MI.getOperand(2).getReg();
2413 Register SubReg = MI.getOperand(3).getImm();
2414
2415 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2416 .addReg(Idx)
2418 SetOn->getOperand(3).setIsUndef();
2419
2420 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2421 .addDef(Dst)
2422 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2423 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2424
2425 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2426
2427 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2428
2429 MI.eraseFromParent();
2430 break;
2431 }
2432 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2433 MachineFunction &MF = *MBB.getParent();
2434 Register Reg = MI.getOperand(0).getReg();
2435 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2436 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2437 MachineOperand OpLo = MI.getOperand(1);
2438 MachineOperand OpHi = MI.getOperand(2);
2439
2440 // Create a bundle so these instructions won't be re-ordered by the
2441 // post-RA scheduler.
2442 MIBundleBuilder Bundler(MBB, MI);
2443 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2444
2445 // What we want here is an offset from the value returned by s_getpc (which
2446 // is the address of the s_add_u32 instruction) to the global variable, but
2447 // since the encoding of $symbol starts 4 bytes after the start of the
2448 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2449 // small. This requires us to add 4 to the global variable offset in order
2450 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2451 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2452 // instruction.
2453
2454 int64_t Adjust = 0;
2455 if (ST.hasGetPCZeroExtension()) {
2456 // Fix up hardware that does not sign-extend the 48-bit PC value by
2457 // inserting: s_sext_i32_i16 reghi, reghi
2458 Bundler.append(
2459 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2460 Adjust += 4;
2461 }
2462
2463 if (OpLo.isGlobal())
2464 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2465 Bundler.append(
2466 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2467
2468 if (OpHi.isGlobal())
2469 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2470 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2471 .addReg(RegHi)
2472 .add(OpHi));
2473
2474 finalizeBundle(MBB, Bundler.begin());
2475
2476 MI.eraseFromParent();
2477 break;
2478 }
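  // Worked example (no zero-extension workaround, so Adjust == 0):
  //   SI_PC_ADD_REL_OFFSET s[0:1], @gv
  // becomes, roughly,
  //   s_getpc_b64 s[0:1]
  //   s_add_u32   s0, s0, gv@rel32@lo+4   ; literal sits 4 bytes into s_add_u32
  //   s_addc_u32  s1, s1, gv@rel32@hi+12  ; and 12 bytes in for s_addc_u32
  // so s[0:1] ends up holding the address of @gv.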
2479 case AMDGPU::ENTER_STRICT_WWM: {
2480 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2481 // Whole Wave Mode is entered.
2482 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2483 : AMDGPU::S_OR_SAVEEXEC_B64));
2484 break;
2485 }
2486 case AMDGPU::ENTER_STRICT_WQM: {
2487 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2488 // STRICT_WQM is entered.
2489 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2490 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2491 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2492 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2493 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2494
2495 MI.eraseFromParent();
2496 break;
2497 }
2498 case AMDGPU::EXIT_STRICT_WWM:
2499 case AMDGPU::EXIT_STRICT_WQM: {
2500 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2501    // WWM/STRICT_WQM is exited.
2502 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2503 break;
2504 }
2505 case AMDGPU::SI_RETURN: {
2506 const MachineFunction *MF = MBB.getParent();
2507 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2508 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2509 // Hiding the return address use with SI_RETURN may lead to extra kills in
2510 // the function and missing live-ins. We are fine in practice because callee
2511 // saved register handling ensures the register value is restored before
2512 // RET, but we need the undef flag here to appease the MachineVerifier
2513 // liveness checks.
2515 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2516 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2517
2518 MIB.copyImplicitOps(MI);
2519 MI.eraseFromParent();
2520 break;
2521 }
2522
2523 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2524 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2525 MI.setDesc(get(AMDGPU::S_MUL_U64));
2526 break;
2527
2528 case AMDGPU::S_GETPC_B64_pseudo:
2529 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2530 if (ST.hasGetPCZeroExtension()) {
2531 Register Dst = MI.getOperand(0).getReg();
2532 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2533 // Fix up hardware that does not sign-extend the 48-bit PC value by
2534 // inserting: s_sext_i32_i16 dsthi, dsthi
2535 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2536 DstHi)
2537 .addReg(DstHi);
2538 }
2539 break;
2540 }
2541 return true;
2542}
2543
2546 unsigned SubIdx, const MachineInstr &Orig,
2547 const TargetRegisterInfo &RI) const {
2548
2549 // Try shrinking the instruction to remat only the part needed for current
2550 // context.
2551 // TODO: Handle more cases.
2552 unsigned Opcode = Orig.getOpcode();
2553 switch (Opcode) {
2554 case AMDGPU::S_LOAD_DWORDX16_IMM:
2555 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2556 if (SubIdx != 0)
2557 break;
2558
2559 if (I == MBB.end())
2560 break;
2561
2562 if (I->isBundled())
2563 break;
2564
2565 // Look for a single use of the register that is also a subreg.
2566 Register RegToFind = Orig.getOperand(0).getReg();
2567 MachineOperand *UseMO = nullptr;
2568 for (auto &CandMO : I->operands()) {
2569 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2570 continue;
2571 if (UseMO) {
2572 UseMO = nullptr;
2573 break;
2574 }
2575 UseMO = &CandMO;
2576 }
2577 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2578 break;
2579
2580 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2581 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2582
2585 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2586
2587 unsigned NewOpcode = -1;
2588 if (SubregSize == 256)
2589 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2590 else if (SubregSize == 128)
2591 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2592 else
2593 break;
2594
2595 const MCInstrDesc &TID = get(NewOpcode);
2596 const TargetRegisterClass *NewRC =
2597 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2598 MRI.setRegClass(DestReg, NewRC);
2599
2600 UseMO->setReg(DestReg);
2601 UseMO->setSubReg(AMDGPU::NoSubRegister);
2602
2603 // Use a smaller load with the desired size, possibly with updated offset.
2604 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2605 MI->setDesc(TID);
2606 MI->getOperand(0).setReg(DestReg);
2607 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2608 if (Offset) {
2609 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2610 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2611 OffsetMO->setImm(FinalOffset);
2612 }
2614 for (const MachineMemOperand *MemOp : Orig.memoperands())
2615 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2616 SubregSize / 8));
2617 MI->setMemRefs(*MF, NewMMOs);
2618
2619 MBB.insert(I, MI);
2620 return;
2621 }
2622
2623 default:
2624 break;
2625 }
2626
2627 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2628}
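// Illustrative example of the shrinking remat above: cloning
//   %0:sgpr_512 = S_LOAD_DWORDX16_IMM %desc, 16, 0
// when its only user reads %0.sub4_sub5_sub6_sub7 instead produces
//   %1:sgpr_128 = S_LOAD_DWORDX4_IMM %desc, 32, 0
// i.e. a 128-bit load whose offset advances by the subregister's byte offset
// (Offset / 8, since getSubRegIdxOffset is in bits), and the user is rewritten
// to read %1 with no subregister index. Register classes shown are schematic.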
2629
2630std::pair<MachineInstr*, MachineInstr*>
2632 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2633
2634 if (ST.hasMovB64() &&
2636 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2637 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2638 return std::pair(&MI, nullptr);
2639 }
2640
2641 MachineBasicBlock &MBB = *MI.getParent();
2645 Register Dst = MI.getOperand(0).getReg();
2646 unsigned Part = 0;
2647 MachineInstr *Split[2];
2648
2649 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2650 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2651 if (Dst.isPhysical()) {
2652 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2653 } else {
2654 assert(MRI.isSSA());
2655 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2656 MovDPP.addDef(Tmp);
2657 }
2658
2659 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2660 const MachineOperand &SrcOp = MI.getOperand(I);
2661 assert(!SrcOp.isFPImm());
2662 if (SrcOp.isImm()) {
2663 APInt Imm(64, SrcOp.getImm());
2664 Imm.ashrInPlace(Part * 32);
2665 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2666 } else {
2667 assert(SrcOp.isReg());
2668 Register Src = SrcOp.getReg();
2669 if (Src.isPhysical())
2670 MovDPP.addReg(RI.getSubReg(Src, Sub));
2671 else
2672 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2673 }
2674 }
2675
2676 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2677 MovDPP.addImm(MO.getImm());
2678
2679 Split[Part] = MovDPP;
2680 ++Part;
2681 }
2682
2683 if (Dst.isVirtual())
2684 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2685 .addReg(Split[0]->getOperand(0).getReg())
2686 .addImm(AMDGPU::sub0)
2687 .addReg(Split[1]->getOperand(0).getReg())
2688 .addImm(AMDGPU::sub1);
2689
2690 MI.eraseFromParent();
2691 return std::pair(Split[0], Split[1]);
2692}
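// Schematic result for a virtual 64-bit destination:
//   %lo:vgpr_32 = V_MOV_B32_dpp %old.sub0, %src.sub0, <dpp controls...>
//   %hi:vgpr_32 = V_MOV_B32_dpp %old.sub1, %src.sub1, <dpp controls...>
//   %dst        = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// On subtargets with v_mov_b64_dpp and a dpp_ctrl that is legal for 64-bit
// DPP, the pseudo is instead rewritten in place and not split.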
2693
2694std::optional<DestSourcePair>
2696 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2697 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2698
2699 return std::nullopt;
2700}
2701
2703 MachineOperand &Src0,
2704 unsigned Src0OpName,
2705 MachineOperand &Src1,
2706 unsigned Src1OpName) const {
2707 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2708 if (!Src0Mods)
2709 return false;
2710
2711 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2712 assert(Src1Mods &&
2713 "All commutable instructions have both src0 and src1 modifiers");
2714
2715 int Src0ModsVal = Src0Mods->getImm();
2716 int Src1ModsVal = Src1Mods->getImm();
2717
2718 Src1Mods->setImm(Src0ModsVal);
2719 Src0Mods->setImm(Src1ModsVal);
2720 return true;
2721}
2722
2724 MachineOperand &RegOp,
2725 MachineOperand &NonRegOp) {
2726 Register Reg = RegOp.getReg();
2727 unsigned SubReg = RegOp.getSubReg();
2728 bool IsKill = RegOp.isKill();
2729 bool IsDead = RegOp.isDead();
2730 bool IsUndef = RegOp.isUndef();
2731 bool IsDebug = RegOp.isDebug();
2732
2733 if (NonRegOp.isImm())
2734 RegOp.ChangeToImmediate(NonRegOp.getImm());
2735 else if (NonRegOp.isFI())
2736 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2737 else if (NonRegOp.isGlobal()) {
2738 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2739 NonRegOp.getTargetFlags());
2740 } else
2741 return nullptr;
2742
2743 // Make sure we don't reinterpret a subreg index in the target flags.
2744 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2745
2746 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2747 NonRegOp.setSubReg(SubReg);
2748
2749 return &MI;
2750}
2751
2753 unsigned Src0Idx,
2754 unsigned Src1Idx) const {
2755 assert(!NewMI && "this should never be used");
2756
2757 unsigned Opc = MI.getOpcode();
2758 int CommutedOpcode = commuteOpcode(Opc);
2759 if (CommutedOpcode == -1)
2760 return nullptr;
2761
2762 if (Src0Idx > Src1Idx)
2763 std::swap(Src0Idx, Src1Idx);
2764
2765 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2766 static_cast<int>(Src0Idx) &&
2767 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2768 static_cast<int>(Src1Idx) &&
2769 "inconsistency with findCommutedOpIndices");
2770
2771 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2772 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2773
2774 MachineInstr *CommutedMI = nullptr;
2775 if (Src0.isReg() && Src1.isReg()) {
2776 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2777 // Be sure to copy the source modifiers to the right place.
2778 CommutedMI
2779 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2780 }
2781
2782 } else if (Src0.isReg() && !Src1.isReg()) {
2783 if (isOperandLegal(MI, Src1Idx, &Src0))
2784 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2785 } else if (!Src0.isReg() && Src1.isReg()) {
2786 if (isOperandLegal(MI, Src1Idx, &Src0))
2787 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2788 } else {
2789 // FIXME: Found two non registers to commute. This does happen.
2790 return nullptr;
2791 }
2792
2793 if (CommutedMI) {
2794 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2795 Src1, AMDGPU::OpName::src1_modifiers);
2796
2797 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2798 AMDGPU::OpName::src1_sel);
2799
2800 CommutedMI->setDesc(get(CommutedOpcode));
2801 }
2802
2803 return CommutedMI;
2804}
2805
2806// This needs to be implemented because the source modifiers may be inserted
2807// between the true commutable operands, and the base
2808// TargetInstrInfo::commuteInstruction uses it.
2810 unsigned &SrcOpIdx0,
2811 unsigned &SrcOpIdx1) const {
2812 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2813}
2814
2816 unsigned &SrcOpIdx0,
2817 unsigned &SrcOpIdx1) const {
2818 if (!Desc.isCommutable())
2819 return false;
2820
2821 unsigned Opc = Desc.getOpcode();
2822 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2823 if (Src0Idx == -1)
2824 return false;
2825
2826 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2827 if (Src1Idx == -1)
2828 return false;
2829
2830 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2831}
2832
2834 int64_t BrOffset) const {
2835 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2836 // block is unanalyzable.
2837 assert(BranchOp != AMDGPU::S_SETPC_B64);
2838
2839 // Convert to dwords.
2840 BrOffset /= 4;
2841
2842 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2843 // from the next instruction.
2844 BrOffset -= 1;
2845
2846 return isIntN(BranchOffsetBits, BrOffset);
2847}
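// Worked example with the (default) 16-bit signed branch offset: a byte
// offset of 131068 gives 131068 / 4 - 1 = 32766 dwords and is in range,
// while 131076 gives 32768, which no longer fits in a signed 16-bit field,
// so branch relaxation must fall back to insertIndirectBranch below.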
2848
2851 return MI.getOperand(0).getMBB();
2852}
2853
2855 for (const MachineInstr &MI : MBB->terminators()) {
2856 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2857 MI.getOpcode() == AMDGPU::SI_LOOP)
2858 return true;
2859 }
2860 return false;
2861}
2862
2864 MachineBasicBlock &DestBB,
2865 MachineBasicBlock &RestoreBB,
2866 const DebugLoc &DL, int64_t BrOffset,
2867 RegScavenger *RS) const {
2868 assert(RS && "RegScavenger required for long branching");
2869 assert(MBB.empty() &&
2870 "new block should be inserted for expanding unconditional branch");
2871 assert(MBB.pred_size() == 1);
2872 assert(RestoreBB.empty() &&
2873 "restore block should be inserted for restoring clobbered registers");
2874
2878
2879 // FIXME: Virtual register workaround for RegScavenger not working with empty
2880 // blocks.
2881 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2882
2883 auto I = MBB.end();
2884
2885 // Note: as this is used after hazard recognizer we need to apply some hazard
2886 // workarounds directly.
2887 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2889 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2890 if (FlushSGPRWrites)
2891 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2893 };
2894
2895 // We need to compute the offset relative to the instruction immediately after
2896  // s_getpc_b64. Insert pc arithmetic code before the last terminator.
2897 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2898 ApplyHazardWorkarounds();
2899
2900 auto &MCCtx = MF->getContext();
2901 MCSymbol *PostGetPCLabel =
2902 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2903 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2904
2905 MCSymbol *OffsetLo =
2906 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2907 MCSymbol *OffsetHi =
2908 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2909 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2910 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2911 .addReg(PCReg, 0, AMDGPU::sub0)
2912 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2913 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2914 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2915 .addReg(PCReg, 0, AMDGPU::sub1)
2916 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2917 ApplyHazardWorkarounds();
2918
2919 // Insert the indirect branch after the other terminator.
2920 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2921 .addReg(PCReg);
2922
2923 // If a spill is needed for the pc register pair, we need to insert a spill
2924 // restore block right before the destination block, and insert a short branch
2925 // into the old destination block's fallthrough predecessor.
2926 // e.g.:
2927 //
2928 // s_cbranch_scc0 skip_long_branch:
2929 //
2930 // long_branch_bb:
2931 // spill s[8:9]
2932 // s_getpc_b64 s[8:9]
2933 // s_add_u32 s8, s8, restore_bb
2934 // s_addc_u32 s9, s9, 0
2935 // s_setpc_b64 s[8:9]
2936 //
2937 // skip_long_branch:
2938 // foo;
2939 //
2940 // .....
2941 //
2942 // dest_bb_fallthrough_predecessor:
2943 // bar;
2944 // s_branch dest_bb
2945 //
2946 // restore_bb:
2947 // restore s[8:9]
2948 // fallthrough dest_bb
2949  //
2950 // dest_bb:
2951 // buzz;
2952
2953 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2954 Register Scav;
2955
2956  // If we've previously reserved a register for long branches,
2957  // avoid running the scavenger and just use that register.
2958 if (LongBranchReservedReg) {
2959 RS->enterBasicBlock(MBB);
2960 Scav = LongBranchReservedReg;
2961 } else {
2963 Scav = RS->scavengeRegisterBackwards(
2964 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2965 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2966 }
2967 if (Scav) {
2968 RS->setRegUsed(Scav);
2969 MRI.replaceRegWith(PCReg, Scav);
2970 MRI.clearVirtRegs();
2971 } else {
2972    // As an SGPR spill needs a VGPR, we reuse the slot of the temporary VGPR
2973    // for the SGPR spill.
2974 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2975 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2976 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2977 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2978 MRI.clearVirtRegs();
2979 }
2980
2981 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2982  // Now the distance can be defined.
2984 MCSymbolRefExpr::create(DestLabel, MCCtx),
2985 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2986 // Add offset assignments.
2987 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2988 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2989 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2990 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2991}
2992
2993unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2994 switch (Cond) {
2995 case SIInstrInfo::SCC_TRUE:
2996 return AMDGPU::S_CBRANCH_SCC1;
2997 case SIInstrInfo::SCC_FALSE:
2998 return AMDGPU::S_CBRANCH_SCC0;
2999 case SIInstrInfo::VCCNZ:
3000 return AMDGPU::S_CBRANCH_VCCNZ;
3001 case SIInstrInfo::VCCZ:
3002 return AMDGPU::S_CBRANCH_VCCZ;
3003 case SIInstrInfo::EXECNZ:
3004 return AMDGPU::S_CBRANCH_EXECNZ;
3005 case SIInstrInfo::EXECZ:
3006 return AMDGPU::S_CBRANCH_EXECZ;
3007 default:
3008 llvm_unreachable("invalid branch predicate");
3009 }
3010}
3011
3012SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3013 switch (Opcode) {
3014 case AMDGPU::S_CBRANCH_SCC0:
3015 return SCC_FALSE;
3016 case AMDGPU::S_CBRANCH_SCC1:
3017 return SCC_TRUE;
3018 case AMDGPU::S_CBRANCH_VCCNZ:
3019 return VCCNZ;
3020 case AMDGPU::S_CBRANCH_VCCZ:
3021 return VCCZ;
3022 case AMDGPU::S_CBRANCH_EXECNZ:
3023 return EXECNZ;
3024 case AMDGPU::S_CBRANCH_EXECZ:
3025 return EXECZ;
3026 default:
3027 return INVALID_BR;
3028 }
3029}
3030
3034 MachineBasicBlock *&FBB,
3036 bool AllowModify) const {
3037 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3038 // Unconditional Branch
3039 TBB = I->getOperand(0).getMBB();
3040 return false;
3041 }
3042
3043 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3044 if (Pred == INVALID_BR)
3045 return true;
3046
3047 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3048 Cond.push_back(MachineOperand::CreateImm(Pred));
3049 Cond.push_back(I->getOperand(1)); // Save the branch register.
3050
3051 ++I;
3052
3053 if (I == MBB.end()) {
3054 // Conditional branch followed by fall-through.
3055 TBB = CondBB;
3056 return false;
3057 }
3058
3059 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3060 TBB = CondBB;
3061 FBB = I->getOperand(0).getMBB();
3062 return false;
3063 }
3064
3065 return true;
3066}
3067
3069 MachineBasicBlock *&FBB,
3071 bool AllowModify) const {
3073 auto E = MBB.end();
3074 if (I == E)
3075 return false;
3076
3077  // Skip over the instructions that are artificial terminators for special
3078  // exec management.
3079 while (I != E && !I->isBranch() && !I->isReturn()) {
3080 switch (I->getOpcode()) {
3081 case AMDGPU::S_MOV_B64_term:
3082 case AMDGPU::S_XOR_B64_term:
3083 case AMDGPU::S_OR_B64_term:
3084 case AMDGPU::S_ANDN2_B64_term:
3085 case AMDGPU::S_AND_B64_term:
3086 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3087 case AMDGPU::S_MOV_B32_term:
3088 case AMDGPU::S_XOR_B32_term:
3089 case AMDGPU::S_OR_B32_term:
3090 case AMDGPU::S_ANDN2_B32_term:
3091 case AMDGPU::S_AND_B32_term:
3092 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3093 break;
3094 case AMDGPU::SI_IF:
3095 case AMDGPU::SI_ELSE:
3096 case AMDGPU::SI_KILL_I1_TERMINATOR:
3097 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3098 // FIXME: It's messy that these need to be considered here at all.
3099 return true;
3100 default:
3101 llvm_unreachable("unexpected non-branch terminator inst");
3102 }
3103
3104 ++I;
3105 }
3106
3107 if (I == E)
3108 return false;
3109
3110 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3111}
3112
3114 int *BytesRemoved) const {
3115 unsigned Count = 0;
3116 unsigned RemovedSize = 0;
3118 // Skip over artificial terminators when removing instructions.
3119 if (MI.isBranch() || MI.isReturn()) {
3120 RemovedSize += getInstSizeInBytes(MI);
3121 MI.eraseFromParent();
3122 ++Count;
3123 }
3124 }
3125
3126 if (BytesRemoved)
3127 *BytesRemoved = RemovedSize;
3128
3129 return Count;
3130}
3131
3132// Copy the flags onto the implicit condition register operand.
3134 const MachineOperand &OrigCond) {
3135 CondReg.setIsUndef(OrigCond.isUndef());
3136 CondReg.setIsKill(OrigCond.isKill());
3137}
3138
3141 MachineBasicBlock *FBB,
3143 const DebugLoc &DL,
3144 int *BytesAdded) const {
3145 if (!FBB && Cond.empty()) {
3146 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3147 .addMBB(TBB);
3148 if (BytesAdded)
3149 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3150 return 1;
3151 }
3152
3153 assert(TBB && Cond[0].isImm());
3154
3155 unsigned Opcode
3156 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3157
3158 if (!FBB) {
3159 MachineInstr *CondBr =
3160 BuildMI(&MBB, DL, get(Opcode))
3161 .addMBB(TBB);
3162
3163 // Copy the flags onto the implicit condition register operand.
3164 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3165 fixImplicitOperands(*CondBr);
3166
3167 if (BytesAdded)
3168 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3169 return 1;
3170 }
3171
3172 assert(TBB && FBB);
3173
3174 MachineInstr *CondBr =
3175 BuildMI(&MBB, DL, get(Opcode))
3176 .addMBB(TBB);
3177 fixImplicitOperands(*CondBr);
3178 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3179 .addMBB(FBB);
3180
3181 MachineOperand &CondReg = CondBr->getOperand(1);
3182 CondReg.setIsUndef(Cond[1].isUndef());
3183 CondReg.setIsKill(Cond[1].isKill());
3184
3185 if (BytesAdded)
3186 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3187
3188 return 2;
3189}
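// Illustrative results: a conditional branch with no false block appends a
// single "s_cbranch_scc1 %TBB" and reports 4 bytes added (8 with the
// offset3f bug workaround); with both TBB and FBB it appends
//   s_cbranch_scc1 %TBB
//   s_branch       %FBB
// and reports two instructions and 8 (or 16) bytes.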
3190
3193 if (Cond.size() != 2) {
3194 return true;
3195 }
3196
3197 if (Cond[0].isImm()) {
3198 Cond[0].setImm(-Cond[0].getImm());
3199 return false;
3200 }
3201
3202 return true;
3203}
3204
3207 Register DstReg, Register TrueReg,
3208 Register FalseReg, int &CondCycles,
3209 int &TrueCycles, int &FalseCycles) const {
3210 switch (Cond[0].getImm()) {
3211 case VCCNZ:
3212 case VCCZ: {
3214 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3215 if (MRI.getRegClass(FalseReg) != RC)
3216 return false;
3217
3218 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3219 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3220
3221 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3222 return RI.hasVGPRs(RC) && NumInsts <= 6;
3223 }
3224 case SCC_TRUE:
3225 case SCC_FALSE: {
3226 // FIXME: We could insert for VGPRs if we could replace the original compare
3227 // with a vector one.
3229 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3230 if (MRI.getRegClass(FalseReg) != RC)
3231 return false;
3232
3233 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3234
3235    // Sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
3236 if (NumInsts % 2 == 0)
3237 NumInsts /= 2;
3238
3239 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3240 return RI.isSGPRClass(RC);
3241 }
3242 default:
3243 return false;
3244 }
3245}
3246
3250 Register TrueReg, Register FalseReg) const {
3251 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3252 if (Pred == VCCZ || Pred == SCC_FALSE) {
3253 Pred = static_cast<BranchPredicate>(-Pred);
3254 std::swap(TrueReg, FalseReg);
3255 }
3256
3258 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3259 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3260
3261 if (DstSize == 32) {
3263 if (Pred == SCC_TRUE) {
3264 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3265 .addReg(TrueReg)
3266 .addReg(FalseReg);
3267 } else {
3268 // Instruction's operands are backwards from what is expected.
3269 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3270 .addReg(FalseReg)
3271 .addReg(TrueReg);
3272 }
3273
3274 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3275 return;
3276 }
3277
3278 if (DstSize == 64 && Pred == SCC_TRUE) {
3280 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3281 .addReg(TrueReg)
3282 .addReg(FalseReg);
3283
3284 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3285 return;
3286 }
3287
3288 static const int16_t Sub0_15[] = {
3289 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3290 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3291 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3292 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3293 };
3294
3295 static const int16_t Sub0_15_64[] = {
3296 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3297 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3298 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3299 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3300 };
3301
3302 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3303 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3304 const int16_t *SubIndices = Sub0_15;
3305 int NElts = DstSize / 32;
3306
3307 // 64-bit select is only available for SALU.
3308 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3309 if (Pred == SCC_TRUE) {
3310 if (NElts % 2) {
3311 SelOp = AMDGPU::S_CSELECT_B32;
3312 EltRC = &AMDGPU::SGPR_32RegClass;
3313 } else {
3314 SelOp = AMDGPU::S_CSELECT_B64;
3315 EltRC = &AMDGPU::SGPR_64RegClass;
3316 SubIndices = Sub0_15_64;
3317 NElts /= 2;
3318 }
3319 }
3320
3322 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3323
3324 I = MIB->getIterator();
3325
3327 for (int Idx = 0; Idx != NElts; ++Idx) {
3328 Register DstElt = MRI.createVirtualRegister(EltRC);
3329 Regs.push_back(DstElt);
3330
3331 unsigned SubIdx = SubIndices[Idx];
3332
3334 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3335 Select =
3336 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3337 .addReg(FalseReg, 0, SubIdx)
3338 .addReg(TrueReg, 0, SubIdx);
3339 } else {
3340 Select =
3341 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3342 .addReg(TrueReg, 0, SubIdx)
3343 .addReg(FalseReg, 0, SubIdx);
3344 }
3345
3346 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3348
3349 MIB.addReg(DstElt)
3350 .addImm(SubIdx);
3351 }
3352}
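// Schematic expansion: a 128-bit VGPR select on VCC becomes four per-dword
// selects stitched back together,
//   %e0 = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   ...
//   %e3 = V_CNDMASK_B32_e32 %false.sub3, %true.sub3, implicit $vcc
//   %dst = REG_SEQUENCE %e0, %subreg.sub0, ..., %e3, %subreg.sub3
// while a 128-bit SGPR select on SCC uses two S_CSELECT_B64 over the
// sub0_sub1 and sub2_sub3 halves.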
3353
3355 switch (MI.getOpcode()) {
3356 case AMDGPU::V_MOV_B16_t16_e32:
3357 case AMDGPU::V_MOV_B16_t16_e64:
3358 case AMDGPU::V_MOV_B32_e32:
3359 case AMDGPU::V_MOV_B32_e64:
3360 case AMDGPU::V_MOV_B64_PSEUDO:
3361 case AMDGPU::V_MOV_B64_e32:
3362 case AMDGPU::V_MOV_B64_e64:
3363 case AMDGPU::S_MOV_B32:
3364 case AMDGPU::S_MOV_B64:
3365 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3366 case AMDGPU::COPY:
3367 case AMDGPU::WWM_COPY:
3368 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3369 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3370 case AMDGPU::V_ACCVGPR_MOV_B32:
3371 return true;
3372 default:
3373 return false;
3374 }
3375}
3376
3377static constexpr unsigned ModifierOpNames[] = {
3378 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3379 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3380 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3381
3383 unsigned Opc = MI.getOpcode();
3384 for (unsigned Name : reverse(ModifierOpNames)) {
3386 if (Idx >= 0)
3387 MI.removeOperand(Idx);
3388 }
3389}
3390
3392 Register Reg, MachineRegisterInfo *MRI) const {
3393 if (!MRI->hasOneNonDBGUse(Reg))
3394 return false;
3395
3396 switch (DefMI.getOpcode()) {
3397 default:
3398 return false;
3399 case AMDGPU::V_MOV_B64_e32:
3400 case AMDGPU::S_MOV_B64:
3401 case AMDGPU::V_MOV_B64_PSEUDO:
3402 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3403 case AMDGPU::V_MOV_B32_e32:
3404 case AMDGPU::S_MOV_B32:
3405 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3406 break;
3407 }
3408
3409 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3410 assert(ImmOp);
3411 // FIXME: We could handle FrameIndex values here.
3412 if (!ImmOp->isImm())
3413 return false;
3414
3415 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3416 int64_t Imm = ImmOp->getImm();
3417 switch (UseOp.getSubReg()) {
3418 default:
3419 return Imm;
3420 case AMDGPU::sub0:
3421 return Lo_32(Imm);
3422 case AMDGPU::sub1:
3423 return Hi_32(Imm);
3424 case AMDGPU::lo16:
3425 return SignExtend64<16>(Imm);
3426 case AMDGPU::hi16:
3427 return SignExtend64<16>(Imm >> 16);
3428 case AMDGPU::sub1_lo16:
3429 return SignExtend64<16>(Imm >> 32);
3430 case AMDGPU::sub1_hi16:
3431 return SignExtend64<16>(Imm >> 48);
3432 }
3433 };
3434
3435 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3436
3437 unsigned Opc = UseMI.getOpcode();
3438 if (Opc == AMDGPU::COPY) {
3439 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3440
3441 Register DstReg = UseMI.getOperand(0).getReg();
3442 unsigned OpSize = getOpSize(UseMI, 0);
3443 bool Is16Bit = OpSize == 2;
3444 bool Is64Bit = OpSize == 8;
3445 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3446 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3447 : AMDGPU::V_MOV_B32_e32
3448 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3449 : AMDGPU::S_MOV_B32;
3450 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)),
3451 /*isSigned=*/true, /*implicitTrunc=*/true);
3452
3453 if (RI.isAGPR(*MRI, DstReg)) {
3454 if (Is64Bit || !isInlineConstant(Imm))
3455 return false;
3456 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3457 }
3458
3459 if (Is16Bit) {
3460 if (isVGPRCopy)
3461 return false; // Do not clobber vgpr_hi16
3462
3463 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3464 return false;
3465
3466 UseMI.getOperand(0).setSubReg(0);
3467 if (DstReg.isPhysical()) {
3468 DstReg = RI.get32BitRegister(DstReg);
3469 UseMI.getOperand(0).setReg(DstReg);
3470 }
3471 assert(UseMI.getOperand(1).getReg().isVirtual());
3472 }
3473
3474 const MCInstrDesc &NewMCID = get(NewOpc);
3475 if (DstReg.isPhysical() &&
3476 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3477 return false;
3478
3479 UseMI.setDesc(NewMCID);
3480 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3481 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3482 return true;
3483 }
3484
3485 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3486 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3487 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3488 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3489 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
3490 // Don't fold if we are using source or output modifiers. The new VOP2
3491 // instructions don't have them.
3493 return false;
3494
3495 // If this is a free constant, there's no reason to do this.
3496 // TODO: We could fold this here instead of letting SIFoldOperands do it
3497 // later.
3498 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3499
3500 // Any src operand can be used for the legality check.
3501 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3502 return false;
3503
3504 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3505 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3506 bool IsFMA =
3507 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3508 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3509 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3510 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3511 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3512
3513 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3514 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3515 (Src1->isReg() && Src1->getReg() == Reg)) {
3516 MachineOperand *RegSrc =
3517 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3518 if (!RegSrc->isReg())
3519 return false;
3520 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3521 ST.getConstantBusLimit(Opc) < 2)
3522 return false;
3523
3524 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3525 return false;
3526
3527 // If src2 is also a literal constant then we have to choose which one to
3528 // fold. In general it is better to choose madak so that the other literal
3529 // can be materialized in an sgpr instead of a vgpr:
3530 // s_mov_b32 s0, literal
3531 // v_madak_f32 v0, s0, v0, literal
3532 // Instead of:
3533 // v_mov_b32 v1, literal
3534 // v_madmk_f32 v0, v0, literal, v1
3535 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3536 if (Def && Def->isMoveImmediate() &&
3537 !isInlineConstant(Def->getOperand(1)))
3538 return false;
3539
3540 unsigned NewOpc =
3541 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3542 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3543 : AMDGPU::V_FMAMK_F16)
3544 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3545 if (pseudoToMCOpcode(NewOpc) == -1)
3546 return false;
3547
3548 // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3549 // would also require restricting their register classes. For now
3550 // just bail out.
3551 if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3552 return false;
3553
3554 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3555
3556 // FIXME: This would be a lot easier if we could return a new instruction
3557 // instead of having to modify in place.
3558
3559 Register SrcReg = RegSrc->getReg();
3560 unsigned SrcSubReg = RegSrc->getSubReg();
3561 Src0->setReg(SrcReg);
3562 Src0->setSubReg(SrcSubReg);
3563 Src0->setIsKill(RegSrc->isKill());
3564
3565 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3566 Opc == AMDGPU::V_FMAC_F32_e64 ||
3567 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3568 UseMI.untieRegOperand(
3569 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3570
3571 Src1->ChangeToImmediate(Imm);
3572
3573 removeModOperands(UseMI);
3574 UseMI.setDesc(get(NewOpc));
3575
3576 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3577 if (DeleteDef)
3578 DefMI.eraseFromParent();
3579
3580 return true;
3581 }
3582
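// Illustrative note: when the folded literal feeds the addend (src2) instead,
// the madak/fmaak form handled below is used, e.g.
//   v_mov_b32 v3, 0x42f60000
//   v_mad_f32 v0, v1, v2, v3   -->   v_madak_f32 v0, v1, v2, 0x42f60000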
3583 // Added part is the constant: Use v_madak_{f16, f32}.
3584 if (Src2->isReg() && Src2->getReg() == Reg) {
3585 if (ST.getConstantBusLimit(Opc) < 2) {
3586 // Not allowed to use constant bus for another operand.
3587 // We can however allow an inline immediate as src0.
3588 bool Src0Inlined = false;
3589 if (Src0->isReg()) {
3590 // Try to inline constant if possible.
3591 // If the def is a move immediate and this is its only use,
3592 // folding it here saves a VGPR.
3593 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3594 if (Def && Def->isMoveImmediate() &&
3595 isInlineConstant(Def->getOperand(1)) &&
3596 MRI->hasOneUse(Src0->getReg())) {
3597 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3598 Src0Inlined = true;
3599 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3600 RI.isSGPRReg(*MRI, Src0->getReg())) {
3601 return false;
3602 }
3603 // VGPR is okay as Src0 - fallthrough
3604 }
3605
3606 if (Src1->isReg() && !Src0Inlined) {
3607 // We have one slot for inlinable constant so far - try to fill it
3608 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3609 if (Def && Def->isMoveImmediate() &&
3610 isInlineConstant(Def->getOperand(1)) &&
3611 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3612 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3613 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3614 return false;
3615 // VGPR is okay as Src1 - fallthrough
3616 }
3617 }
3618
3619 unsigned NewOpc =
3620 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3621 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3622 : AMDGPU::V_FMAAK_F16)
3623 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3624 if (pseudoToMCOpcode(NewOpc) == -1)
3625 return false;
3626
3627 // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3628 // would also require restricting their register classes. For now
3629 // just bail out.
3630 if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3631 return false;
3632
3633 // FIXME: This would be a lot easier if we could return a new instruction
3634 // instead of having to modify in place.
3635
3636 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3637 Opc == AMDGPU::V_FMAC_F32_e64 ||
3638 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3639 UseMI.untieRegOperand(
3640 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3641
3642 // ChangeToImmediate adds Src2 back to the instruction.
3643 Src2->ChangeToImmediate(getImmFor(*Src2));
3644
3645 // These come before src2.
3646 removeModOperands(UseMI);
3647 UseMI.setDesc(get(NewOpc));
3648 // It might happen that UseMI was commuted
3649 // and we now have an SGPR as src1. If so, an inline
3650 // constant together with an SGPR would be illegal.
3651 legalizeOperands(UseMI);
3652
3653 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3654 if (DeleteDef)
3655 DefMI.eraseFromParent();
3656
3657 return true;
3658 }
3659 }
3660
3661 return false;
3662}
3663
3664static bool
3665memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3666 ArrayRef<const MachineOperand *> BaseOps2) {
3667 if (BaseOps1.size() != BaseOps2.size())
3668 return false;
3669 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3670 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3671 return false;
3672 }
3673 return true;
3674}
3675
3676static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3677 LocationSize WidthB, int OffsetB) {
3678 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3679 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3680 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3681 return LowWidth.hasValue() &&
3682 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3683}
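// Worked example (values are illustrative): WidthA = 4 at OffsetA = 0 and
// WidthB = 4 at OffsetB = 4 give LowOffset + LowWidth = 0 + 4 <= HighOffset
// = 4, so the ranges are disjoint; with OffsetB = 2 the check fails and the
// two accesses are treated as possibly overlapping.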
3684
3685bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3686 const MachineInstr &MIb) const {
3687 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3688 int64_t Offset0, Offset1;
3689 LocationSize Dummy0 = 0, Dummy1 = 0;
3690 bool Offset0IsScalable, Offset1IsScalable;
3691 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3692 Dummy0, &RI) ||
3693 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3694 Dummy1, &RI))
3695 return false;
3696
3697 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3698 return false;
3699
3700 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3701 // FIXME: Handle ds_read2 / ds_write2.
3702 return false;
3703 }
3704 LocationSize Width0 = MIa.memoperands().front()->getSize();
3705 LocationSize Width1 = MIb.memoperands().front()->getSize();
3706 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3707}
3708
3709bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3710 const MachineInstr &MIb) const {
3711 assert(MIa.mayLoadOrStore() &&
3712 "MIa must load from or modify a memory location");
3713 assert(MIb.mayLoadOrStore() &&
3714 "MIb must load from or modify a memory location");
3715
3716 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3717 return false;
3718
3719 // XXX - Can we relax this between address spaces?
3720 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3721 return false;
3722
3723 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3724 return false;
3725
3726 // TODO: Should we check the address space from the MachineMemOperand? That
3727 // would allow us to distinguish objects we know don't alias based on the
3728 // underlying address space, even if it was lowered to a different one,
3729 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3730 // buffer.
3731 if (isDS(MIa)) {
3732 if (isDS(MIb))
3733 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3734
3735 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3736 }
3737
3738 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3739 if (isMUBUF(MIb) || isMTBUF(MIb))
3740 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3741
3742 if (isFLAT(MIb))
3743 return isFLATScratch(MIb);
3744
3745 return !isSMRD(MIb);
3746 }
3747
3748 if (isSMRD(MIa)) {
3749 if (isSMRD(MIb))
3750 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3751
3752 if (isFLAT(MIb))
3753 return isFLATScratch(MIb);
3754
3755 return !isMUBUF(MIb) && !isMTBUF(MIb);
3756 }
3757
3758 if (isFLAT(MIa)) {
3759 if (isFLAT(MIb)) {
3760 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3761 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3762 return true;
3763
3764 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3765 }
3766
3767 return false;
3768 }
3769
3770 return false;
3771}
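// Rough summary (annotation) of the pairwise rules above: LDS (DS) accesses
// cannot alias non-LDS memory, buffer (MUBUF/MTBUF) and scalar (SMRD)
// accesses cannot alias LDS or scratch, same-kind accesses fall back to the
// base-and-offset overlap check, and generic FLAT is conservatively assumed
// to possibly alias everything except the global/scratch split.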
3772
3773static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3774 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3775 if (Reg.isPhysical())
3776 return false;
3777 auto *Def = MRI.getUniqueVRegDef(Reg);
3778 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3779 Imm = Def->getOperand(1).getImm();
3780 if (DefMI)
3781 *DefMI = Def;
3782 return true;
3783 }
3784 return false;
3785}
3786
3787static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3788 MachineInstr **DefMI = nullptr) {
3789 if (!MO->isReg())
3790 return false;
3791 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3792 const MachineRegisterInfo &MRI = MF->getRegInfo();
3793 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3794}
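// Annotation: both getFoldableImm overloads check whether an operand is a
// virtual register whose unique definition is a foldable move-immediate. On
// success the constant is returned in Imm and, when requested, the defining
// instruction in *DefMI so the caller can erase it once the use is folded.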
3795
3796static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3797 MachineInstr &NewMI) {
3798 if (LV) {
3799 unsigned NumOps = MI.getNumOperands();
3800 for (unsigned I = 1; I < NumOps; ++I) {
3801 MachineOperand &Op = MI.getOperand(I);
3802 if (Op.isReg() && Op.isKill())
3803 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3804 }
3805 }
3806}
3807
3808MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3809 LiveVariables *LV,
3810 LiveIntervals *LIS) const {
3811 MachineBasicBlock &MBB = *MI.getParent();
3812 unsigned Opc = MI.getOpcode();
3813
3814 // Handle MFMA.
3815 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3816 if (NewMFMAOpc != -1) {
3817 MachineInstrBuilder MIB =
3818 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3819 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3820 MIB.add(MI.getOperand(I));
3821 updateLiveVariables(LV, MI, *MIB);
3822 if (LIS) {
3823 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3824 // SlotIndex of defs needs to be updated when converting to early-clobber
3825 MachineOperand &Def = MIB->getOperand(0);
3826 if (Def.isEarlyClobber() && Def.isReg() &&
3827 LIS->hasInterval(Def.getReg())) {
3828 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3829 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3830 auto &LI = LIS->getInterval(Def.getReg());
3831 auto UpdateDefIndex = [&](LiveRange &LR) {
3832 auto *S = LR.find(OldIndex);
3833 if (S != LR.end() && S->start == OldIndex) {
3834 assert(S->valno && S->valno->def == OldIndex);
3835 S->start = NewIndex;
3836 S->valno->def = NewIndex;
3837 }
3838 };
3839 UpdateDefIndex(LI);
3840 for (auto &SR : LI.subranges())
3841 UpdateDefIndex(SR);
3842 }
3843 }
3844 return MIB;
3845 }
3846
3847 if (SIInstrInfo::isWMMA(MI)) {
3848 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3849 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3850 .setMIFlags(MI.getFlags());
3851 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3852 MIB->addOperand(MI.getOperand(I));
3853
3854 updateLiveVariables(LV, MI, *MIB);
3855 if (LIS)
3856 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3857
3858 return MIB;
3859 }
3860
3861 assert(
3862 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3863 "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3864 "pre-RA");
3865
3866 // Handle MAC/FMAC.
3867 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3868 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3869 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3870 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3871 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3872 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3873 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3874 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3875 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3876 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3877 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3878 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3879 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3880 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3881 bool Src0Literal = false;
3882
3883 switch (Opc) {
3884 default:
3885 return nullptr;
3886 case AMDGPU::V_MAC_F16_e64:
3887 case AMDGPU::V_FMAC_F16_e64:
3888 case AMDGPU::V_FMAC_F16_fake16_e64:
3889 case AMDGPU::V_MAC_F32_e64:
3890 case AMDGPU::V_MAC_LEGACY_F32_e64:
3891 case AMDGPU::V_FMAC_F32_e64:
3892 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3893 case AMDGPU::V_FMAC_F64_e64:
3894 break;
3895 case AMDGPU::V_MAC_F16_e32:
3896 case AMDGPU::V_FMAC_F16_e32:
3897 case AMDGPU::V_MAC_F32_e32:
3898 case AMDGPU::V_MAC_LEGACY_F32_e32:
3899 case AMDGPU::V_FMAC_F32_e32:
3900 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3901 case AMDGPU::V_FMAC_F64_e32: {
3902 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3903 AMDGPU::OpName::src0);
3904 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3905 if (!Src0->isReg() && !Src0->isImm())
3906 return nullptr;
3907
3908 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3909 Src0Literal = true;
3910
3911 break;
3912 }
3913 }
3914
3915 MachineInstrBuilder MIB;
3916 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3917 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3918 const MachineOperand *Src0Mods =
3919 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3920 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3921 const MachineOperand *Src1Mods =
3922 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3923 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3924 const MachineOperand *Src2Mods =
3925 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3926 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3927 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3928 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3929
3930 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3931 !IsLegacy &&
3932 // If we have an SGPR input, we will violate the constant bus restriction.
3933 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3934 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3935 MachineInstr *DefMI;
3936 const auto killDef = [&]() -> void {
3937 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3938 // The only user is the instruction which will be killed.
3939 Register DefReg = DefMI->getOperand(0).getReg();
3940
3941 if (MRI.hasOneNonDBGUse(DefReg)) {
3942 // We cannot just remove the DefMI here, calling pass will crash.
3943 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3944 DefMI->getOperand(0).setIsDead(true);
3945 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3946 DefMI->removeOperand(I);
3947 if (LV)
3948 LV->getVarInfo(DefReg).AliveBlocks.clear();
3949 }
3950
3951 if (LIS) {
3952 LiveInterval &DefLI = LIS->getInterval(DefReg);
3953
3954 // We cannot delete the original instruction here, so hack out the use
3955 // in the original instruction with a dummy register so we can use
3956 // shrinkToUses to deal with any multi-use edge cases. Other targets do
3957 // not have the complexity of deleting a use to consider here.
3958 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
3959 for (MachineOperand &MIOp : MI.uses()) {
3960 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
3961 MIOp.setIsUndef(true);
3962 MIOp.setReg(DummyReg);
3963 }
3964 }
3965
3966 LIS->shrinkToUses(&DefLI);
3967 }
3968 };
3969
3970 int64_t Imm;
3971 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3972 unsigned NewOpc =
3973 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3974 : AMDGPU::V_FMAAK_F16)
3975 : AMDGPU::V_FMAAK_F32)
3976 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3977 if (pseudoToMCOpcode(NewOpc) != -1) {
3978 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3979 .add(*Dst)
3980 .add(*Src0)
3981 .add(*Src1)
3982 .addImm(Imm)
3983 .setMIFlags(MI.getFlags());
3984 updateLiveVariables(LV, MI, *MIB);
3985 if (LIS)
3986 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3987 killDef();
3988 return MIB;
3989 }
3990 }
3991 unsigned NewOpc =
3992 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3993 : AMDGPU::V_FMAMK_F16)
3994 : AMDGPU::V_FMAMK_F32)
3995 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3996 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3997 if (pseudoToMCOpcode(NewOpc) != -1) {
3998 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3999 .add(*Dst)
4000 .add(*Src0)
4001 .addImm(Imm)
4002 .add(*Src2)
4003 .setMIFlags(MI.getFlags());
4004 updateLiveVariables(LV, MI, *MIB);
4005
4006 if (LIS)
4007 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4008 killDef();
4009 return MIB;
4010 }
4011 }
4012 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4013 if (Src0Literal) {
4014 Imm = Src0->getImm();
4015 DefMI = nullptr;
4016 }
4017 if (pseudoToMCOpcode(NewOpc) != -1 &&
4018 isOperandLegal(
4019 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4020 Src1)) {
4021 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4022 .add(*Dst)
4023 .add(*Src1)
4024 .addImm(Imm)
4025 .add(*Src2)
4026 .setMIFlags(MI.getFlags());
4027 updateLiveVariables(LV, MI, *MIB);
4028
4029 if (LIS)
4030 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4031 if (DefMI)
4032 killDef();
4033 return MIB;
4034 }
4035 }
4036 }
4037
4038 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4039 // if VOP3 does not allow a literal operand.
4040 if (Src0Literal && !ST.hasVOP3Literal())
4041 return nullptr;
4042
4043 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4044 : IsF64 ? AMDGPU::V_FMA_F64_e64
4045 : IsLegacy
4046 ? AMDGPU::V_FMA_LEGACY_F32_e64
4047 : AMDGPU::V_FMA_F32_e64
4048 : IsF16 ? AMDGPU::V_MAD_F16_e64
4049 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4050 : AMDGPU::V_MAD_F32_e64;
4051 if (pseudoToMCOpcode(NewOpc) == -1)
4052 return nullptr;
4053
4054 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4055 .add(*Dst)
4056 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4057 .add(*Src0)
4058 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4059 .add(*Src1)
4060 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4061 .add(*Src2)
4062 .addImm(Clamp ? Clamp->getImm() : 0)
4063 .addImm(Omod ? Omod->getImm() : 0)
4064 .setMIFlags(MI.getFlags());
4065 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4066 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4067 updateLiveVariables(LV, MI, *MIB);
4068 if (LIS)
4069 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4070 return MIB;
4071}
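// Illustrative effect of the conversion above (operands are arbitrary):
//   v_mac_f32_e32 v0, v1, v2        ; VOP2, v0 implicitly tied as src2
// becomes
//   v_mad_f32_e64 v0, v1, v2, v0    ; VOP3, addend is an explicit operand
// or, when an operand comes from a foldable move-immediate, one of the
// v_madak/v_madmk (v_fmaak/v_fmamk) forms built earlier in this function.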
4072
4073// It's not generally safe to move VALU instructions across these since it will
4074// start using the register as a base index rather than directly.
4075// XXX - Why isn't hasSideEffects sufficient for these?
4076static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4077 switch (MI.getOpcode()) {
4078 case AMDGPU::S_SET_GPR_IDX_ON:
4079 case AMDGPU::S_SET_GPR_IDX_MODE:
4080 case AMDGPU::S_SET_GPR_IDX_OFF:
4081 return true;
4082 default:
4083 return false;
4084 }
4085}
4086
4087bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4088 const MachineBasicBlock *MBB,
4089 const MachineFunction &MF) const {
4090 // Skipping the check for SP writes in the base implementation. The reason it
4091 // was added was apparently due to compile time concerns.
4092 //
4093 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4094 // but is probably avoidable.
4095
4096 // Copied from base implementation.
4097 // Terminators and labels can't be scheduled around.
4098 if (MI.isTerminator() || MI.isPosition())
4099 return true;
4100
4101 // INLINEASM_BR can jump to another block
4102 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4103 return true;
4104
4105 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4106 return true;
4107
4108 // Target-independent instructions do not have an implicit-use of EXEC, even
4109 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4110 // boundaries prevents incorrect movements of such instructions.
4111 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4112 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4113 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4114 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4115 changesVGPRIndexingMode(MI);
4116}
4117
4118bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4119 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4120}
4121
4122bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4123 // Skip the full operand and register alias search modifiesRegister
4124 // does. There's only a handful of instructions that touch this, it's only an
4125 // implicit def, and doesn't alias any other registers.
4126 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4127}
4128
4129bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4130 unsigned Opcode = MI.getOpcode();
4131
4132 if (MI.mayStore() && isSMRD(MI))
4133 return true; // scalar store or atomic
4134
4135 // This will terminate the function when other lanes may need to continue.
4136 if (MI.isReturn())
4137 return true;
4138
4139 // These instructions cause shader I/O that may cause hardware lockups
4140 // when executed with an empty EXEC mask.
4141 //
4142 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4143 // EXEC = 0, but checking for that case here seems not worth it
4144 // given the typical code patterns.
4145 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4146 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4147 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4148 return true;
4149
4150 if (MI.isCall() || MI.isInlineAsm())
4151 return true; // conservative assumption
4152
4153 // Assume that barrier interactions are only intended with active lanes.
4154 if (isBarrier(Opcode))
4155 return true;
4156
4157 // A mode change is a scalar operation that influences vector instructions.
4158 if (modifiesModeRegister(MI))
4159 return true;
4160
4161 // These are like SALU instructions in terms of effects, so it's questionable
4162 // whether we should return true for those.
4163 //
4164 // However, executing them with EXEC = 0 causes them to operate on undefined
4165 // data, which we avoid by returning true here.
4166 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4167 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4168 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4169 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4170 return true;
4171
4172 return false;
4173}
4174
4175bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4176 const MachineInstr &MI) const {
4177 if (MI.isMetaInstruction())
4178 return false;
4179
4180 // This won't read exec if this is an SGPR->SGPR copy.
4181 if (MI.isCopyLike()) {
4182 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4183 return true;
4184
4185 // Make sure this isn't copying exec as a normal operand
4186 return MI.readsRegister(AMDGPU::EXEC, &RI);
4187 }
4188
4189 // Make a conservative assumption about the callee.
4190 if (MI.isCall())
4191 return true;
4192
4193 // Be conservative with any unhandled generic opcodes.
4194 if (!isTargetSpecificOpcode(MI.getOpcode()))
4195 return true;
4196
4197 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4198}
4199
4200bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4201 switch (Imm.getBitWidth()) {
4202 case 1: // This likely will be a condition code mask.
4203 return true;
4204
4205 case 32:
4206 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4207 ST.hasInv2PiInlineImm());
4208 case 64:
4209 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4210 ST.hasInv2PiInlineImm());
4211 case 16:
4212 return ST.has16BitInsts() &&
4213 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4214 ST.hasInv2PiInlineImm());
4215 default:
4216 llvm_unreachable("invalid bitwidth");
4217 }
4218}
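// Example values (illustrative): a 32-bit APInt holding the bit pattern of
// 1.0f (0x3f800000) is an inline constant, while the bit pattern of pi
// (0x40490fdb) is not and would have to be emitted as a literal.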
4219
4220bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4221 APInt IntImm = Imm.bitcastToAPInt();
4222 int64_t IntImmVal = IntImm.getSExtValue();
4223 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4224 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4225 default:
4226 llvm_unreachable("invalid fltSemantics");
4227 case APFloat::S_IEEEsingle:
4228 case APFloat::S_IEEEdouble:
4229 return isInlineConstant(IntImm);
4230 case APFloat::S_BFloat:
4231 return ST.has16BitInsts() &&
4232 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4233 case APFloat::S_IEEEhalf:
4234 return ST.has16BitInsts() &&
4235 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4236 }
4237}
4238
4239bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4240 uint8_t OperandType) const {
4241 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4242 if (!MO.isImm())
4243 return false;
4244
4245 // MachineOperand provides no way to tell the true operand size, since it only
4246 // records a 64-bit value. We need to know the size to determine if a 32-bit
4247 // floating point immediate bit pattern is legal for an integer immediate. It
4248 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4249
4250 int64_t Imm = MO.getImm();
4251 switch (OperandType) {
4252 case AMDGPU::OPERAND_REG_IMM_INT32:
4253 case AMDGPU::OPERAND_REG_IMM_FP32:
4254 case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
4255 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
4256 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
4257 case AMDGPU::OPERAND_REG_IMM_V2FP32:
4258 case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
4259 case AMDGPU::OPERAND_REG_IMM_V2INT32:
4260 case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
4261 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
4262 case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
4263 case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
4264 int32_t Trunc = static_cast<int32_t>(Imm);
4265 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4266 }
4267 case AMDGPU::OPERAND_REG_IMM_INT64:
4268 case AMDGPU::OPERAND_REG_IMM_FP64:
4269 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
4270 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
4271 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
4272 return AMDGPU::isInlinableLiteral64(MO.getImm(),
4273 ST.hasInv2PiInlineImm());
4274 case AMDGPU::OPERAND_REG_IMM_INT16:
4275 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
4276 case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
4277 // We would expect inline immediates to not be concerned with an integer/fp
4278 // distinction. However, in the case of 16-bit integer operations, the
4279 // "floating point" values appear to not work. It seems read the low 16-bits
4280 // of 32-bit immediates, which happens to always work for the integer
4281 // values.
4282 //
4283 // See llvm bugzilla 46302.
4284 //
4285 // TODO: Theoretically we could use op-sel to use the high bits of the
4286 // 32-bit FP values.
4287 return AMDGPU::isInlinableIntLiteral(Imm);
4288 case AMDGPU::OPERAND_REG_IMM_V2INT16:
4289 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
4290 case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
4291 return AMDGPU::isInlinableLiteralV2I16(Imm);
4292 case AMDGPU::OPERAND_REG_IMM_V2FP16:
4293 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4294 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
4295 return AMDGPU::isInlinableLiteralV2F16(Imm);
4296 case AMDGPU::OPERAND_REG_IMM_V2BF16:
4297 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
4298 case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
4299 return AMDGPU::isInlinableLiteralV2BF16(Imm);
4300 case AMDGPU::OPERAND_REG_IMM_FP16:
4301 case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
4302 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
4303 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
4304 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4305 // A few special case instructions have 16-bit operands on subtargets
4306 // where 16-bit instructions are not legal.
4307 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4308 // constants in these cases
4309 int16_t Trunc = static_cast<int16_t>(Imm);
4310 return ST.has16BitInsts() &&
4311 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4312 }
4313
4314 return false;
4315 }
4316 case AMDGPU::OPERAND_REG_IMM_BF16:
4317 case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
4318 case AMDGPU::OPERAND_REG_INLINE_C_BF16:
4319 case AMDGPU::OPERAND_REG_INLINE_AC_BF16: {
4320 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4321 int16_t Trunc = static_cast<int16_t>(Imm);
4322 return ST.has16BitInsts() &&
4323 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4324 }
4325 return false;
4326 }
4327 case AMDGPU::OPERAND_KIMM32:
4328 case AMDGPU::OPERAND_KIMM16:
4329 return false;
4330 case AMDGPU::OPERAND_INPUT_MODS:
4331 case MCOI::OPERAND_IMMEDIATE:
4332 // Always embedded in the instruction for free.
4333 return true;
4334 case MCOI::OPERAND_UNKNOWN:
4335 case MCOI::OPERAND_REGISTER:
4336 case MCOI::OPERAND_PCREL:
4337 case MCOI::OPERAND_GENERIC_0:
4338 case MCOI::OPERAND_GENERIC_1:
4339 case MCOI::OPERAND_GENERIC_2:
4340 case MCOI::OPERAND_GENERIC_3:
4341 case MCOI::OPERAND_GENERIC_4:
4342 case MCOI::OPERAND_GENERIC_5:
4343 // Just ignore anything else.
4344 return true;
4345 default:
4346 llvm_unreachable("invalid operand type");
4347 }
4348}
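// Example values (illustrative): 0x3C00 is inline for a 16-bit FP operand
// type (it is 1.0 in half precision), but checked against a 32-bit operand
// type the same value is just the integer 15360 and is not inlinable.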
4349
4350static bool compareMachineOp(const MachineOperand &Op0,
4351 const MachineOperand &Op1) {
4352 if (Op0.getType() != Op1.getType())
4353 return false;
4354
4355 switch (Op0.getType()) {
4356 case MachineOperand::MO_Register:
4357 return Op0.getReg() == Op1.getReg();
4358 case MachineOperand::MO_Immediate:
4359 return Op0.getImm() == Op1.getImm();
4360 default:
4361 llvm_unreachable("Didn't expect to be comparing these operand types");
4362 }
4363}
4364
4365bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4366 const MachineOperand &MO) const {
4367 const MCInstrDesc &InstDesc = MI.getDesc();
4368 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4369
4370 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4371
4372 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4373 return true;
4374
4375 if (OpInfo.RegClass < 0)
4376 return false;
4377
4378 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4379 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4380 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4381 AMDGPU::OpName::src2))
4382 return false;
4383 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4384 }
4385
4386 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4387 return false;
4388
4389 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4390 return true;
4391
4392 return ST.hasVOP3Literal();
4393}
4394
4395bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4396 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4397 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4398 return false;
4399
4400 int Op32 = AMDGPU::getVOPe32(Opcode);
4401 if (Op32 == -1)
4402 return false;
4403
4404 return pseudoToMCOpcode(Op32) != -1;
4405}
4406
4407bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4408 // The src0_modifier operand is present on all instructions
4409 // that have modifiers.
4410
4411 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4412}
4413
4414bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4415 unsigned OpName) const {
4416 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4417 return Mods && Mods->getImm();
4418}
4419
4420bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4421 return any_of(ModifierOpNames,
4422 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4423}
4424
4425bool SIInstrInfo::canShrink(const MachineInstr &MI,
4426 const MachineRegisterInfo &MRI) const {
4427 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4428 // Can't shrink instruction with three operands.
4429 if (Src2) {
4430 switch (MI.getOpcode()) {
4431 default: return false;
4432
4433 case AMDGPU::V_ADDC_U32_e64:
4434 case AMDGPU::V_SUBB_U32_e64:
4435 case AMDGPU::V_SUBBREV_U32_e64: {
4436 const MachineOperand *Src1
4437 = getNamedOperand(MI, AMDGPU::OpName::src1);
4438 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4439 return false;
4440 // Additional verification is needed for sdst/src2.
4441 return true;
4442 }
4443 case AMDGPU::V_MAC_F16_e64:
4444 case AMDGPU::V_MAC_F32_e64:
4445 case AMDGPU::V_MAC_LEGACY_F32_e64:
4446 case AMDGPU::V_FMAC_F16_e64:
4447 case AMDGPU::V_FMAC_F16_fake16_e64:
4448 case AMDGPU::V_FMAC_F32_e64:
4449 case AMDGPU::V_FMAC_F64_e64:
4450 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4451 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4452 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4453 return false;
4454 break;
4455
4456 case AMDGPU::V_CNDMASK_B32_e64:
4457 break;
4458 }
4459 }
4460
4461 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4462 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4463 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4464 return false;
4465
4466 // We don't need to check src0, all input types are legal, so just make sure
4467 // src0 isn't using any modifiers.
4468 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4469 return false;
4470
4471 // Can it be shrunk to a valid 32 bit opcode?
4472 if (!hasVALU32BitEncoding(MI.getOpcode()))
4473 return false;
4474
4475 // Check output modifiers
4476 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4477 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4478 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4479 // TODO: Can we avoid checking bound_ctrl/fi here?
4480 // They are only used by permlane*_swap special case.
4481 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4482 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4483}
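// Illustrative example: v_add_f32_e64 v0, v1, v2 with no modifiers can be
// shrunk to the VOP2 form v_add_f32_e32 v0, v1, v2, whereas a variant using
// source modifiers such as -v1 or |v2|, or the clamp/omod bits, must stay in
// the VOP3 encoding.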
4484
4485// Set VCC operand with all flags from \p Orig, except for setting it as
4486// implicit.
4487static void copyFlagsToImplicitVCC(MachineInstr &MI,
4488 const MachineOperand &Orig) {
4489
4490 for (MachineOperand &Use : MI.implicit_operands()) {
4491 if (Use.isUse() &&
4492 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4493 Use.setIsUndef(Orig.isUndef());
4494 Use.setIsKill(Orig.isKill());
4495 return;
4496 }
4497 }
4498}
4499
4500MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4501 unsigned Op32) const {
4502 MachineBasicBlock *MBB = MI.getParent();
4503
4504 const MCInstrDesc &Op32Desc = get(Op32);
4505 MachineInstrBuilder Inst32 =
4506 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4507 .setMIFlags(MI.getFlags());
4508
4509 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4510 // For VOPC instructions, this is replaced by an implicit def of vcc.
4511
4512 // We assume the defs of the shrunk opcode are in the same order, and the
4513 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4514 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4515 Inst32.add(MI.getOperand(I));
4516
4517 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4518
4519 int Idx = MI.getNumExplicitDefs();
4520 for (const MachineOperand &Use : MI.explicit_uses()) {
4521 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4522 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4523 continue;
4524
4525 if (&Use == Src2) {
4526 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4527 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4528 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4529 // of vcc was already added during the initial BuildMI, but we
4530 // 1) may need to change vcc to vcc_lo to preserve the original register
4531 // 2) have to preserve the original flags.
4532 copyFlagsToImplicitVCC(*Inst32, *Src2);
4533 continue;
4534 }
4535 }
4536
4537 Inst32.add(Use);
4538 }
4539
4540 // FIXME: Losing implicit operands
4541 fixImplicitOperands(*Inst32);
4542 return Inst32;
4543}
4544
4545bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4546 const MachineOperand &MO,
4547 const MCOperandInfo &OpInfo) const {
4548 // Literal constants use the constant bus.
4549 if (!MO.isReg())
4550 return !isInlineConstant(MO, OpInfo);
4551
4552 if (!MO.isUse())
4553 return false;
4554
4555 if (MO.getReg().isVirtual())
4556 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4557
4558 // Null is free
4559 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4560 return false;
4561
4562 // SGPRs use the constant bus
4563 if (MO.isImplicit()) {
4564 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4565 MO.getReg() == AMDGPU::VCC_LO;
4566 }
4567 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4568 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4569}
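// Illustrative example: in v_add_f32_e32 v0, s2, v1 the SGPR s2 consumes the
// single constant bus slot available to most VALU encodings, so a second
// SGPR or a non-inline literal in the same instruction would exceed the
// limit reported by ST.getConstantBusLimit().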
4570
4571static Register findImplicitSGPRRead(const MachineInstr &MI) {
4572 for (const MachineOperand &MO : MI.implicit_operands()) {
4573 // We only care about reads.
4574 if (MO.isDef())
4575 continue;
4576
4577 switch (MO.getReg()) {
4578 case AMDGPU::VCC:
4579 case AMDGPU::VCC_LO:
4580 case AMDGPU::VCC_HI:
4581 case AMDGPU::M0:
4582 case AMDGPU::FLAT_SCR:
4583 return MO.getReg();
4584
4585 default:
4586 break;
4587 }
4588 }
4589
4590 return Register();
4591}
4592
4593static bool shouldReadExec(const MachineInstr &MI) {
4594 if (SIInstrInfo::isVALU(MI)) {
4595 switch (MI.getOpcode()) {
4596 case AMDGPU::V_READLANE_B32:
4597 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4598 case AMDGPU::V_WRITELANE_B32:
4599 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4600 return false;
4601 }
4602
4603 return true;
4604 }
4605
4606 if (MI.isPreISelOpcode() ||
4607 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4608 SIInstrInfo::isSALU(MI) ||
4609 SIInstrInfo::isSMRD(MI))
4610 return false;
4611
4612 return true;
4613}
4614
4615static bool isRegOrFI(const MachineOperand &MO) {
4616 return MO.isReg() || MO.isFI();
4617}
4618
4619static bool isSubRegOf(const SIRegisterInfo &TRI,
4620 const MachineOperand &SuperVec,
4621 const MachineOperand &SubReg) {
4622 if (SubReg.getReg().isPhysical())
4623 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4624
4625 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4626 SubReg.getReg() == SuperVec.getReg();
4627}
4628
4629// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4630bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4631 const MachineRegisterInfo &MRI,
4632 StringRef &ErrInfo) const {
4633 Register DstReg = MI.getOperand(0).getReg();
4634 Register SrcReg = MI.getOperand(1).getReg();
4635 // This is a check for copy from vector register to SGPR
4636 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4637 ErrInfo = "illegal copy from vector register to SGPR";
4638 return false;
4639 }
4640 return true;
4641}
4642
4643bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4644 StringRef &ErrInfo) const {