LLVM 18.0.0git
SIInstrInfo.cpp
1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
30#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include "llvm/MC/MCContext.h"
34
35using namespace llvm;
36
37#define DEBUG_TYPE "si-instr-info"
38
39#define GET_INSTRINFO_CTOR_DTOR
40#include "AMDGPUGenInstrInfo.inc"
41
42namespace llvm {
43namespace AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48}
49}
50
51
52// Must be at least 4 to be able to branch over minimum unconditional branch
53// code. This is only for making it possible to write reasonably small tests for
54// long branches.
55static cl::opt<unsigned>
56BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
57 cl::desc("Restrict range of branch instructions (DEBUG)"));
58
60 "amdgpu-fix-16-bit-physreg-copies",
61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
62 cl::init(true),
64
65SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
66 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
67 RI(ST), ST(ST) {
68 SchedModel.init(&ST);
69}
70
71//===----------------------------------------------------------------------===//
72// TargetInstrInfo callbacks
73//===----------------------------------------------------------------------===//
74
75static unsigned getNumOperandsNoGlue(SDNode *Node) {
76 unsigned N = Node->getNumOperands();
77 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
78 --N;
79 return N;
80}
81
82/// Returns true if both nodes have the same value for the given
83/// operand \p Op, or if both nodes do not have this operand.
84static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
85 unsigned Opc0 = N0->getMachineOpcode();
86 unsigned Opc1 = N1->getMachineOpcode();
87
88 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
89 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
90
91 if (Op0Idx == -1 && Op1Idx == -1)
92 return true;
93
94
95 if ((Op0Idx == -1 && Op1Idx != -1) ||
96 (Op1Idx == -1 && Op0Idx != -1))
97 return false;
98
99 // getNamedOperandIdx returns the index for the MachineInstr's operands,
100 // which includes the result as the first operand. We are indexing into the
101 // MachineSDNode's operands, so we need to skip the result operand to get
102 // the real index.
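// For example, with a hypothetical operand layout (vdst, addr, offset), the
// MachineInstr index of 'offset' is 2, while the glue-free MachineSDNode
// operand list has no result and the matching index is 1.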
103 --Op0Idx;
104 --Op1Idx;
105
106 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
107}
108
109bool SIInstrInfo::isReallyTriviallyReMaterializable(
110 const MachineInstr &MI) const {
111 if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
112 // Normally a VALU use of exec would block rematerialization, but it is
113 // OK in this case to have an implicit exec read, as all VALU instructions do.
114 // We really want all of the generic logic for this except for this one case.
115
116 // Another potential implicit use is mode register. The core logic of
117 // the RA will not attempt rematerialization if mode is set anywhere
118 // in the function, otherwise it is safe since mode is not changed.
119
120 // This differs from the generic method, which does not allow
121 // rematerialization if there are virtual register uses. We allow this,
122 // which is why this method includes SOP instructions as well.
123 if (!MI.hasImplicitDef() &&
124 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
125 !MI.mayRaiseFPException())
126 return true;
127 }
128
129 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
130}
131
132// Returns true if the scalar result of a VALU instruction depends on exec.
133static bool resultDependsOnExec(const MachineInstr &MI) {
134 // Ignore comparisons which are only used masked with exec.
135 // This allows some hoisting/sinking of VALU comparisons.
136 if (MI.isCompare()) {
137 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
138 Register DstReg = MI.getOperand(0).getReg();
139 if (!DstReg.isVirtual())
140 return true;
141 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
142 switch (Use.getOpcode()) {
143 case AMDGPU::S_AND_SAVEEXEC_B32:
144 case AMDGPU::S_AND_SAVEEXEC_B64:
145 break;
146 case AMDGPU::S_AND_B32:
147 case AMDGPU::S_AND_B64:
148 if (!Use.readsRegister(AMDGPU::EXEC))
149 return true;
150 break;
151 default:
152 return true;
153 }
154 }
155 return false;
156 }
157
158 switch (MI.getOpcode()) {
159 default:
160 break;
161 case AMDGPU::V_READFIRSTLANE_B32:
162 return true;
163 }
164
165 return false;
166}
167
168bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
169 // Any implicit use of exec by VALU is not a real register read.
170 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
171 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
172}
173
174bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
175 int64_t &Offset0,
176 int64_t &Offset1) const {
177 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
178 return false;
179
180 unsigned Opc0 = Load0->getMachineOpcode();
181 unsigned Opc1 = Load1->getMachineOpcode();
182
183 // Make sure both are actually loads.
184 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
185 return false;
186
187 if (isDS(Opc0) && isDS(Opc1)) {
188
189 // FIXME: Handle this case:
190 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
191 return false;
192
193 // Check base reg.
194 if (Load0->getOperand(0) != Load1->getOperand(0))
195 return false;
196
197 // Skip read2 / write2 variants for simplicity.
198 // TODO: We should report true if the used offsets are adjacent (excluding
199 // st64 versions).
200 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
201 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
202 if (Offset0Idx == -1 || Offset1Idx == -1)
203 return false;
204
205 // XXX - be careful of dataless loads
206 // getNamedOperandIdx returns the index for MachineInstrs. Since they
207 // include the output in the operand list, but SDNodes don't, we need to
208 // subtract the index by one.
209 Offset0Idx -= get(Opc0).NumDefs;
210 Offset1Idx -= get(Opc1).NumDefs;
211 Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
212 Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
213 return true;
214 }
215
216 if (isSMRD(Opc0) && isSMRD(Opc1)) {
217 // Skip time and cache invalidation instructions.
218 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
219 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
220 return false;
221
222 unsigned NumOps = getNumOperandsNoGlue(Load0);
223 if (NumOps != getNumOperandsNoGlue(Load1))
224 return false;
225
226 // Check base reg.
227 if (Load0->getOperand(0) != Load1->getOperand(0))
228 return false;
229
230 // Match register offsets, if both register and immediate offsets present.
231 assert(NumOps == 4 || NumOps == 5);
232 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
233 return false;
234
235 const ConstantSDNode *Load0Offset =
236 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
237 const ConstantSDNode *Load1Offset =
238 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
239
240 if (!Load0Offset || !Load1Offset)
241 return false;
242
243 Offset0 = Load0Offset->getZExtValue();
244 Offset1 = Load1Offset->getZExtValue();
245 return true;
246 }
247
248 // MUBUF and MTBUF can access the same addresses.
249 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
250
251 // MUBUF and MTBUF have vaddr at different indices.
252 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
253 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
254 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
255 return false;
256
257 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
258 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
259
260 if (OffIdx0 == -1 || OffIdx1 == -1)
261 return false;
262
263 // getNamedOperandIdx returns the index for MachineInstrs. Since they
264 // include the output in the operand list, but SDNodes don't, we need to
265 // subtract the index by one.
266 OffIdx0 -= get(Opc0).NumDefs;
267 OffIdx1 -= get(Opc1).NumDefs;
268
269 SDValue Off0 = Load0->getOperand(OffIdx0);
270 SDValue Off1 = Load1->getOperand(OffIdx1);
271
272 // The offset might be a FrameIndexSDNode.
273 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
274 return false;
275
276 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
277 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
278 return true;
279 }
280
281 return false;
282}
283
284static bool isStride64(unsigned Opc) {
285 switch (Opc) {
286 case AMDGPU::DS_READ2ST64_B32:
287 case AMDGPU::DS_READ2ST64_B64:
288 case AMDGPU::DS_WRITE2ST64_B32:
289 case AMDGPU::DS_WRITE2ST64_B64:
290 return true;
291 default:
292 return false;
293 }
294}
295
296bool SIInstrInfo::getMemOperandsWithOffsetWidth(
297 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
298 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
299 const TargetRegisterInfo *TRI) const {
300 if (!LdSt.mayLoadOrStore())
301 return false;
302
303 unsigned Opc = LdSt.getOpcode();
304 OffsetIsScalable = false;
305 const MachineOperand *BaseOp, *OffsetOp;
306 int DataOpIdx;
307
308 if (isDS(LdSt)) {
309 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
310 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
311 if (OffsetOp) {
312 // Normal, single offset LDS instruction.
313 if (!BaseOp) {
314 // DS_CONSUME/DS_APPEND use M0 for the base address.
315 // TODO: find the implicit use operand for M0 and use that as BaseOp?
316 return false;
317 }
318 BaseOps.push_back(BaseOp);
319 Offset = OffsetOp->getImm();
320 // Get appropriate operand, and compute width accordingly.
321 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
322 if (DataOpIdx == -1)
323 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
324 Width = getOpSize(LdSt, DataOpIdx);
325 } else {
326 // The 2 offset instructions use offset0 and offset1 instead. We can treat
327 // these as a load with a single offset if the 2 offsets are consecutive.
328 // We will use this for some partially aligned loads.
329 const MachineOperand *Offset0Op =
330 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
331 const MachineOperand *Offset1Op =
332 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
333
334 unsigned Offset0 = Offset0Op->getImm() & 0xff;
335 unsigned Offset1 = Offset1Op->getImm() & 0xff;
336 if (Offset0 + 1 != Offset1)
337 return false;
338
339 // Each of these offsets is in element sized units, so we need to convert
340 // to bytes of the individual reads.
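// For example (assumed values): a two-offset read with offset0 = 4,
// offset1 = 5 and 4-byte elements is reported as a single access at
// Offset = 4 * 4 = 16 bytes whose Width covers both elements.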
341
342 unsigned EltSize;
343 if (LdSt.mayLoad())
344 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
345 else {
346 assert(LdSt.mayStore());
347 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
348 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
349 }
350
351 if (isStride64(Opc))
352 EltSize *= 64;
353
354 BaseOps.push_back(BaseOp);
355 Offset = EltSize * Offset0;
356 // Get appropriate operand(s), and compute width accordingly.
357 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
358 if (DataOpIdx == -1) {
359 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
360 Width = getOpSize(LdSt, DataOpIdx);
361 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
362 Width += getOpSize(LdSt, DataOpIdx);
363 } else {
364 Width = getOpSize(LdSt, DataOpIdx);
365 }
366 }
367 return true;
368 }
369
370 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
371 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
372 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
373 return false;
374 BaseOps.push_back(RSrc);
375 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
376 if (BaseOp && !BaseOp->isFI())
377 BaseOps.push_back(BaseOp);
378 const MachineOperand *OffsetImm =
379 getNamedOperand(LdSt, AMDGPU::OpName::offset);
380 Offset = OffsetImm->getImm();
381 const MachineOperand *SOffset =
382 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
383 if (SOffset) {
384 if (SOffset->isReg())
385 BaseOps.push_back(SOffset);
386 else
387 Offset += SOffset->getImm();
388 }
389 // Get appropriate operand, and compute width accordingly.
390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
391 if (DataOpIdx == -1)
392 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
393 if (DataOpIdx == -1) // LDS DMA
394 return false;
395 Width = getOpSize(LdSt, DataOpIdx);
396 return true;
397 }
398
399 if (isMIMG(LdSt)) {
400 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
401 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
402 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
403 if (VAddr0Idx >= 0) {
404 // GFX10 possible NSA encoding.
405 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
406 BaseOps.push_back(&LdSt.getOperand(I));
407 } else {
408 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
409 }
410 Offset = 0;
411 // Get appropriate operand, and compute width accordingly.
412 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
413 Width = getOpSize(LdSt, DataOpIdx);
414 return true;
415 }
416
417 if (isSMRD(LdSt)) {
418 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
419 if (!BaseOp) // e.g. S_MEMTIME
420 return false;
421 BaseOps.push_back(BaseOp);
422 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
423 Offset = OffsetOp ? OffsetOp->getImm() : 0;
424 // Get appropriate operand, and compute width accordingly.
425 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
426 Width = getOpSize(LdSt, DataOpIdx);
427 return true;
428 }
429
430 if (isFLAT(LdSt)) {
431 // Instructions have either vaddr or saddr or both or none.
432 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
433 if (BaseOp)
434 BaseOps.push_back(BaseOp);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
436 if (BaseOp)
437 BaseOps.push_back(BaseOp);
438 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
439 // Get appropriate operand, and compute width accordingly.
440 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
441 if (DataOpIdx == -1)
442 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
443 if (DataOpIdx == -1) // LDS DMA
444 return false;
445 Width = getOpSize(LdSt, DataOpIdx);
446 return true;
447 }
448
449 return false;
450}
451
452static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
453 ArrayRef<const MachineOperand *> BaseOps1,
454 const MachineInstr &MI2,
455 ArrayRef<const MachineOperand *> BaseOps2) {
456 // Only examine the first "base" operand of each instruction, on the
457 // assumption that it represents the real base address of the memory access.
458 // Other operands are typically offsets or indices from this base address.
459 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
460 return true;
461
462 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
463 return false;
464
465 auto MO1 = *MI1.memoperands_begin();
466 auto MO2 = *MI2.memoperands_begin();
467 if (MO1->getAddrSpace() != MO2->getAddrSpace())
468 return false;
469
470 auto Base1 = MO1->getValue();
471 auto Base2 = MO2->getValue();
472 if (!Base1 || !Base2)
473 return false;
474 Base1 = getUnderlyingObject(Base1);
475 Base2 = getUnderlyingObject(Base2);
476
477 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
478 return false;
479
480 return Base1 == Base2;
481}
482
483bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
484 ArrayRef<const MachineOperand *> BaseOps2,
485 unsigned NumLoads,
486 unsigned NumBytes) const {
487 // If the mem ops (to be clustered) do not have the same base ptr, then they
488 // should not be clustered
489 if (!BaseOps1.empty() && !BaseOps2.empty()) {
490 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
491 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
492 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
493 return false;
494 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
495 // If only one base op is empty, they do not have the same base ptr
496 return false;
497 }
498
499 // To avoid register pressure, the number of DWORDs loaded together by all
500 // clustered mem ops should, on average, not exceed 8. This is an
501 // empirical value based on certain observations and performance related
502 // experiments.
503 // The good thing about this heuristic is that it avoids clustering too many
504 // sub-word loads as well as clustering of wide loads. Below is a brief
505 // summary of how the heuristic behaves for various `LoadSize` values:
506 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
507 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
508 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
509 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
510 // (5) LoadSize >= 17: do not cluster
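// As a worked example (values chosen for illustration): NumLoads = 4 and
// NumBytes = 24 give LoadSize = 6 and NumDWORDs = ((6 + 3) / 4) * 4 = 8,
// which is still within the limit, so the four mem ops may be clustered.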
511 const unsigned LoadSize = NumBytes / NumLoads;
512 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
513 return NumDWORDs <= 8;
514}
515
516// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
517// the first 16 loads will be interleaved with the stores, and the next 16 will
518// be clustered as expected. It should really split into 2 16 store batches.
519//
520// Loads are clustered until this returns false, rather than trying to schedule
521// groups of stores. This also means we have to deal with saying different
522// address space loads should be clustered, and ones which might cause bank
523// conflicts.
524//
525// This might be deprecated so it might not be worth that much effort to fix.
526bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
527 int64_t Offset0, int64_t Offset1,
528 unsigned NumLoads) const {
529 assert(Offset1 > Offset0 &&
530 "Second offset should be larger than first offset!");
531 // If we have fewer than 16 loads in a row, and the offsets are within 64
532 // bytes, then schedule together.
533
534 // A cacheline is 64 bytes (for global memory).
535 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
536}
537
538static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
539 MachineBasicBlock::iterator MI,
540 const DebugLoc &DL, MCRegister DestReg,
541 MCRegister SrcReg, bool KillSrc,
542 const char *Msg = "illegal VGPR to SGPR copy") {
543 MachineFunction *MF = MBB.getParent();
544 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
545 LLVMContext &C = MF->getFunction().getContext();
546 C.diagnose(IllegalCopy);
547
548 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
549 .addReg(SrcReg, getKillRegState(KillSrc));
550}
551
552/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
553/// possible to have a direct copy in these cases on GFX908, so an intermediate
554/// VGPR copy is required.
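/// A minimal sketch of the GFX908 expansion for an SGPR to AGPR copy
/// (register names are illustrative):
///   v_mov_b32_e32        v_tmp, s_src
///   v_accvgpr_write_b32  a_dst, v_tmp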
555static void indirectCopyToAGPR(const SIInstrInfo &TII,
556 MachineBasicBlock &MBB,
557 MachineBasicBlock::iterator MI,
558 const DebugLoc &DL, MCRegister DestReg,
559 MCRegister SrcReg, bool KillSrc,
560 RegScavenger &RS, bool RegsOverlap,
561 Register ImpDefSuperReg = Register(),
562 Register ImpUseSuperReg = Register()) {
563 assert((TII.getSubtarget().hasMAIInsts() &&
564 !TII.getSubtarget().hasGFX90AInsts()) &&
565 "Expected GFX908 subtarget.");
566
567 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
568 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
569 "Source register of the copy should be either an SGPR or an AGPR.");
570
571 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
572 "Destination register of the copy should be an AGPR.");
573
574 const SIRegisterInfo &RI = TII.getRegisterInfo();
575
576 // First try to find defining accvgpr_write to avoid temporary registers.
577 // In the case of copies of overlapping AGPRs, we conservatively do not
578 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
579 // an accvgpr_write used for this same copy due to implicit-defs
580 if (!RegsOverlap) {
581 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
582 --Def;
583
584 if (!Def->modifiesRegister(SrcReg, &RI))
585 continue;
586
587 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
588 Def->getOperand(0).getReg() != SrcReg)
589 break;
590
591 MachineOperand &DefOp = Def->getOperand(1);
592 assert(DefOp.isReg() || DefOp.isImm());
593
594 if (DefOp.isReg()) {
595 bool SafeToPropagate = true;
596 // Check that register source operand is not clobbered before MI.
597 // Immediate operands are always safe to propagate.
598 for (auto I = Def; I != MI && SafeToPropagate; ++I)
599 if (I->modifiesRegister(DefOp.getReg(), &RI))
600 SafeToPropagate = false;
601
602 if (!SafeToPropagate)
603 break;
604
605 DefOp.setIsKill(false);
606 }
607
608 MachineInstrBuilder Builder =
609 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
610 .add(DefOp);
611 if (ImpDefSuperReg)
612 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
613
614 if (ImpUseSuperReg) {
615 Builder.addReg(ImpUseSuperReg,
616 getKillRegState(KillSrc) | RegState::Implicit);
617 }
618
619 return;
620 }
621 }
622
623 RS.enterBasicBlockEnd(MBB);
624 RS.backward(MI);
625
626 // Ideally we want to have three registers for a long reg_sequence copy
627 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
628 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
629 *MBB.getParent());
630
631 // Registers in the sequence are allocated contiguously so we can just
632 // use register number to pick one of three round-robin temps.
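// For example, consecutive destinations a0, a1, a2, a3 would cycle through
// temp slots 0, 1, 2, 0 (illustrative of the modulo-3 choice below).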
633 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
634 Register Tmp =
635 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
637 "VGPR used for an intermediate copy should have been reserved.");
638
639 // Only loop through if there are any free registers left. We don't want to
640 // spill.
641 while (RegNo--) {
642 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
643 /* RestoreAfter */ false, 0,
644 /* AllowSpill */ false);
645 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
646 break;
647 Tmp = Tmp2;
648 RS.setRegUsed(Tmp);
649 }
650
651 // Insert copy to temporary VGPR.
652 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
653 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
654 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
655 } else {
656 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
657 }
658
659 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
660 .addReg(SrcReg, getKillRegState(KillSrc));
661 if (ImpUseSuperReg) {
662 UseBuilder.addReg(ImpUseSuperReg,
663 getKillRegState(KillSrc) | RegState::Implicit);
664 }
665
666 MachineInstrBuilder DefBuilder
667 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
668 .addReg(Tmp, RegState::Kill);
669
670 if (ImpDefSuperReg)
671 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
672}
673
674static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
675 MachineBasicBlock::iterator MI, const DebugLoc &DL,
676 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
677 const TargetRegisterClass *RC, bool Forward) {
678 const SIRegisterInfo &RI = TII.getRegisterInfo();
679 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
680 MachineBasicBlock::iterator I = MI;
681 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
682
683 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
684 int16_t SubIdx = BaseIndices[Idx];
685 Register Reg = RI.getSubReg(DestReg, SubIdx);
686 unsigned Opcode = AMDGPU::S_MOV_B32;
687
688 // Is SGPR aligned? If so try to combine with next.
689 Register Src = RI.getSubReg(SrcReg, SubIdx);
690 bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
691 bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
692 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
693 // Can use SGPR64 copy
694 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
695 SubIdx = RI.getSubRegFromChannel(Channel, 2);
696 Opcode = AMDGPU::S_MOV_B64;
697 Idx++;
698 }
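// For example, an aligned copy such as s[4:7] -> s[0:3] takes the path above
// and is emitted as two s_mov_b64 instead of four s_mov_b32 (illustrative
// registers).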
699
700 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
701 .addReg(RI.getSubReg(SrcReg, SubIdx))
702 .addReg(SrcReg, RegState::Implicit);
703
704 if (!FirstMI)
705 FirstMI = LastMI;
706
707 if (!Forward)
708 I--;
709 }
710
711 assert(FirstMI && LastMI);
712 if (!Forward)
713 std::swap(FirstMI, LastMI);
714
715 FirstMI->addOperand(
716 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
717
718 if (KillSrc)
719 LastMI->addRegisterKilled(SrcReg, &RI);
720}
721
722void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
723 MachineBasicBlock::iterator MI,
724 const DebugLoc &DL, MCRegister DestReg,
725 MCRegister SrcReg, bool KillSrc) const {
726 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
727 unsigned Size = RI.getRegSizeInBits(*RC);
728 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
729 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
730
731 // The rest of copyPhysReg assumes Src and Dst size are the same size.
732 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
733 // we remove Fix16BitCopies and this code block?
734 if (Fix16BitCopies) {
735 if (((Size == 16) != (SrcSize == 16))) {
736 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
737 assert(ST.hasTrue16BitInsts());
738 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
739 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
740 RegToFix = SubReg;
741
742 if (DestReg == SrcReg) {
743 // Identity copy. Insert empty bundle since ExpandPostRA expects an
744 // instruction here.
745 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
746 return;
747 }
748 RC = RI.getPhysRegBaseClass(DestReg);
749 Size = RI.getRegSizeInBits(*RC);
750 SrcRC = RI.getPhysRegBaseClass(SrcReg);
751 SrcSize = RI.getRegSizeInBits(*SrcRC);
752 }
753 }
754
755 if (RC == &AMDGPU::VGPR_32RegClass) {
756 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
757 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
758 AMDGPU::AGPR_32RegClass.contains(SrcReg));
759 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
760 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
761 BuildMI(MBB, MI, DL, get(Opc), DestReg)
762 .addReg(SrcReg, getKillRegState(KillSrc));
763 return;
764 }
765
766 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
767 RC == &AMDGPU::SReg_32RegClass) {
768 if (SrcReg == AMDGPU::SCC) {
769 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
770 .addImm(1)
771 .addImm(0);
772 return;
773 }
774
775 if (DestReg == AMDGPU::VCC_LO) {
776 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
777 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
778 .addReg(SrcReg, getKillRegState(KillSrc));
779 } else {
780 // FIXME: Hack until VReg_1 removed.
781 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
782 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
783 .addImm(0)
784 .addReg(SrcReg, getKillRegState(KillSrc));
785 }
786
787 return;
788 }
789
790 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
791 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
792 return;
793 }
794
795 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
796 .addReg(SrcReg, getKillRegState(KillSrc));
797 return;
798 }
799
800 if (RC == &AMDGPU::SReg_64RegClass) {
801 if (SrcReg == AMDGPU::SCC) {
802 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
803 .addImm(1)
804 .addImm(0);
805 return;
806 }
807
808 if (DestReg == AMDGPU::VCC) {
809 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
810 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
811 .addReg(SrcReg, getKillRegState(KillSrc));
812 } else {
813 // FIXME: Hack until VReg_1 removed.
814 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
815 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
816 .addImm(0)
817 .addReg(SrcReg, getKillRegState(KillSrc));
818 }
819
820 return;
821 }
822
823 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
824 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
825 return;
826 }
827
828 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
829 .addReg(SrcReg, getKillRegState(KillSrc));
830 return;
831 }
832
833 if (DestReg == AMDGPU::SCC) {
834 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
835 // but SelectionDAG emits such copies for i1 sources.
836 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
837 // This copy can only be produced by patterns
838 // with explicit SCC, which are known to be enabled
839 // only for subtargets with S_CMP_LG_U64 present.
840 assert(ST.hasScalarCompareEq64());
841 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
842 .addReg(SrcReg, getKillRegState(KillSrc))
843 .addImm(0);
844 } else {
845 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
846 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
847 .addReg(SrcReg, getKillRegState(KillSrc))
848 .addImm(0);
849 }
850
851 return;
852 }
853
854 if (RC == &AMDGPU::AGPR_32RegClass) {
855 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
856 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
857 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
858 .addReg(SrcReg, getKillRegState(KillSrc));
859 return;
860 }
861
862 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
863 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
864 .addReg(SrcReg, getKillRegState(KillSrc));
865 return;
866 }
867
868 // FIXME: Pass should maintain scavenger to avoid scan through the block on
869 // every AGPR spill.
870 RegScavenger RS;
871 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
872 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
873 return;
874 }
875
876 if (Size == 16) {
877 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
878 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
879 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
880
881 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
882 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
883 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
884 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
885 bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
886 AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
887 AMDGPU::AGPR_LO16RegClass.contains(DestReg);
888 bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
889 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
890 AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
891 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
892 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
893
894 if (IsSGPRDst) {
895 if (!IsSGPRSrc) {
896 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
897 return;
898 }
899
900 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
901 .addReg(NewSrcReg, getKillRegState(KillSrc));
902 return;
903 }
904
905 if (IsAGPRDst || IsAGPRSrc) {
906 if (!DstLow || !SrcLow) {
907 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
908 "Cannot use hi16 subreg with an AGPR!");
909 }
910
911 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
912 return;
913 }
914
915 if (ST.hasTrue16BitInsts()) {
916 if (IsSGPRSrc) {
917 assert(SrcLow);
918 SrcReg = NewSrcReg;
919 }
920 // Use the smaller instruction encoding if possible.
921 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
922 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
923 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
924 .addReg(SrcReg);
925 } else {
926 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
927 .addImm(0) // src0_modifiers
928 .addReg(SrcReg)
929 .addImm(0); // op_sel
930 }
931 return;
932 }
933
934 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
935 if (!DstLow || !SrcLow) {
936 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
937 "Cannot use hi16 subreg on VI!");
938 }
939
940 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
941 .addReg(NewSrcReg, getKillRegState(KillSrc));
942 return;
943 }
944
945 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
946 .addImm(0) // src0_modifiers
947 .addReg(NewSrcReg)
948 .addImm(0) // clamp
949 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
950 : AMDGPU::SDWA::SdwaSel::WORD_1)
951 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
952 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
953 : AMDGPU::SDWA::SdwaSel::WORD_1)
954 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
955 // First implicit operand is $exec.
956 MIB->tieOperands(0, MIB->getNumOperands() - 1);
957 return;
958 }
959
960 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
961 if (ST.hasMovB64()) {
962 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
963 .addReg(SrcReg, getKillRegState(KillSrc));
964 return;
965 }
966 if (ST.hasPkMovB32()) {
967 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
968 .addImm(SISrcMods::OP_SEL_1)
969 .addReg(SrcReg)
970 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
971 .addReg(SrcReg)
972 .addImm(0) // op_sel_lo
973 .addImm(0) // op_sel_hi
974 .addImm(0) // neg_lo
975 .addImm(0) // neg_hi
976 .addImm(0) // clamp
977 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
978 return;
979 }
980 }
981
982 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
983 if (RI.isSGPRClass(RC)) {
984 if (!RI.isSGPRClass(SrcRC)) {
985 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
986 return;
987 }
988 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
989 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
990 Forward);
991 return;
992 }
993
994 unsigned EltSize = 4;
995 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
996 if (RI.isAGPRClass(RC)) {
997 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
998 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
999 else if (RI.hasVGPRs(SrcRC) ||
1000 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1001 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1002 else
1003 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1004 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1005 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1006 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1007 (RI.isProperlyAlignedRC(*RC) &&
1008 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1009 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1010 if (ST.hasMovB64()) {
1011 Opcode = AMDGPU::V_MOV_B64_e32;
1012 EltSize = 8;
1013 } else if (ST.hasPkMovB32()) {
1014 Opcode = AMDGPU::V_PK_MOV_B32;
1015 EltSize = 8;
1016 }
1017 }
1018
1019 // For the cases where we need an intermediate instruction/temporary register
1020 // (destination is an AGPR), we need a scavenger.
1021 //
1022 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1023 // whole block for every handled copy.
1024 std::unique_ptr<RegScavenger> RS;
1025 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1026 RS.reset(new RegScavenger());
1027
1028 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1029
1030 // If there is an overlap, we can't kill the super-register on the last
1031 // instruction, since it will also kill the components made live by this def.
1032 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1033 const bool CanKillSuperReg = KillSrc && !Overlap;
1034
1035 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1036 unsigned SubIdx;
1037 if (Forward)
1038 SubIdx = SubIndices[Idx];
1039 else
1040 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1041
1042 bool IsFirstSubreg = Idx == 0;
1043 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1044
1045 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1046 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1047 Register ImpUseSuper = SrcReg;
1048 indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
1049 RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, Overlap,
1050 ImpDefSuper, ImpUseSuper);
1051 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1052 Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
1053 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1054 MachineInstrBuilder MIB =
1055 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
1056 .addImm(SISrcMods::OP_SEL_1)
1057 .addReg(SrcSubReg)
1058 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
1059 .addReg(SrcSubReg)
1060 .addImm(0) // op_sel_lo
1061 .addImm(0) // op_sel_hi
1062 .addImm(0) // neg_lo
1063 .addImm(0) // neg_hi
1064 .addImm(0) // clamp
1065 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1066 if (IsFirstSubreg)
1067 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1068 } else {
1069 MachineInstrBuilder Builder =
1070 BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
1071 .addReg(RI.getSubReg(SrcReg, SubIdx));
1072 if (IsFirstSubreg)
1073 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1074
1075 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1076 }
1077 }
1078}
1079
1080int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1081 int NewOpc;
1082
1083 // Try to map original to commuted opcode
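// (e.g. an opcode such as V_SUB_F32_e32 maps to its commuted
// V_SUBREV_F32_e32 form and vice versa; illustrative pair.)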
1084 NewOpc = AMDGPU::getCommuteRev(Opcode);
1085 if (NewOpc != -1)
1086 // Check if the commuted (REV) opcode exists on the target.
1087 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1088
1089 // Try to map commuted to original opcode
1090 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1091 if (NewOpc != -1)
1092 // Check if the original (non-REV) opcode exists on the target.
1093 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1094
1095 return Opcode;
1096}
1097
1098void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1099 MachineBasicBlock::iterator MI,
1100 const DebugLoc &DL, Register DestReg,
1101 int64_t Value) const {
1102 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1103 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1104 if (RegClass == &AMDGPU::SReg_32RegClass ||
1105 RegClass == &AMDGPU::SGPR_32RegClass ||
1106 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1107 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1108 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1109 .addImm(Value);
1110 return;
1111 }
1112
1113 if (RegClass == &AMDGPU::SReg_64RegClass ||
1114 RegClass == &AMDGPU::SGPR_64RegClass ||
1115 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1116 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1117 .addImm(Value);
1118 return;
1119 }
1120
1121 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1122 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1123 .addImm(Value);
1124 return;
1125 }
1126 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1127 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1128 .addImm(Value);
1129 return;
1130 }
1131
1132 unsigned EltSize = 4;
1133 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1134 if (RI.isSGPRClass(RegClass)) {
1135 if (RI.getRegSizeInBits(*RegClass) > 32) {
1136 Opcode = AMDGPU::S_MOV_B64;
1137 EltSize = 8;
1138 } else {
1139 Opcode = AMDGPU::S_MOV_B32;
1140 EltSize = 4;
1141 }
1142 }
1143
1144 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1145 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1146 int64_t IdxValue = Idx == 0 ? Value : 0;
1147
1148 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1149 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1150 Builder.addImm(IdxValue);
1151 }
1152}
1153
1154const TargetRegisterClass *
1155SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1156 return &AMDGPU::VGPR_32RegClass;
1157}
1158
1159void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1160 MachineBasicBlock::iterator I,
1161 const DebugLoc &DL, Register DstReg,
1162 ArrayRef<MachineOperand> Cond,
1163 Register TrueReg,
1164 Register FalseReg) const {
1165 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1166 const TargetRegisterClass *BoolXExecRC =
1167 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1168 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1169 "Not a VGPR32 reg");
1170
1171 if (Cond.size() == 1) {
1172 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1173 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1174 .add(Cond[0]);
1175 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1176 .addImm(0)
1177 .addReg(FalseReg)
1178 .addImm(0)
1179 .addReg(TrueReg)
1180 .addReg(SReg);
1181 } else if (Cond.size() == 2) {
1182 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1183 switch (Cond[0].getImm()) {
1184 case SIInstrInfo::SCC_TRUE: {
1185 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1186 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1187 : AMDGPU::S_CSELECT_B64), SReg)
1188 .addImm(1)
1189 .addImm(0);
1190 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1191 .addImm(0)
1192 .addReg(FalseReg)
1193 .addImm(0)
1194 .addReg(TrueReg)
1195 .addReg(SReg);
1196 break;
1197 }
1198 case SIInstrInfo::SCC_FALSE: {
1199 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1200 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1201 : AMDGPU::S_CSELECT_B64), SReg)
1202 .addImm(0)
1203 .addImm(1);
1204 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1205 .addImm(0)
1206 .addReg(FalseReg)
1207 .addImm(0)
1208 .addReg(TrueReg)
1209 .addReg(SReg);
1210 break;
1211 }
1212 case SIInstrInfo::VCCNZ: {
1213 MachineOperand RegOp = Cond[1];
1214 RegOp.setImplicit(false);
1215 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1216 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1217 .add(RegOp);
1218 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1219 .addImm(0)
1220 .addReg(FalseReg)
1221 .addImm(0)
1222 .addReg(TrueReg)
1223 .addReg(SReg);
1224 break;
1225 }
1226 case SIInstrInfo::VCCZ: {
1227 MachineOperand RegOp = Cond[1];
1228 RegOp.setImplicit(false);
1229 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1230 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1231 .add(RegOp);
1232 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1233 .addImm(0)
1234 .addReg(TrueReg)
1235 .addImm(0)
1236 .addReg(FalseReg)
1237 .addReg(SReg);
1238 break;
1239 }
1240 case SIInstrInfo::EXECNZ: {
1241 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1242 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1243 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1244 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1245 .addImm(0);
1246 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1247 : AMDGPU::S_CSELECT_B64), SReg)
1248 .addImm(1)
1249 .addImm(0);
1250 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1251 .addImm(0)
1252 .addReg(FalseReg)
1253 .addImm(0)
1254 .addReg(TrueReg)
1255 .addReg(SReg);
1256 break;
1257 }
1258 case SIInstrInfo::EXECZ: {
1259 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1260 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1261 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1262 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1263 .addImm(0);
1264 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1265 : AMDGPU::S_CSELECT_B64), SReg)
1266 .addImm(0)
1267 .addImm(1);
1268 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1269 .addImm(0)
1270 .addReg(FalseReg)
1271 .addImm(0)
1272 .addReg(TrueReg)
1273 .addReg(SReg);
1274 llvm_unreachable("Unhandled branch predicate EXECZ");
1275 break;
1276 }
1277 default:
1278 llvm_unreachable("invalid branch predicate");
1279 }
1280 } else {
1281 llvm_unreachable("Can only handle Cond size 1 or 2");
1282 }
1283}
1284
1285Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1286 MachineBasicBlock::iterator I,
1287 const DebugLoc &DL,
1288 Register SrcReg, int Value) const {
1289 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1290 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1291 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1292 .addImm(Value)
1293 .addReg(SrcReg);
1294
1295 return Reg;
1296}
1297
1298Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1299 MachineBasicBlock::iterator I,
1300 const DebugLoc &DL,
1301 Register SrcReg, int Value) const {
1302 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1303 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1304 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1305 .addImm(Value)
1306 .addReg(SrcReg);
1307
1308 return Reg;
1309}
1310
1311unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1312
1313 if (RI.isAGPRClass(DstRC))
1314 return AMDGPU::COPY;
1315 if (RI.getRegSizeInBits(*DstRC) == 16) {
1316 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1317 // before RA.
1318 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1319 } else if (RI.getRegSizeInBits(*DstRC) == 32) {
1320 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1321 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1322 return AMDGPU::S_MOV_B64;
1323 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1324 return AMDGPU::V_MOV_B64_PSEUDO;
1325 }
1326 return AMDGPU::COPY;
1327}
1328
1329const MCInstrDesc &
1330SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1331 bool IsIndirectSrc) const {
1332 if (IsIndirectSrc) {
1333 if (VecSize <= 32) // 4 bytes
1334 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1335 if (VecSize <= 64) // 8 bytes
1336 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1337 if (VecSize <= 96) // 12 bytes
1338 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1339 if (VecSize <= 128) // 16 bytes
1340 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1341 if (VecSize <= 160) // 20 bytes
1342 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1343 if (VecSize <= 256) // 32 bytes
1344 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1345 if (VecSize <= 288) // 36 bytes
1346 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1347 if (VecSize <= 320) // 40 bytes
1348 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1349 if (VecSize <= 352) // 44 bytes
1350 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1351 if (VecSize <= 384) // 48 bytes
1352 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1353 if (VecSize <= 512) // 64 bytes
1354 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1355 if (VecSize <= 1024) // 128 bytes
1356 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1357
1358 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1359 }
1360
1361 if (VecSize <= 32) // 4 bytes
1362 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1363 if (VecSize <= 64) // 8 bytes
1364 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1365 if (VecSize <= 96) // 12 bytes
1366 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1367 if (VecSize <= 128) // 16 bytes
1368 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1369 if (VecSize <= 160) // 20 bytes
1370 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1371 if (VecSize <= 256) // 32 bytes
1372 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1373 if (VecSize <= 288) // 36 bytes
1374 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1375 if (VecSize <= 320) // 40 bytes
1376 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1377 if (VecSize <= 352) // 44 bytes
1378 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1379 if (VecSize <= 384) // 48 bytes
1380 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1381 if (VecSize <= 512) // 64 bytes
1382 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1383 if (VecSize <= 1024) // 128 bytes
1384 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1385
1386 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1387}
1388
1389static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1390 if (VecSize <= 32) // 4 bytes
1391 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1392 if (VecSize <= 64) // 8 bytes
1393 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1394 if (VecSize <= 96) // 12 bytes
1395 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1396 if (VecSize <= 128) // 16 bytes
1397 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1398 if (VecSize <= 160) // 20 bytes
1399 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1400 if (VecSize <= 256) // 32 bytes
1401 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1402 if (VecSize <= 288) // 36 bytes
1403 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1404 if (VecSize <= 320) // 40 bytes
1405 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1406 if (VecSize <= 352) // 44 bytes
1407 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1408 if (VecSize <= 384) // 48 bytes
1409 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1410 if (VecSize <= 512) // 64 bytes
1411 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1412 if (VecSize <= 1024) // 128 bytes
1413 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1414
1415 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1416}
1417
1418static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1419 if (VecSize <= 32) // 4 bytes
1420 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1421 if (VecSize <= 64) // 8 bytes
1422 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1423 if (VecSize <= 96) // 12 bytes
1424 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1425 if (VecSize <= 128) // 16 bytes
1426 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1427 if (VecSize <= 160) // 20 bytes
1428 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1429 if (VecSize <= 256) // 32 bytes
1430 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1431 if (VecSize <= 288) // 36 bytes
1432 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1433 if (VecSize <= 320) // 40 bytes
1434 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1435 if (VecSize <= 352) // 44 bytes
1436 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1437 if (VecSize <= 384) // 48 bytes
1438 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1439 if (VecSize <= 512) // 64 bytes
1440 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1441 if (VecSize <= 1024) // 128 bytes
1442 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1443
1444 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1445}
1446
1447static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1448 if (VecSize <= 64) // 8 bytes
1449 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1450 if (VecSize <= 128) // 16 bytes
1451 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1452 if (VecSize <= 256) // 32 bytes
1453 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1454 if (VecSize <= 512) // 64 bytes
1455 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1456 if (VecSize <= 1024) // 128 bytes
1457 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1458
1459 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1460}
1461
1462const MCInstrDesc &
1463SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1464 bool IsSGPR) const {
1465 if (IsSGPR) {
1466 switch (EltSize) {
1467 case 32:
1468 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1469 case 64:
1470 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1471 default:
1472 llvm_unreachable("invalid reg indexing elt size");
1473 }
1474 }
1475
1476 assert(EltSize == 32 && "invalid reg indexing elt size");
1477 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1478}
1479
1480static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1481 switch (Size) {
1482 case 4:
1483 return AMDGPU::SI_SPILL_S32_SAVE;
1484 case 8:
1485 return AMDGPU::SI_SPILL_S64_SAVE;
1486 case 12:
1487 return AMDGPU::SI_SPILL_S96_SAVE;
1488 case 16:
1489 return AMDGPU::SI_SPILL_S128_SAVE;
1490 case 20:
1491 return AMDGPU::SI_SPILL_S160_SAVE;
1492 case 24:
1493 return AMDGPU::SI_SPILL_S192_SAVE;
1494 case 28:
1495 return AMDGPU::SI_SPILL_S224_SAVE;
1496 case 32:
1497 return AMDGPU::SI_SPILL_S256_SAVE;
1498 case 36:
1499 return AMDGPU::SI_SPILL_S288_SAVE;
1500 case 40:
1501 return AMDGPU::SI_SPILL_S320_SAVE;
1502 case 44:
1503 return AMDGPU::SI_SPILL_S352_SAVE;
1504 case 48:
1505 return AMDGPU::SI_SPILL_S384_SAVE;
1506 case 64:
1507 return AMDGPU::SI_SPILL_S512_SAVE;
1508 case 128:
1509 return AMDGPU::SI_SPILL_S1024_SAVE;
1510 default:
1511 llvm_unreachable("unknown register size");
1512 }
1513}
1514
1515static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1516 switch (Size) {
1517 case 4:
1518 return AMDGPU::SI_SPILL_V32_SAVE;
1519 case 8:
1520 return AMDGPU::SI_SPILL_V64_SAVE;
1521 case 12:
1522 return AMDGPU::SI_SPILL_V96_SAVE;
1523 case 16:
1524 return AMDGPU::SI_SPILL_V128_SAVE;
1525 case 20:
1526 return AMDGPU::SI_SPILL_V160_SAVE;
1527 case 24:
1528 return AMDGPU::SI_SPILL_V192_SAVE;
1529 case 28:
1530 return AMDGPU::SI_SPILL_V224_SAVE;
1531 case 32:
1532 return AMDGPU::SI_SPILL_V256_SAVE;
1533 case 36:
1534 return AMDGPU::SI_SPILL_V288_SAVE;
1535 case 40:
1536 return AMDGPU::SI_SPILL_V320_SAVE;
1537 case 44:
1538 return AMDGPU::SI_SPILL_V352_SAVE;
1539 case 48:
1540 return AMDGPU::SI_SPILL_V384_SAVE;
1541 case 64:
1542 return AMDGPU::SI_SPILL_V512_SAVE;
1543 case 128:
1544 return AMDGPU::SI_SPILL_V1024_SAVE;
1545 default:
1546 llvm_unreachable("unknown register size");
1547 }
1548}
1549
1550static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1551 switch (Size) {
1552 case 4:
1553 return AMDGPU::SI_SPILL_A32_SAVE;
1554 case 8:
1555 return AMDGPU::SI_SPILL_A64_SAVE;
1556 case 12:
1557 return AMDGPU::SI_SPILL_A96_SAVE;
1558 case 16:
1559 return AMDGPU::SI_SPILL_A128_SAVE;
1560 case 20:
1561 return AMDGPU::SI_SPILL_A160_SAVE;
1562 case 24:
1563 return AMDGPU::SI_SPILL_A192_SAVE;
1564 case 28:
1565 return AMDGPU::SI_SPILL_A224_SAVE;
1566 case 32:
1567 return AMDGPU::SI_SPILL_A256_SAVE;
1568 case 36:
1569 return AMDGPU::SI_SPILL_A288_SAVE;
1570 case 40:
1571 return AMDGPU::SI_SPILL_A320_SAVE;
1572 case 44:
1573 return AMDGPU::SI_SPILL_A352_SAVE;
1574 case 48:
1575 return AMDGPU::SI_SPILL_A384_SAVE;
1576 case 64:
1577 return AMDGPU::SI_SPILL_A512_SAVE;
1578 case 128:
1579 return AMDGPU::SI_SPILL_A1024_SAVE;
1580 default:
1581 llvm_unreachable("unknown register size");
1582 }
1583}
1584
1585static unsigned getAVSpillSaveOpcode(unsigned Size) {
1586 switch (Size) {
1587 case 4:
1588 return AMDGPU::SI_SPILL_AV32_SAVE;
1589 case 8:
1590 return AMDGPU::SI_SPILL_AV64_SAVE;
1591 case 12:
1592 return AMDGPU::SI_SPILL_AV96_SAVE;
1593 case 16:
1594 return AMDGPU::SI_SPILL_AV128_SAVE;
1595 case 20:
1596 return AMDGPU::SI_SPILL_AV160_SAVE;
1597 case 24:
1598 return AMDGPU::SI_SPILL_AV192_SAVE;
1599 case 28:
1600 return AMDGPU::SI_SPILL_AV224_SAVE;
1601 case 32:
1602 return AMDGPU::SI_SPILL_AV256_SAVE;
1603 case 36:
1604 return AMDGPU::SI_SPILL_AV288_SAVE;
1605 case 40:
1606 return AMDGPU::SI_SPILL_AV320_SAVE;
1607 case 44:
1608 return AMDGPU::SI_SPILL_AV352_SAVE;
1609 case 48:
1610 return AMDGPU::SI_SPILL_AV384_SAVE;
1611 case 64:
1612 return AMDGPU::SI_SPILL_AV512_SAVE;
1613 case 128:
1614 return AMDGPU::SI_SPILL_AV1024_SAVE;
1615 default:
1616 llvm_unreachable("unknown register size");
1617 }
1618}
1619
1620static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1621 bool IsVectorSuperClass) {
1622 // Currently, only 32-bit WWM register spills are needed.
1623 if (Size != 4)
1624 llvm_unreachable("unknown wwm register spill size");
1625
1626 if (IsVectorSuperClass)
1627 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1628
1629 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1630}
1631
1632static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1633 const TargetRegisterClass *RC,
1634 unsigned Size,
1635 const SIRegisterInfo &TRI,
1636 const SIMachineFunctionInfo &MFI) {
1637 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1638
1639 // Choose the right opcode if spilling a WWM register.
1640 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1641 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1642
1643 if (IsVectorSuperClass)
1644 return getAVSpillSaveOpcode(Size);
1645
1646 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1647 : getVGPRSpillSaveOpcode(Size);
1648}
1649
1650void SIInstrInfo::storeRegToStackSlot(
1651 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1652 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1653 const TargetRegisterInfo *TRI, Register VReg) const {
1654 MachineFunction *MF = MBB.getParent();
1655 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1656 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1657 const DebugLoc &DL = MBB.findDebugLoc(MI);
1658
1659 MachinePointerInfo PtrInfo
1660 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1661 MachineMemOperand *MMO = MF->getMachineMemOperand(
1662 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1663 FrameInfo.getObjectAlign(FrameIndex));
1664 unsigned SpillSize = TRI->getSpillSize(*RC);
1665
1667 if (RI.isSGPRClass(RC)) {
1668 MFI->setHasSpilledSGPRs();
1669 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1670 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1671 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1672
1673 // We are only allowed to create one new instruction when spilling
1674 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1675 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1676
1677 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1678 // need to make sure we are using the correct register class.
1679 if (SrcReg.isVirtual() && SpillSize == 4) {
1680 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1681 }
1682
1683 BuildMI(MBB, MI, DL, OpDesc)
1684 .addReg(SrcReg, getKillRegState(isKill)) // data
1685 .addFrameIndex(FrameIndex) // addr
1686 .addMemOperand(MMO)
1687 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1688
1689 if (RI.spillSGPRToVGPR())
1690 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1691 return;
1692 }
1693
1694 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1695 SpillSize, RI, *MFI);
1696 MFI->setHasSpilledVGPRs();
1697
1698 BuildMI(MBB, MI, DL, get(Opcode))
1699 .addReg(SrcReg, getKillRegState(isKill)) // data
1700 .addFrameIndex(FrameIndex) // addr
1701 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1702 .addImm(0) // offset
1703 .addMemOperand(MMO);
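// The resulting MIR for a single 32-bit VGPR spill looks roughly like
// (illustrative):
//   SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0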
1704}
1705
1706static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1707 switch (Size) {
1708 case 4:
1709 return AMDGPU::SI_SPILL_S32_RESTORE;
1710 case 8:
1711 return AMDGPU::SI_SPILL_S64_RESTORE;
1712 case 12:
1713 return AMDGPU::SI_SPILL_S96_RESTORE;
1714 case 16:
1715 return AMDGPU::SI_SPILL_S128_RESTORE;
1716 case 20:
1717 return AMDGPU::SI_SPILL_S160_RESTORE;
1718 case 24:
1719 return AMDGPU::SI_SPILL_S192_RESTORE;
1720 case 28:
1721 return AMDGPU::SI_SPILL_S224_RESTORE;
1722 case 32:
1723 return AMDGPU::SI_SPILL_S256_RESTORE;
1724 case 36:
1725 return AMDGPU::SI_SPILL_S288_RESTORE;
1726 case 40:
1727 return AMDGPU::SI_SPILL_S320_RESTORE;
1728 case 44:
1729 return AMDGPU::SI_SPILL_S352_RESTORE;
1730 case 48:
1731 return AMDGPU::SI_SPILL_S384_RESTORE;
1732 case 64:
1733 return AMDGPU::SI_SPILL_S512_RESTORE;
1734 case 128:
1735 return AMDGPU::SI_SPILL_S1024_RESTORE;
1736 default:
1737 llvm_unreachable("unknown register size");
1738 }
1739}
1740
1741static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1742 switch (Size) {
1743 case 4:
1744 return AMDGPU::SI_SPILL_V32_RESTORE;
1745 case 8:
1746 return AMDGPU::SI_SPILL_V64_RESTORE;
1747 case 12:
1748 return AMDGPU::SI_SPILL_V96_RESTORE;
1749 case 16:
1750 return AMDGPU::SI_SPILL_V128_RESTORE;
1751 case 20:
1752 return AMDGPU::SI_SPILL_V160_RESTORE;
1753 case 24:
1754 return AMDGPU::SI_SPILL_V192_RESTORE;
1755 case 28:
1756 return AMDGPU::SI_SPILL_V224_RESTORE;
1757 case 32:
1758 return AMDGPU::SI_SPILL_V256_RESTORE;
1759 case 36:
1760 return AMDGPU::SI_SPILL_V288_RESTORE;
1761 case 40:
1762 return AMDGPU::SI_SPILL_V320_RESTORE;
1763 case 44:
1764 return AMDGPU::SI_SPILL_V352_RESTORE;
1765 case 48:
1766 return AMDGPU::SI_SPILL_V384_RESTORE;
1767 case 64:
1768 return AMDGPU::SI_SPILL_V512_RESTORE;
1769 case 128:
1770 return AMDGPU::SI_SPILL_V1024_RESTORE;
1771 default:
1772 llvm_unreachable("unknown register size");
1773 }
1774}
1775
1776static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1777 switch (Size) {
1778 case 4:
1779 return AMDGPU::SI_SPILL_A32_RESTORE;
1780 case 8:
1781 return AMDGPU::SI_SPILL_A64_RESTORE;
1782 case 12:
1783 return AMDGPU::SI_SPILL_A96_RESTORE;
1784 case 16:
1785 return AMDGPU::SI_SPILL_A128_RESTORE;
1786 case 20:
1787 return AMDGPU::SI_SPILL_A160_RESTORE;
1788 case 24:
1789 return AMDGPU::SI_SPILL_A192_RESTORE;
1790 case 28:
1791 return AMDGPU::SI_SPILL_A224_RESTORE;
1792 case 32:
1793 return AMDGPU::SI_SPILL_A256_RESTORE;
1794 case 36:
1795 return AMDGPU::SI_SPILL_A288_RESTORE;
1796 case 40:
1797 return AMDGPU::SI_SPILL_A320_RESTORE;
1798 case 44:
1799 return AMDGPU::SI_SPILL_A352_RESTORE;
1800 case 48:
1801 return AMDGPU::SI_SPILL_A384_RESTORE;
1802 case 64:
1803 return AMDGPU::SI_SPILL_A512_RESTORE;
1804 case 128:
1805 return AMDGPU::SI_SPILL_A1024_RESTORE;
1806 default:
1807 llvm_unreachable("unknown register size");
1808 }
1809}
1810
1811static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1812 switch (Size) {
1813 case 4:
1814 return AMDGPU::SI_SPILL_AV32_RESTORE;
1815 case 8:
1816 return AMDGPU::SI_SPILL_AV64_RESTORE;
1817 case 12:
1818 return AMDGPU::SI_SPILL_AV96_RESTORE;
1819 case 16:
1820 return AMDGPU::SI_SPILL_AV128_RESTORE;
1821 case 20:
1822 return AMDGPU::SI_SPILL_AV160_RESTORE;
1823 case 24:
1824 return AMDGPU::SI_SPILL_AV192_RESTORE;
1825 case 28:
1826 return AMDGPU::SI_SPILL_AV224_RESTORE;
1827 case 32:
1828 return AMDGPU::SI_SPILL_AV256_RESTORE;
1829 case 36:
1830 return AMDGPU::SI_SPILL_AV288_RESTORE;
1831 case 40:
1832 return AMDGPU::SI_SPILL_AV320_RESTORE;
1833 case 44:
1834 return AMDGPU::SI_SPILL_AV352_RESTORE;
1835 case 48:
1836 return AMDGPU::SI_SPILL_AV384_RESTORE;
1837 case 64:
1838 return AMDGPU::SI_SPILL_AV512_RESTORE;
1839 case 128:
1840 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1841 default:
1842 llvm_unreachable("unknown register size");
1843 }
1844}
1845
1846static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1847 bool IsVectorSuperClass) {
1848 // Currently, only 32-bit WWM register spills are needed.
1849 if (Size != 4)
1850 llvm_unreachable("unknown wwm register spill size");
1851
1852 if (IsVectorSuperClass)
1853 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1854
1855 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1856}
1857
1858static unsigned
1859getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1860 unsigned Size, const SIRegisterInfo &TRI,
1861 const SIMachineFunctionInfo &MFI) {
1862 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1863
1864 // Choose the right opcode if restoring a WWM register.
1865 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1866 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1867
1868 if (IsVectorSuperClass)
1869 return getAVSpillRestoreOpcode(Size);
1870
1871 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1872 : getVGPRSpillRestoreOpcode(Size);
1873}
1874
1875void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1876 MachineBasicBlock::iterator MI,
1877 Register DestReg, int FrameIndex,
1878 const TargetRegisterClass *RC,
1879 const TargetRegisterInfo *TRI,
1880 Register VReg) const {
1881 MachineFunction *MF = MBB.getParent();
1882 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1883 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1884 const DebugLoc &DL = MBB.findDebugLoc(MI);
1885 unsigned SpillSize = TRI->getSpillSize(*RC);
1886
1887 MachinePointerInfo PtrInfo
1888 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1889
1890 MachineMemOperand *MMO = MF->getMachineMemOperand(
1891 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1892 FrameInfo.getObjectAlign(FrameIndex));
1893
1894 if (RI.isSGPRClass(RC)) {
1895 MFI->setHasSpilledSGPRs();
1896 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1897 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1898 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1899
1900 // FIXME: Maybe this should not include a memoperand because it will be
1901 // lowered to non-memory instructions.
1902 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1903 if (DestReg.isVirtual() && SpillSize == 4) {
1904 MachineRegisterInfo &MRI = MF->getRegInfo();
1905 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1906 }
1907
1908 if (RI.spillSGPRToVGPR())
1909 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1910 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1911 .addFrameIndex(FrameIndex) // addr
1912 .addMemOperand(MMO)
1913 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1914
1915 return;
1916 }
1917
1918 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1919 SpillSize, RI, *MFI);
1920 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1921 .addFrameIndex(FrameIndex) // vaddr
1922 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1923 .addImm(0) // offset
1924 .addMemOperand(MMO);
1925}
1926
1927void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1928 MachineBasicBlock::iterator MI) const {
1929 insertNoops(MBB, MI, 1);
1930}
1931
1932void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1933 MachineBasicBlock::iterator MI,
1934 unsigned Quantity) const {
1935 DebugLoc DL = MBB.findDebugLoc(MI);
1936 while (Quantity > 0) {
1937 unsigned Arg = std::min(Quantity, 8u);
1938 Quantity -= Arg;
1939 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1940 }
1941}
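// Editor's sketch (not part of SIInstrInfo.cpp; the function name below is
// hypothetical): the chunking used by insertNoops above. S_NOP's immediate
// encodes "imm + 1" wait states, capped at 8 per instruction, so a request
// for ten no-ops becomes S_NOP 7 (8 waits) followed by S_NOP 1 (2 waits).
static void exampleNopImmediates(unsigned Quantity,
                                 SmallVectorImpl<unsigned> &Imms) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    Imms.push_back(Arg - 1); // the immediate that would go on each S_NOP
  }
}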
1942
1943void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1944 auto MF = MBB.getParent();
1945 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1946
1947 assert(Info->isEntryFunction());
1948
1949 if (MBB.succ_empty()) {
1950 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1951 if (HasNoTerminator) {
1952 if (Info->returnsVoid()) {
1953 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1954 } else {
1955 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1956 }
1957 }
1958 }
1959}
1960
1961unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
1962 switch (MI.getOpcode()) {
1963 default:
1964 if (MI.isMetaInstruction())
1965 return 0;
1966 return 1; // FIXME: Do wait states equal cycles?
1967
1968 case AMDGPU::S_NOP:
1969 return MI.getOperand(0).getImm() + 1;
1970 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
1971 // hazard, even if one exists, won't really be visible. Should we handle it?
1972 }
1973}
1974
1975bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1976 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1977 MachineBasicBlock &MBB = *MI.getParent();
1978 DebugLoc DL = MBB.findDebugLoc(MI);
1979 switch (MI.getOpcode()) {
1980 default: return TargetInstrInfo::expandPostRAPseudo(MI);
1981 case AMDGPU::S_MOV_B64_term:
1982 // This is only a terminator to get the correct spill code placement during
1983 // register allocation.
1984 MI.setDesc(get(AMDGPU::S_MOV_B64));
1985 break;
1986
1987 case AMDGPU::S_MOV_B32_term:
1988 // This is only a terminator to get the correct spill code placement during
1989 // register allocation.
1990 MI.setDesc(get(AMDGPU::S_MOV_B32));
1991 break;
1992
1993 case AMDGPU::S_XOR_B64_term:
1994 // This is only a terminator to get the correct spill code placement during
1995 // register allocation.
1996 MI.setDesc(get(AMDGPU::S_XOR_B64));
1997 break;
1998
1999 case AMDGPU::S_XOR_B32_term:
2000 // This is only a terminator to get the correct spill code placement during
2001 // register allocation.
2002 MI.setDesc(get(AMDGPU::S_XOR_B32));
2003 break;
2004 case AMDGPU::S_OR_B64_term:
2005 // This is only a terminator to get the correct spill code placement during
2006 // register allocation.
2007 MI.setDesc(get(AMDGPU::S_OR_B64));
2008 break;
2009 case AMDGPU::S_OR_B32_term:
2010 // This is only a terminator to get the correct spill code placement during
2011 // register allocation.
2012 MI.setDesc(get(AMDGPU::S_OR_B32));
2013 break;
2014
2015 case AMDGPU::S_ANDN2_B64_term:
2016 // This is only a terminator to get the correct spill code placement during
2017 // register allocation.
2018 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2019 break;
2020
2021 case AMDGPU::S_ANDN2_B32_term:
2022 // This is only a terminator to get the correct spill code placement during
2023 // register allocation.
2024 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2025 break;
2026
2027 case AMDGPU::S_AND_B64_term:
2028 // This is only a terminator to get the correct spill code placement during
2029 // register allocation.
2030 MI.setDesc(get(AMDGPU::S_AND_B64));
2031 break;
2032
2033 case AMDGPU::S_AND_B32_term:
2034 // This is only a terminator to get the correct spill code placement during
2035 // register allocation.
2036 MI.setDesc(get(AMDGPU::S_AND_B32));
2037 break;
2038
2039 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2040 // This is only a terminator to get the correct spill code placement during
2041 // register allocation.
2042 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2043 break;
2044
2045 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2046 // This is only a terminator to get the correct spill code placement during
2047 // register allocation.
2048 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2049 break;
2050
2051 case AMDGPU::V_MOV_B64_PSEUDO: {
2052 Register Dst = MI.getOperand(0).getReg();
2053 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2054 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2055
2056 const MachineOperand &SrcOp = MI.getOperand(1);
2057 // FIXME: Will this work for 64-bit floating point immediates?
2058 assert(!SrcOp.isFPImm());
2059 if (ST.hasMovB64()) {
2060 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2061 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2062 isUInt<32>(SrcOp.getImm()))
2063 break;
2064 }
2065 if (SrcOp.isImm()) {
2066 APInt Imm(64, SrcOp.getImm());
2067 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2068 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2069 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2070 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2071 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2072 .addImm(Lo.getSExtValue())
2073 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2074 .addImm(Lo.getSExtValue())
2075 .addImm(0) // op_sel_lo
2076 .addImm(0) // op_sel_hi
2077 .addImm(0) // neg_lo
2078 .addImm(0) // neg_hi
2079 .addImm(0); // clamp
2080 } else {
2081 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2082 .addImm(Lo.getSExtValue())
2083 .addReg(Dst, RegState::Implicit | RegState::Define);
2084 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2085 .addImm(Hi.getSExtValue())
2086 .addReg(Dst, RegState::Implicit | RegState::Define);
2087 }
2088 } else {
2089 assert(SrcOp.isReg());
2090 if (ST.hasPkMovB32() &&
2091 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2092 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2093 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2094 .addReg(SrcOp.getReg())
2095 .addImm(SISrcMods::OP_SEL_1 | SISrcMods::OP_SEL_0) // src1_mod
2096 .addReg(SrcOp.getReg())
2097 .addImm(0) // op_sel_lo
2098 .addImm(0) // op_sel_hi
2099 .addImm(0) // neg_lo
2100 .addImm(0) // neg_hi
2101 .addImm(0); // clamp
2102 } else {
2103 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2104 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2105 .addReg(Dst, RegState::Implicit | RegState::Define);
2106 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2107 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2108 .addReg(Dst, RegState::Implicit | RegState::Define);
2109 }
2110 }
2111 MI.eraseFromParent();
2112 break;
2113 }
2114 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2115 expandMovDPP64(MI);
2116 break;
2117 }
2118 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2119 const MachineOperand &SrcOp = MI.getOperand(1);
2120 assert(!SrcOp.isFPImm());
2121 APInt Imm(64, SrcOp.getImm());
2122 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2123 MI.setDesc(get(AMDGPU::S_MOV_B64));
2124 break;
2125 }
2126
2127 Register Dst = MI.getOperand(0).getReg();
2128 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2129 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2130
2131 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2132 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2133 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2134 .addImm(Lo.getSExtValue())
2135 .addReg(Dst, RegState::Implicit | RegState::Define);
2136 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2137 .addImm(Hi.getSExtValue())
2138 .addReg(Dst, RegState::Implicit | RegState::Define);
2139 MI.eraseFromParent();
2140 break;
2141 }
2142 case AMDGPU::V_SET_INACTIVE_B32: {
2143 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2144 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2145 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
2146 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
2147 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2148 .add(MI.getOperand(1));
2149 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2150 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2151 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2152 .add(MI.getOperand(2));
2153 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2154 .addReg(Exec);
2155 MI.eraseFromParent();
2156 break;
2157 }
2158 case AMDGPU::V_SET_INACTIVE_B64: {
2159 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2160 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2161 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2162 MI.getOperand(0).getReg())
2163 .add(MI.getOperand(1));
2164 expandPostRAPseudo(*Copy);
2165 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2166 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2167 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2168 MI.getOperand(0).getReg())
2169 .add(MI.getOperand(2));
2170 expandPostRAPseudo(*Copy);
2171 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2172 .addReg(Exec);
2173 MI.eraseFromParent();
2174 break;
2175 }
2176 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2177 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2178 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2179 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2180 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2181 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2182 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2183 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2184 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2185 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2186 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2187 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2188 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2189 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2190 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2191 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2192 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2193 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2194 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2195 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2196 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2197 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2198 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2199 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2200 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2201 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2202 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2203 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2204 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2205 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2206
2207 unsigned Opc;
2208 if (RI.hasVGPRs(EltRC)) {
2209 Opc = AMDGPU::V_MOVRELD_B32_e32;
2210 } else {
2211 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2212 : AMDGPU::S_MOVRELD_B32;
2213 }
2214
2215 const MCInstrDesc &OpDesc = get(Opc);
2216 Register VecReg = MI.getOperand(0).getReg();
2217 bool IsUndef = MI.getOperand(1).isUndef();
2218 unsigned SubReg = MI.getOperand(3).getImm();
2219 assert(VecReg == MI.getOperand(1).getReg());
2220
2222 BuildMI(MBB, MI, DL, OpDesc)
2223 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2224 .add(MI.getOperand(2))
2225 .addReg(VecReg, RegState::ImplicitDefine)
2226 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2227
2228 const int ImpDefIdx =
2229 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2230 const int ImpUseIdx = ImpDefIdx + 1;
2231 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2232 MI.eraseFromParent();
2233 break;
2234 }
2235 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2236 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2237 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2238 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2239 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2240 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2241 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2242 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2243 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2244 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2245 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2246 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2248 Register VecReg = MI.getOperand(0).getReg();
2249 bool IsUndef = MI.getOperand(1).isUndef();
2250 Register Idx = MI.getOperand(3).getReg();
2251 Register SubReg = MI.getOperand(4).getImm();
2252
2253 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2254 .addReg(Idx)
2255 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2256 SetOn->getOperand(3).setIsUndef();
2257
2258 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2259 MachineInstrBuilder MIB =
2260 BuildMI(MBB, MI, DL, OpDesc)
2261 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2262 .add(MI.getOperand(2))
2263 .addReg(VecReg, RegState::ImplicitDefine)
2264 .addReg(VecReg,
2265 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2266
2267 const int ImpDefIdx =
2268 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2269 const int ImpUseIdx = ImpDefIdx + 1;
2270 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2271
2272 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2273
2274 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2275
2276 MI.eraseFromParent();
2277 break;
2278 }
2279 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2280 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2281 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2282 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2283 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2284 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2285 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2286 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2287 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2288 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2289 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2290 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2292 Register Dst = MI.getOperand(0).getReg();
2293 Register VecReg = MI.getOperand(1).getReg();
2294 bool IsUndef = MI.getOperand(1).isUndef();
2295 Register Idx = MI.getOperand(2).getReg();
2296 Register SubReg = MI.getOperand(3).getImm();
2297
2298 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2299 .addReg(Idx)
2300 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2301 SetOn->getOperand(3).setIsUndef();
2302
2303 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2304 .addDef(Dst)
2305 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2306 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2307
2308 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2309
2310 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2311
2312 MI.eraseFromParent();
2313 break;
2314 }
2315 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2316 MachineFunction &MF = *MBB.getParent();
2317 Register Reg = MI.getOperand(0).getReg();
2318 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2319 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2320
2321 // Create a bundle so these instructions won't be re-ordered by the
2322 // post-RA scheduler.
2323 MIBundleBuilder Bundler(MBB, MI);
2324 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2325
2326 // Add 32-bit offset from this instruction to the start of the
2327 // constant data.
2328 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
2329 .addReg(RegLo)
2330 .add(MI.getOperand(1)));
2331
2332 MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2333 .addReg(RegHi);
2334 MIB.add(MI.getOperand(2));
2335
2336 Bundler.append(MIB);
2337 finalizeBundle(MBB, Bundler.begin());
2338
2339 MI.eraseFromParent();
2340 break;
2341 }
2342 case AMDGPU::ENTER_STRICT_WWM: {
2343 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2344 // Whole Wave Mode is entered.
2345 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2346 : AMDGPU::S_OR_SAVEEXEC_B64));
2347 break;
2348 }
2349 case AMDGPU::ENTER_STRICT_WQM: {
2350 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2351 // STRICT_WQM is entered.
2352 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2353 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2354 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2355 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2356 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2357
2358 MI.eraseFromParent();
2359 break;
2360 }
2361 case AMDGPU::EXIT_STRICT_WWM:
2362 case AMDGPU::EXIT_STRICT_WQM: {
2363 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2364 // WWM/STRICT_WQM is exited.
2365 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2366 break;
2367 }
2368 case AMDGPU::ENTER_PSEUDO_WM:
2369 case AMDGPU::EXIT_PSEUDO_WM: {
2370 // These do nothing.
2371 MI.eraseFromParent();
2372 break;
2373 }
2374 case AMDGPU::SI_RETURN: {
2375 const MachineFunction *MF = MBB.getParent();
2376 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2377 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2378 // Hiding the return address use with SI_RETURN may lead to extra kills in
2379 // the function and missing live-ins. We are fine in practice because callee
2380 // saved register handling ensures the register value is restored before
2381 // RET, but we need the undef flag here to appease the MachineVerifier
2382 // liveness checks.
2384 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2385 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2386
2387 MIB.copyImplicitOps(MI);
2388 MI.eraseFromParent();
2389 break;
2390 }
2391 }
2392 return true;
2393}
2394
2395std::pair<MachineInstr*, MachineInstr*>
2396SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2397 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2398
2399 if (ST.hasMovB64() &&
2400 AMDGPU::isLegal64BitDPPControl(
2401 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2402 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2403 return std::pair(&MI, nullptr);
2404 }
2405
2406 MachineBasicBlock &MBB = *MI.getParent();
2407 DebugLoc DL = MBB.findDebugLoc(MI);
2408 MachineFunction *MF = MBB.getParent();
2409 MachineRegisterInfo &MRI = MF->getRegInfo();
2410 Register Dst = MI.getOperand(0).getReg();
2411 unsigned Part = 0;
2412 MachineInstr *Split[2];
2413
2414 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2415 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2416 if (Dst.isPhysical()) {
2417 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2418 } else {
2419 assert(MRI.isSSA());
2420 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2421 MovDPP.addDef(Tmp);
2422 }
2423
2424 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2425 const MachineOperand &SrcOp = MI.getOperand(I);
2426 assert(!SrcOp.isFPImm());
2427 if (SrcOp.isImm()) {
2428 APInt Imm(64, SrcOp.getImm());
2429 Imm.ashrInPlace(Part * 32);
2430 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2431 } else {
2432 assert(SrcOp.isReg());
2433 Register Src = SrcOp.getReg();
2434 if (Src.isPhysical())
2435 MovDPP.addReg(RI.getSubReg(Src, Sub));
2436 else
2437 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2438 }
2439 }
2440
2441 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2442 MovDPP.addImm(MO.getImm());
2443
2444 Split[Part] = MovDPP;
2445 ++Part;
2446 }
2447
2448 if (Dst.isVirtual())
2449 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2450 .addReg(Split[0]->getOperand(0).getReg())
2451 .addImm(AMDGPU::sub0)
2452 .addReg(Split[1]->getOperand(0).getReg())
2453 .addImm(AMDGPU::sub1);
2454
2455 MI.eraseFromParent();
2456 return std::pair(Split[0], Split[1]);
2457}
2458
2459std::optional<DestSourcePair>
2460SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2461 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2462 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2463
2464 return std::nullopt;
2465}
2466
2467bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2468 MachineOperand &Src0,
2469 unsigned Src0OpName,
2470 MachineOperand &Src1,
2471 unsigned Src1OpName) const {
2472 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2473 if (!Src0Mods)
2474 return false;
2475
2476 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2477 assert(Src1Mods &&
2478 "All commutable instructions have both src0 and src1 modifiers");
2479
2480 int Src0ModsVal = Src0Mods->getImm();
2481 int Src1ModsVal = Src1Mods->getImm();
2482
2483 Src1Mods->setImm(Src0ModsVal);
2484 Src0Mods->setImm(Src1ModsVal);
2485 return true;
2486}
2487
2488static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2489 MachineOperand &RegOp,
2490 MachineOperand &NonRegOp) {
2491 Register Reg = RegOp.getReg();
2492 unsigned SubReg = RegOp.getSubReg();
2493 bool IsKill = RegOp.isKill();
2494 bool IsDead = RegOp.isDead();
2495 bool IsUndef = RegOp.isUndef();
2496 bool IsDebug = RegOp.isDebug();
2497
2498 if (NonRegOp.isImm())
2499 RegOp.ChangeToImmediate(NonRegOp.getImm());
2500 else if (NonRegOp.isFI())
2501 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2502 else if (NonRegOp.isGlobal()) {
2503 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2504 NonRegOp.getTargetFlags());
2505 } else
2506 return nullptr;
2507
2508 // Make sure we don't reinterpret a subreg index in the target flags.
2509 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2510
2511 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2512 NonRegOp.setSubReg(SubReg);
2513
2514 return &MI;
2515}
2516
2517MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2518 unsigned Src0Idx,
2519 unsigned Src1Idx) const {
2520 assert(!NewMI && "this should never be used");
2521
2522 unsigned Opc = MI.getOpcode();
2523 int CommutedOpcode = commuteOpcode(Opc);
2524 if (CommutedOpcode == -1)
2525 return nullptr;
2526
2527 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2528 static_cast<int>(Src0Idx) &&
2529 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2530 static_cast<int>(Src1Idx) &&
2531 "inconsistency with findCommutedOpIndices");
2532
2533 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2534 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2535
2536 MachineInstr *CommutedMI = nullptr;
2537 if (Src0.isReg() && Src1.isReg()) {
2538 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2539 // Be sure to copy the source modifiers to the right place.
2540 CommutedMI
2541 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2542 }
2543
2544 } else if (Src0.isReg() && !Src1.isReg()) {
2545 // src0 should always be able to support any operand type, so no need to
2546 // check operand legality.
2547 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2548 } else if (!Src0.isReg() && Src1.isReg()) {
2549 if (isOperandLegal(MI, Src1Idx, &Src0))
2550 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2551 } else {
2552 // FIXME: Found two non-register operands to commute. This does happen.
2553 return nullptr;
2554 }
2555
2556 if (CommutedMI) {
2557 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2558 Src1, AMDGPU::OpName::src1_modifiers);
2559
2560 CommutedMI->setDesc(get(CommutedOpcode));
2561 }
2562
2563 return CommutedMI;
2564}
2565
2566// This needs to be implemented because the source modifiers may be inserted
2567// between the true commutable operands, and the base
2568// TargetInstrInfo::commuteInstruction uses it.
2569bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2570 unsigned &SrcOpIdx0,
2571 unsigned &SrcOpIdx1) const {
2572 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2573}
2574
2575bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2576 unsigned &SrcOpIdx0,
2577 unsigned &SrcOpIdx1) const {
2578 if (!Desc.isCommutable())
2579 return false;
2580
2581 unsigned Opc = Desc.getOpcode();
2582 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2583 if (Src0Idx == -1)
2584 return false;
2585
2586 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2587 if (Src1Idx == -1)
2588 return false;
2589
2590 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2591}
2592
2593bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2594 int64_t BrOffset) const {
2595 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2596 // block is unanalyzable.
2597 assert(BranchOp != AMDGPU::S_SETPC_B64);
2598
2599 // Convert to dwords.
2600 BrOffset /= 4;
2601
2602 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2603 // from the next instruction.
2604 BrOffset -= 1;
2605
2606 return isIntN(BranchOffsetBits, BrOffset);
2607}
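// Editor's sketch (not part of SIInstrInfo.cpp; the function name below is
// hypothetical): the arithmetic behind isBranchOffsetInRange above for the
// default 16-bit immediate. A byte offset is scaled to dwords and biased by
// -1 because the hardware adds the offset to the address of the *next*
// instruction (PC += signext(SIMM16 * 4) + 4).
static bool exampleFitsDefault16BitBranch(int64_t ByteOffset) {
  int64_t DWords = ByteOffset / 4; // SIMM16 counts units of 4 bytes
  DWords -= 1;                     // offset is relative to the next PC
  return isIntN(16, DWords);       // same check, with the default branch bits
}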
2608
2609MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
2610 const MachineInstr &MI) const {
2611 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
2612 // This would be a difficult analysis to perform, but can always be legal so
2613 // there's no need to analyze it.
2614 return nullptr;
2615 }
2616
2617 return MI.getOperand(0).getMBB();
2618}
2619
2621 for (const MachineInstr &MI : MBB->terminators()) {
2622 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2623 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2624 MI.getOpcode() == AMDGPU::SI_LOOP)
2625 return true;
2626 }
2627 return false;
2628}
2629
2630void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2631 MachineBasicBlock &DestBB,
2632 MachineBasicBlock &RestoreBB,
2633 const DebugLoc &DL, int64_t BrOffset,
2634 RegScavenger *RS) const {
2635 assert(RS && "RegScavenger required for long branching");
2636 assert(MBB.empty() &&
2637 "new block should be inserted for expanding unconditional branch");
2638 assert(MBB.pred_size() == 1);
2639 assert(RestoreBB.empty() &&
2640 "restore block should be inserted for restoring clobbered registers");
2641
2642 MachineFunction *MF = MBB.getParent();
2643 MachineRegisterInfo &MRI = MF->getRegInfo();
2644 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2645
2646 // FIXME: Virtual register workaround for RegScavenger not working with empty
2647 // blocks.
2648 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2649
2650 auto I = MBB.end();
2651
2652 // We need to compute the offset relative to the instruction immediately after
2653 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2654 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2655
2656 auto &MCCtx = MF->getContext();
2657 MCSymbol *PostGetPCLabel =
2658 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2659 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2660
2661 MCSymbol *OffsetLo =
2662 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2663 MCSymbol *OffsetHi =
2664 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2665 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2666 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2667 .addReg(PCReg, 0, AMDGPU::sub0)
2668 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2669 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2670 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2671 .addReg(PCReg, 0, AMDGPU::sub1)
2672 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2673
2674 // Insert the indirect branch after the other terminator.
2675 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2676 .addReg(PCReg);
2677
2678 // If a spill is needed for the pc register pair, we need to insert a spill
2679 // restore block right before the destination block, and insert a short branch
2680 // into the old destination block's fallthrough predecessor.
2681 // e.g.:
2682 //
2683 // s_cbranch_scc0 skip_long_branch:
2684 //
2685 // long_branch_bb:
2686 // spill s[8:9]
2687 // s_getpc_b64 s[8:9]
2688 // s_add_u32 s8, s8, restore_bb
2689 // s_addc_u32 s9, s9, 0
2690 // s_setpc_b64 s[8:9]
2691 //
2692 // skip_long_branch:
2693 // foo;
2694 //
2695 // .....
2696 //
2697 // dest_bb_fallthrough_predecessor:
2698 // bar;
2699 // s_branch dest_bb
2700 //
2701 // restore_bb:
2702 // restore s[8:9]
2703 // fallthrough dest_bb
2704 ///
2705 // dest_bb:
2706 // buzz;
2707
2708 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2709 Register Scav;
2710
2711 // If we've previously reserved a register for long branches
2712 // avoid running the scavenger and just use those registers
2713 if (LongBranchReservedReg) {
2714 RS->enterBasicBlock(MBB);
2715 Scav = LongBranchReservedReg;
2716 } else {
2717 RS->enterBasicBlockEnd(MBB);
2718 Scav = RS->scavengeRegisterBackwards(
2719 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2720 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2721 }
2722 if (Scav) {
2723 RS->setRegUsed(Scav);
2724 MRI.replaceRegWith(PCReg, Scav);
2725 MRI.clearVirtRegs();
2726 } else {
2727 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
2728 // SGPR spill.
2729 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2730 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2731 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2732 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2733 MRI.clearVirtRegs();
2734 }
2735
2736 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2737 // Now the distance can be defined.
2738 auto *Offset = MCBinaryExpr::createSub(
2739 MCSymbolRefExpr::create(DestLabel, MCCtx),
2740 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2741 // Add offset assignments.
2742 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2743 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2744 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2745 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2746}
2747
2748unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2749 switch (Cond) {
2750 case SIInstrInfo::SCC_TRUE:
2751 return AMDGPU::S_CBRANCH_SCC1;
2752 case SIInstrInfo::SCC_FALSE:
2753 return AMDGPU::S_CBRANCH_SCC0;
2754 case SIInstrInfo::VCCNZ:
2755 return AMDGPU::S_CBRANCH_VCCNZ;
2756 case SIInstrInfo::VCCZ:
2757 return AMDGPU::S_CBRANCH_VCCZ;
2758 case SIInstrInfo::EXECNZ:
2759 return AMDGPU::S_CBRANCH_EXECNZ;
2760 case SIInstrInfo::EXECZ:
2761 return AMDGPU::S_CBRANCH_EXECZ;
2762 default:
2763 llvm_unreachable("invalid branch predicate");
2764 }
2765}
2766
2767SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
2768 switch (Opcode) {
2769 case AMDGPU::S_CBRANCH_SCC0:
2770 return SCC_FALSE;
2771 case AMDGPU::S_CBRANCH_SCC1:
2772 return SCC_TRUE;
2773 case AMDGPU::S_CBRANCH_VCCNZ:
2774 return VCCNZ;
2775 case AMDGPU::S_CBRANCH_VCCZ:
2776 return VCCZ;
2777 case AMDGPU::S_CBRANCH_EXECNZ:
2778 return EXECNZ;
2779 case AMDGPU::S_CBRANCH_EXECZ:
2780 return EXECZ;
2781 default:
2782 return INVALID_BR;
2783 }
2784}
2785
2786bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
2787 MachineBasicBlock::iterator I,
2788 MachineBasicBlock *&TBB,
2789 MachineBasicBlock *&FBB,
2790 SmallVectorImpl<MachineOperand> &Cond,
2791 bool AllowModify) const {
2792 if (I->getOpcode() == AMDGPU::S_BRANCH) {
2793 // Unconditional Branch
2794 TBB = I->getOperand(0).getMBB();
2795 return false;
2796 }
2797
2798 MachineBasicBlock *CondBB = nullptr;
2799
2800 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
2801 CondBB = I->getOperand(1).getMBB();
2802 Cond.push_back(I->getOperand(0));
2803 } else {
2804 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
2805 if (Pred == INVALID_BR)
2806 return true;
2807
2808 CondBB = I->getOperand(0).getMBB();
2809 Cond.push_back(MachineOperand::CreateImm(Pred));
2810 Cond.push_back(I->getOperand(1)); // Save the branch register.
2811 }
2812 ++I;
2813
2814 if (I == MBB.end()) {
2815 // Conditional branch followed by fall-through.
2816 TBB = CondBB;
2817 return false;
2818 }
2819
2820 if (I->getOpcode() == AMDGPU::S_BRANCH) {
2821 TBB = CondBB;
2822 FBB = I->getOperand(0).getMBB();
2823 return false;
2824 }
2825
2826 return true;
2827}
2828
2829bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
2830 MachineBasicBlock *&FBB,
2831 SmallVectorImpl<MachineOperand> &Cond,
2832 bool AllowModify) const {
2833 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
2834 auto E = MBB.end();
2835 if (I == E)
2836 return false;
2837
2838 // Skip over the instructions that are artificially terminators for special
2839 // exec management.
2840 while (I != E && !I->isBranch() && !I->isReturn()) {
2841 switch (I->getOpcode()) {
2842 case AMDGPU::S_MOV_B64_term:
2843 case AMDGPU::S_XOR_B64_term:
2844 case AMDGPU::S_OR_B64_term:
2845 case AMDGPU::S_ANDN2_B64_term:
2846 case AMDGPU::S_AND_B64_term:
2847 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2848 case AMDGPU::S_MOV_B32_term:
2849 case AMDGPU::S_XOR_B32_term:
2850 case AMDGPU::S_OR_B32_term:
2851 case AMDGPU::S_ANDN2_B32_term:
2852 case AMDGPU::S_AND_B32_term:
2853 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2854 break;
2855 case AMDGPU::SI_IF:
2856 case AMDGPU::SI_ELSE:
2857 case AMDGPU::SI_KILL_I1_TERMINATOR:
2858 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
2859 // FIXME: It's messy that these need to be considered here at all.
2860 return true;
2861 default:
2862 llvm_unreachable("unexpected non-branch terminator inst");
2863 }
2864
2865 ++I;
2866 }
2867
2868 if (I == E)
2869 return false;
2870
2871 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
2872}
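// Editor's sketch (not part of SIInstrInfo.cpp; the function name below is
// hypothetical): how a pass would typically consume the TBB/FBB/Cond triple
// produced by analyzeBranch above.
static void exampleClassifyBlockEnd(const SIInstrInfo &TII,
                                    MachineBasicBlock &MBB) {
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (TII.analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false))
    return; // terminator sequence could not be analyzed
  // Cond.empty(): unconditional branch (or fallthrough if TBB is also null).
  // Otherwise Cond holds {BranchPredicate, condition register}, and FBB is
  // set when an explicit unconditional branch follows the conditional one.
}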
2873
2874unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
2875 int *BytesRemoved) const {
2876 unsigned Count = 0;
2877 unsigned RemovedSize = 0;
2878 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
2879 // Skip over artificial terminators when removing instructions.
2880 if (MI.isBranch() || MI.isReturn()) {
2881 RemovedSize += getInstSizeInBytes(MI);
2882 MI.eraseFromParent();
2883 ++Count;
2884 }
2885 }
2886
2887 if (BytesRemoved)
2888 *BytesRemoved = RemovedSize;
2889
2890 return Count;
2891}
2892
2893// Copy the flags onto the implicit condition register operand.
2894static void preserveCondRegFlags(MachineOperand &CondReg,
2895 const MachineOperand &OrigCond) {
2896 CondReg.setIsUndef(OrigCond.isUndef());
2897 CondReg.setIsKill(OrigCond.isKill());
2898}
2899
2900unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
2901 MachineBasicBlock *TBB,
2902 MachineBasicBlock *FBB,
2903 ArrayRef<MachineOperand> Cond,
2904 const DebugLoc &DL,
2905 int *BytesAdded) const {
2906 if (!FBB && Cond.empty()) {
2907 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2908 .addMBB(TBB);
2909 if (BytesAdded)
2910 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
2911 return 1;
2912 }
2913
2914 if(Cond.size() == 1 && Cond[0].isReg()) {
2915 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
2916 .add(Cond[0])
2917 .addMBB(TBB);
2918 return 1;
2919 }
2920
2921 assert(TBB && Cond[0].isImm());
2922
2923 unsigned Opcode
2924 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
2925
2926 if (!FBB) {
2927 MachineInstr *CondBr =
2928 BuildMI(&MBB, DL, get(Opcode))
2929 .addMBB(TBB);
2930
2931 // Copy the flags onto the implicit condition register operand.
2932 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
2933 fixImplicitOperands(*CondBr);
2934
2935 if (BytesAdded)
2936 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
2937 return 1;
2938 }
2939
2940 assert(TBB && FBB);
2941
2942 MachineInstr *CondBr =
2943 BuildMI(&MBB, DL, get(Opcode))
2944 .addMBB(TBB);
2945 fixImplicitOperands(*CondBr);
2946 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2947 .addMBB(FBB);
2948
2949 MachineOperand &CondReg = CondBr->getOperand(1);
2950 CondReg.setIsUndef(Cond[1].isUndef());
2951 CondReg.setIsKill(Cond[1].isKill());
2952
2953 if (BytesAdded)
2954 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
2955
2956 return 2;
2957}
2958
2959bool SIInstrInfo::reverseBranchCondition(
2960 SmallVectorImpl<MachineOperand> &Cond) const {
2961 if (Cond.size() != 2) {
2962 return true;
2963 }
2964
2965 if (Cond[0].isImm()) {
2966 Cond[0].setImm(-Cond[0].getImm());
2967 return false;
2968 }
2969
2970 return true;
2971}
2972
2973bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
2974 ArrayRef<MachineOperand> Cond,
2975 Register DstReg, Register TrueReg,
2976 Register FalseReg, int &CondCycles,
2977 int &TrueCycles, int &FalseCycles) const {
2978 switch (Cond[0].getImm()) {
2979 case VCCNZ:
2980 case VCCZ: {
2981 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2982 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2983 if (MRI.getRegClass(FalseReg) != RC)
2984 return false;
2985
2986 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
2987 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2988
2989 // Limit to equal cost for branch vs. N v_cndmask_b32s.
2990 return RI.hasVGPRs(RC) && NumInsts <= 6;
2991 }
2992 case SCC_TRUE:
2993 case SCC_FALSE: {
2994 // FIXME: We could insert for VGPRs if we could replace the original compare
2995 // with a vector one.
2996 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2997 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2998 if (MRI.getRegClass(FalseReg) != RC)
2999 return false;
3000
3001 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3002
3003 // Multiples of 8 can do s_cselect_b64
3004 if (NumInsts % 2 == 0)
3005 NumInsts /= 2;
3006
3007 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3008 return RI.isSGPRClass(RC);
3009 }
3010 default:
3011 return false;
3012 }
3013}
3014
3015void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3016 MachineBasicBlock::iterator I, const DebugLoc &DL,
3017 Register DstReg, ArrayRef<MachineOperand> Cond,
3018 Register TrueReg, Register FalseReg) const {
3019 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3020 if (Pred == VCCZ || Pred == SCC_FALSE) {
3021 Pred = static_cast<BranchPredicate>(-Pred);
3022 std::swap(TrueReg, FalseReg);
3023 }
3024
3025 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3026 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3027 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3028
3029 if (DstSize == 32) {
3030 MachineInstr *Select;
3031 if (Pred == SCC_TRUE) {
3032 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3033 .addReg(TrueReg)
3034 .addReg(FalseReg);
3035 } else {
3036 // Instruction's operands are backwards from what is expected.
3037 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3038 .addReg(FalseReg)
3039 .addReg(TrueReg);
3040 }
3041
3042 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3043 return;
3044 }
3045
3046 if (DstSize == 64 && Pred == SCC_TRUE) {
3047 MachineInstr *Select =
3048 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3049 .addReg(TrueReg)
3050 .addReg(FalseReg);
3051
3052 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3053 return;
3054 }
3055
3056 static const int16_t Sub0_15[] = {
3057 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3058 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3059 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3060 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3061 };
3062
3063 static const int16_t Sub0_15_64[] = {
3064 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3065 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3066 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3067 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3068 };
3069
3070 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3071 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3072 const int16_t *SubIndices = Sub0_15;
3073 int NElts = DstSize / 32;
3074
3075 // 64-bit select is only available for SALU.
3076 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3077 if (Pred == SCC_TRUE) {
3078 if (NElts % 2) {
3079 SelOp = AMDGPU::S_CSELECT_B32;
3080 EltRC = &AMDGPU::SGPR_32RegClass;
3081 } else {
3082 SelOp = AMDGPU::S_CSELECT_B64;
3083 EltRC = &AMDGPU::SGPR_64RegClass;
3084 SubIndices = Sub0_15_64;
3085 NElts /= 2;
3086 }
3087 }
3088
3089 MachineInstrBuilder MIB = BuildMI(
3090 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3091
3092 I = MIB->getIterator();
3093
3094 SmallVector<Register, 8> Regs;
3095 for (int Idx = 0; Idx != NElts; ++Idx) {
3096 Register DstElt = MRI.createVirtualRegister(EltRC);
3097 Regs.push_back(DstElt);
3098
3099 unsigned SubIdx = SubIndices[Idx];
3100
3101 MachineInstr *Select;
3102 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3103 Select =
3104 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3105 .addReg(FalseReg, 0, SubIdx)
3106 .addReg(TrueReg, 0, SubIdx);
3107 } else {
3108 Select =
3109 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3110 .addReg(TrueReg, 0, SubIdx)
3111 .addReg(FalseReg, 0, SubIdx);
3112 }
3113
3114 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3115 fixImplicitOperands(*Select);
3116
3117 MIB.addReg(DstElt)
3118 .addImm(SubIdx);
3119 }
3120}
3121
3122bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3123 switch (MI.getOpcode()) {
3124 case AMDGPU::V_MOV_B32_e32:
3125 case AMDGPU::V_MOV_B32_e64:
3126 case AMDGPU::V_MOV_B64_PSEUDO:
3127 case AMDGPU::V_MOV_B64_e32:
3128 case AMDGPU::V_MOV_B64_e64:
3129 case AMDGPU::S_MOV_B32:
3130 case AMDGPU::S_MOV_B64:
3131 case AMDGPU::COPY:
3132 case AMDGPU::WWM_COPY:
3133 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3134 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3135 case AMDGPU::V_ACCVGPR_MOV_B32:
3136 return true;
3137 default:
3138 return false;
3139 }
3140}
3141
3142static constexpr unsigned ModifierOpNames[] = {
3143 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3144 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3145 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3146
3147void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3148 unsigned Opc = MI.getOpcode();
3149 for (unsigned Name : reverse(ModifierOpNames)) {
3150 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3151 if (Idx >= 0)
3152 MI.removeOperand(Idx);
3153 }
3154}
3155
3156bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3157 Register Reg, MachineRegisterInfo *MRI) const {
3158 if (!MRI->hasOneNonDBGUse(Reg))
3159 return false;
3160
3161 switch (DefMI.getOpcode()) {
3162 default:
3163 return false;
3164 case AMDGPU::S_MOV_B64:
3165 // TODO: We could fold 64-bit immediates, but this gets complicated
3166 // when there are sub-registers.
3167 return false;
3168
3169 case AMDGPU::V_MOV_B32_e32:
3170 case AMDGPU::S_MOV_B32:
3171 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3172 break;
3173 }
3174
3175 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3176 assert(ImmOp);
3177 // FIXME: We could handle FrameIndex values here.
3178 if (!ImmOp->isImm())
3179 return false;
3180
3181 unsigned Opc = UseMI.getOpcode();
3182 if (Opc == AMDGPU::COPY) {
3183 Register DstReg = UseMI.getOperand(0).getReg();
3184 bool Is16Bit = getOpSize(UseMI, 0) == 2;
3185 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3186 unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
3187 APInt Imm(32, ImmOp->getImm());
3188
3189 if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
3190 Imm = Imm.ashr(16);
3191
3192 if (RI.isAGPR(*MRI, DstReg)) {
3193 if (!isInlineConstant(Imm))
3194 return false;
3195 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3196 }
3197
3198 if (Is16Bit) {
3199 if (isVGPRCopy)
3200 return false; // Do not clobber vgpr_hi16
3201
3202 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3203 return false;
3204
3205 UseMI.getOperand(0).setSubReg(0);
3206 if (DstReg.isPhysical()) {
3207 DstReg = RI.get32BitRegister(DstReg);
3208 UseMI.getOperand(0).setReg(DstReg);
3209 }
3210 assert(UseMI.getOperand(1).getReg().isVirtual());
3211 }
3212
3213 const MCInstrDesc &NewMCID = get(NewOpc);
3214 if (DstReg.isPhysical() &&
3215 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3216 return false;
3217
3218 UseMI.setDesc(NewMCID);
3219 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3220 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3221 return true;
3222 }
3223
3224 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3225 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3226 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3227 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3228 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3229 // Don't fold if we are using source or output modifiers. The new VOP2
3230 // instructions don't have them.
3231 if (hasAnyModifiersSet(UseMI))
3232 return false;
3233
3234 // If this is a free constant, there's no reason to do this.
3235 // TODO: We could fold this here instead of letting SIFoldOperands do it
3236 // later.
3237 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3238
3239 // Any src operand can be used for the legality check.
3240 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3241 return false;
3242
3243 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3244 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3245 bool IsFMA =
3246 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3247 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3248 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3249 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3250 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3251
3252 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3253 // We should only expect these to be on src0 due to canonicalization.
3254 if (Src0->isReg() && Src0->getReg() == Reg) {
3255 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
3256 return false;
3257
3258 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3259 return false;
3260
3261 unsigned NewOpc =
3262 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3263 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3264 : AMDGPU::V_FMAMK_F16)
3265 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3266 if (pseudoToMCOpcode(NewOpc) == -1)
3267 return false;
3268
3269 // We need to swap operands 0 and 1 since madmk constant is at operand 1.
3270
3271 const int64_t Imm = ImmOp->getImm();
3272
3273 // FIXME: This would be a lot easier if we could return a new instruction
3274 // instead of having to modify in place.
3275
3276 Register Src1Reg = Src1->getReg();
3277 unsigned Src1SubReg = Src1->getSubReg();
3278 Src0->setReg(Src1Reg);
3279 Src0->setSubReg(Src1SubReg);
3280 Src0->setIsKill(Src1->isKill());
3281
3282 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3283 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3284 Opc == AMDGPU::V_FMAC_F16_e64)
3285 UseMI.untieRegOperand(
3286 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3287
3288 Src1->ChangeToImmediate(Imm);
3289
3290 removeModOperands(UseMI);
3291 UseMI.setDesc(get(NewOpc));
3292
3293 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3294 if (DeleteDef)
3295 DefMI.eraseFromParent();
3296
3297 return true;
3298 }
3299
3300 // Added part is the constant: Use v_madak_{f16, f32}.
3301 if (Src2->isReg() && Src2->getReg() == Reg) {
3302 // Not allowed to use constant bus for another operand.
3303 // We can however allow an inline immediate as src0.
3304 bool Src0Inlined = false;
3305 if (Src0->isReg()) {
3306 // Try to inline constant if possible.
3307 // If the def is a move-immediate and this is its only use, folding the
3308 // constant here saves a VGPR.
3309 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3310 if (Def && Def->isMoveImmediate() &&
3311 isInlineConstant(Def->getOperand(1)) &&
3312 MRI->hasOneUse(Src0->getReg())) {
3313 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3314 Src0Inlined = true;
3315 } else if ((Src0->getReg().isPhysical() &&
3316 (ST.getConstantBusLimit(Opc) <= 1 &&
3317 RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) ||
3318 (Src0->getReg().isVirtual() &&
3319 (ST.getConstantBusLimit(Opc) <= 1 &&
3320 RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
3321 return false;
3322 // VGPR is okay as Src0 - fallthrough
3323 }
3324
3325 if (Src1->isReg() && !Src0Inlined ) {
3326 // We have one slot for an inlinable constant so far - try to fill it
3327 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3328 if (Def && Def->isMoveImmediate() &&
3329 isInlineConstant(Def->getOperand(1)) &&
3330 MRI->hasOneUse(Src1->getReg()) &&
3331 commuteInstruction(UseMI)) {
3332 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3333 } else if ((Src1->getReg().isPhysical() &&
3334 RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
3335 (Src1->getReg().isVirtual() &&
3336 RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
3337 return false;
3338 // VGPR is okay as Src1 - fallthrough
3339 }
3340
3341 unsigned NewOpc =
3342 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3343 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3344 : AMDGPU::V_FMAAK_F16)
3345 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3346 if (pseudoToMCOpcode(NewOpc) == -1)
3347 return false;
3348
3349 const int64_t Imm = ImmOp->getImm();
3350
3351 // FIXME: This would be a lot easier if we could return a new instruction
3352 // instead of having to modify in place.
3353
3354 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3355 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3356 Opc == AMDGPU::V_FMAC_F16_e64)
3357 UseMI.untieRegOperand(
3358 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3359
3360 // ChangingToImmediate adds Src2 back to the instruction.
3361 Src2->ChangeToImmediate(Imm);
3362
3363 // These come before src2.
3364 removeModOperands(UseMI);
3365 UseMI.setDesc(get(NewOpc));
3366 // UseMI may have been commuted, leaving an SGPR as src1. In that case an
3367 // inline constant together with an SGPR would violate the constant bus
3368 // restriction, so legalize the operands.
3369 legalizeOperands(UseMI);
3370
3371 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3372 if (DeleteDef)
3373 DefMI.eraseFromParent();
3374
3375 return true;
3376 }
3377 }
3378
3379 return false;
3380}
3381
3382static bool
3383memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3384 ArrayRef<const MachineOperand *> BaseOps2) {
3385 if (BaseOps1.size() != BaseOps2.size())
3386 return false;
3387 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3388 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3389 return false;
3390 }
3391 return true;
3392}
3393
3394static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
3395 int WidthB, int OffsetB) {
3396 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3397 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3398 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3399 return LowOffset + LowWidth <= HighOffset;
3400}
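// Editor's sketch (not part of SIInstrInfo.cpp; the function name below is
// hypothetical): a worked example of the half-open interval check above.
// Accesses covering [8, 12) and [12, 16) are disjoint, while [8, 16) and
// [12, 16) overlap.
static void exampleOffsetsDoNotOverlap() {
  assert(offsetsDoNotOverlap(/*WidthA=*/4, /*OffsetA=*/8,
                             /*WidthB=*/4, /*OffsetB=*/12)); // disjoint
  assert(!offsetsDoNotOverlap(/*WidthA=*/8, /*OffsetA=*/8,
                              /*WidthB=*/4, /*OffsetB=*/12)); // overlapping
}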
3401
3402bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3403 const MachineInstr &MIb) const {
3404 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3405 int64_t Offset0, Offset1;
3406 unsigned Dummy0, Dummy1;
3407 bool Offset0IsScalable, Offset1IsScalable;
3408 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3409 Dummy0, &RI) ||
3410 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3411 Dummy1, &RI))
3412 return false;
3413
3414 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3415 return false;
3416
3417 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3418 // FIXME: Handle ds_read2 / ds_write2.
3419 return false;
3420 }
3421 unsigned Width0 = MIa.memoperands().front()->getSize();
3422 unsigned Width1 = MIb.memoperands().front()->getSize();
3423 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3424}
3425
3426bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3427 const MachineInstr &MIb) const {
3428 assert(MIa.mayLoadOrStore() &&
3429 "MIa must load from or modify a memory location");
3430 assert(MIb.mayLoadOrStore() &&
3431 "MIb must load from or modify a memory location");
3432
3433 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3434 return false;
3435
3436 // XXX - Can we relax this between address spaces?
3437 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3438 return false;
3439
3440 // TODO: Should we check the address space from the MachineMemOperand? That
3441 // would allow us to distinguish objects we know don't alias based on the
3442 // underlying address space, even if it was lowered to a different one,
3443 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3444 // buffer.
3445 if (isDS(MIa)) {
3446 if (isDS(MIb))
3447 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3448
3449 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3450 }
3451
3452 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3453 if (isMUBUF(MIb) || isMTBUF(MIb))
3454 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3455
3456 if (isFLAT(MIb))
3457 return isFLATScratch(MIb);
3458
3459 return !isSMRD(MIb);
3460 }
3461
3462 if (isSMRD(MIa)) {
3463 if (isSMRD(MIb))
3464 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3465
3466 if (isFLAT(MIb))
3467 return isFLATScratch(MIb);
3468
3469 return !isMUBUF(MIb) && !isMTBUF(MIb);
3470 }
3471
3472 if (isFLAT(MIa)) {
3473 if (isFLAT(MIb)) {
3474 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3475 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3476 return true;
3477
3478 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3479 }
3480
3481 return false;
3482 }
3483
3484 return false;
3485}
3486
3487static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3488 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3489 if (Reg.isPhysical())
3490 return false;
3491 auto *Def = MRI.getUniqueVRegDef(Reg);
3492 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3493 Imm = Def->getOperand(1).getImm();
3494 if (DefMI)
3495 *DefMI = Def;
3496 return true;
3497 }
3498 return false;
3499}
3500
3501static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3502 MachineInstr **DefMI = nullptr) {
3503 if (!MO->isReg())
3504 return false;
3505 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3506 const MachineRegisterInfo &MRI = MF->getRegInfo();
3507 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3508}
3509
3510static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3511 MachineInstr &NewMI) {
3512 if (LV) {
3513 unsigned NumOps = MI.getNumOperands();
3514 for (unsigned I = 1; I < NumOps; ++I) {
3515 MachineOperand &Op = MI.getOperand(I);
3516 if (Op.isReg() && Op.isKill())
3517 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3518 }
3519 }
3520}
3521
3522MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3523 LiveVariables *LV,
3524 LiveIntervals *LIS) const {
3525 MachineBasicBlock &MBB = *MI.getParent();
3526 unsigned Opc = MI.getOpcode();
3527
3528 // Handle MFMA.
3529 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3530 if (NewMFMAOpc != -1) {
3531 MachineInstrBuilder MIB =
3532 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3533 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3534 MIB.add(MI.getOperand(I));
3535 updateLiveVariables(LV, MI, *MIB);
3536 if (LIS)
3537 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3538 return MIB;
3539 }
3540
3541 if (SIInstrInfo::isWMMA(MI)) {
3542 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3543 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3544 .setMIFlags(MI.getFlags());
3545 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3546 MIB->addOperand(MI.getOperand(I));
3547
3548 updateLiveVariables(LV, MI, *MIB);
3549 if (LIS)
3550 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3551
3552 return MIB;
3553 }
3554
3555 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3556 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3557 "pre-RA");
3558
3559 // Handle MAC/FMAC.
3560 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3561 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3562 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3563 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3564 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3565 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3566 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3567 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3568 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3569 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3570 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3571 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3572 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3573 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3574 bool Src0Literal = false;
3575
3576 switch (Opc) {
3577 default:
3578 return nullptr;
3579 case AMDGPU::V_MAC_F16_e64:
3580 case AMDGPU::V_FMAC_F16_e64:
3581 case AMDGPU::V_FMAC_F16_t16_e64:
3582 case AMDGPU::V_MAC_F32_e64:
3583 case AMDGPU::V_MAC_LEGACY_F32_e64:
3584 case AMDGPU::V_FMAC_F32_e64:
3585 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3586 case AMDGPU::V_FMAC_F64_e64:
3587 break;
3588 case AMDGPU::V_MAC_F16_e32:
3589 case AMDGPU::V_FMAC_F16_e32:
3590 case AMDGPU::V_MAC_F32_e32:
3591 case AMDGPU::V_MAC_LEGACY_F32_e32:
3592 case AMDGPU::V_FMAC_F32_e32:
3593 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3594 case AMDGPU::V_FMAC_F64_e32: {
3595 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3596 AMDGPU::OpName::src0);
3597 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3598 if (!Src0->isReg() && !Src0->isImm())
3599 return nullptr;
3600
3601 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3602 Src0Literal = true;
3603
3604 break;
3605 }
3606 }
3607
3608 MachineInstrBuilder MIB;
3609 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3610 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3611 const MachineOperand *Src0Mods =
3612 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3613 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3614 const MachineOperand *Src1Mods =
3615 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3616 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3617 const MachineOperand *Src2Mods =
3618 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3619 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3620 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3621 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3622
3623 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3624 !IsLegacy &&
3625 // If we have an SGPR input, we will violate the constant bus restriction.
3626 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3627 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3628 MachineInstr *DefMI;
3629 const auto killDef = [&]() -> void {
3630 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3631 // The only user is the instruction which will be killed.
3632 Register DefReg = DefMI->getOperand(0).getReg();
3633 if (!MRI.hasOneNonDBGUse(DefReg))
3634 return;
3635 // We cannot just remove the DefMI here; the calling pass will crash.
3636 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3637 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3638 DefMI->removeOperand(I);
3639 if (LV)
3640 LV->getVarInfo(DefReg).AliveBlocks.clear();
3641 };
3642
3643 int64_t Imm;
3644 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3645 unsigned NewOpc =
3646 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3647 : AMDGPU::V_FMAAK_F16)
3648 : AMDGPU::V_FMAAK_F32)
3649 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3650 if (pseudoToMCOpcode(NewOpc) != -1) {
3651 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3652 .add(*Dst)
3653 .add(*Src0)
3654 .add(*Src1)
3655 .addImm(Imm);
3656 updateLiveVariables(LV, MI, *MIB);
3657 if (LIS)
3658 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3659 killDef();
3660 return MIB;
3661 }
3662 }
3663 unsigned NewOpc =
3664 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3665 : AMDGPU::V_FMAMK_F16)
3666 : AMDGPU::V_FMAMK_F32)
3667 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3668 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3669 if (pseudoToMCOpcode(NewOpc) != -1) {
3670 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3671 .add(*Dst)
3672 .add(*Src0)
3673 .addImm(Imm)
3674 .add(*Src2);
3675 updateLiveVariables(LV, MI, *MIB);
3676 if (LIS)
3677 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3678 killDef();
3679 return MIB;
3680 }
3681 }
3682 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
3683 if (Src0Literal) {
3684 Imm = Src0->getImm();
3685 DefMI = nullptr;
3686 }
3687 if (pseudoToMCOpcode(NewOpc) != -1 &&
3688 isOperandLegal(
3689 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
3690 Src1)) {
3691 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3692 .add(*Dst)
3693 .add(*Src1)
3694 .addImm(Imm)
3695 .add(*Src2);
3696 updateLiveVariables(LV, MI, *MIB);
3697 if (LIS)
3698 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3699 if (DefMI)
3700 killDef();
3701 return MIB;
3702 }
3703 }
3704 }
3705
3706 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
3707 // if VOP3 does not allow a literal operand.
3708 if (Src0Literal && !ST.hasVOP3Literal())
3709 return nullptr;
3710
3711 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
3712 : IsF64 ? AMDGPU::V_FMA_F64_e64
3713 : IsLegacy
3714 ? AMDGPU::V_FMA_LEGACY_F32_e64
3715 : AMDGPU::V_FMA_F32_e64
3716 : IsF16 ? AMDGPU::V_MAD_F16_e64
3717 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
3718 : AMDGPU::V_MAD_F32_e64;
3719 if (pseudoToMCOpcode(NewOpc) == -1)
3720 return nullptr;
3721
3722 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3723 .add(*Dst)
3724 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
3725 .add(*Src0)
3726 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
3727 .add(*Src1)
3728 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
3729 .add(*Src2)
3730 .addImm(Clamp ? Clamp->getImm() : 0)
3731 .addImm(Omod ? Omod->getImm() : 0);
3732 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
3733 MIB.addImm(OpSel ? OpSel->getImm() : 0);
3734 updateLiveVariables(LV, MI, *MIB);
3735 if (LIS)
3736 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3737 return MIB;
3738}
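// Rough example of the two- to three-address conversion above (illustrative
// operand lists, not verified MIR):
//   %d = V_MAC_F32_e32 %a, %b, %d            ; src2 tied to the dst
// becomes
//   %d2 = V_MAD_F32_e64 0, %a, 0, %b, 0, %d, 0, 0
// and, when src2 is defined by a foldable immediate K, the shorter
//   %d2 = V_MADAK_F32 %a, %b, K
// form is preferred whenever its MC opcode exists for the subtarget.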
3739
3740// It's not generally safe to move VALU instructions across these since it will
3741// start using the register as a base index rather than directly.
3742// XXX - Why isn't hasSideEffects sufficient for these?
3743static bool changesVGPRIndexingMode(const MachineInstr &MI) {
3744 switch (MI.getOpcode()) {
3745 case AMDGPU::S_SET_GPR_IDX_ON:
3746 case AMDGPU::S_SET_GPR_IDX_MODE:
3747 case AMDGPU::S_SET_GPR_IDX_OFF:
3748 return true;
3749 default:
3750 return false;
3751 }
3752}
3753
3754bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
3755 const MachineBasicBlock *MBB,
3756 const MachineFunction &MF) const {
3757 // Skipping the check for SP writes in the base implementation. The reason it
3758 // was added was apparently due to compile time concerns.
3759 //
3760 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
3761 // but is probably avoidable.
3762
3763 // Copied from base implementation.
3764 // Terminators and labels can't be scheduled around.
3765 if (MI.isTerminator() || MI.isPosition())
3766 return true;
3767
3768 // INLINEASM_BR can jump to another block
3769 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
3770 return true;
3771
3772 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
3773 return true;
3774
3775 // Target-independent instructions do not have an implicit-use of EXEC, even
3776 // when they operate on VGPRs. Treating EXEC modifications as scheduling
3777 // boundaries prevents incorrect movements of such instructions.
3778 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
3779 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
3780 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
3781 MI.getOpcode() == AMDGPU::S_SETPRIO ||
3782 changesVGPRIndexingMode(MI);
3783}
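// For example (illustrative): an instruction that writes EXEC (say
// S_OR_B64 $exec, ...), an S_SETREG_B32/S_SETREG_IMM32_B32, an S_SETPRIO, or a
// SCHED_BARRIER with a zero mask is treated as a boundary here, while an
// ordinary VALU add is not.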
3784
3785bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
3786 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
3787}
3788
3789bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
3790 // Skip the full operand and register alias search that modifiesRegister
3791 // does. There's only a handful of instructions that touch this; it's only an
3792 // implicit def, and doesn't alias any other registers.
3793 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
3794}
3795
3796bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
3797 unsigned Opcode = MI.getOpcode();
3798
3799 if (MI.mayStore() && isSMRD(MI))
3800 return true; // scalar store or atomic
3801
3802 // This will terminate the function when other lanes may need to continue.
3803 if (MI.isReturn())
3804 return true;
3805
3806 // These instructions cause shader I/O that may cause hardware lockups
3807 // when executed with an empty EXEC mask.
3808 //
3809 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
3810 // EXEC = 0, but checking for that case here seems not worth it
3811 // given the typical code patterns.
3812 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
3813 isEXP(Opcode) ||
3814 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
3815 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
3816 return true;
3817
3818 if (MI.isCall() || MI.isInlineAsm())
3819 return true; // conservative assumption
3820
3821 // A mode change is a scalar operation that influences vector instructions.
3822 if (modifiesModeRegister(MI))
3823 return true;
3824
3825 // These are like SALU instructions in terms of effects, so it's questionable
3826 // whether we should return true for those.
3827 //
3828 // However, executing them with EXEC = 0 causes them to operate on undefined
3829 // data, which we avoid by returning true here.
3830 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
3831 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32)
3832 return true;
3833
3834 return false;
3835}
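// Illustrative examples (not exhaustive): S_SENDMSG, an EXP, DS_GWS_INIT, a
// scalar (SMRD) store, or a V_READFIRSTLANE_B32 all report true here, while a
// plain VALU add does not, since it simply writes no lanes when EXEC = 0.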
3836
3837bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
3838 const MachineInstr &MI) const {
3839 if (MI.isMetaInstruction())
3840 return false;
3841
3842 // This won't read exec if this is an SGPR->SGPR copy.
3843 if (MI.isCopyLike()) {
3844 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
3845 return true;
3846
3847 // Make sure this isn't copying exec as a normal operand
3848 return MI.readsRegister(AMDGPU::EXEC, &RI);
3849 }
3850
3851 // Make a conservative assumption about the callee.
3852 if (MI.isCall())
3853 return true;
3854
3855 // Be conservative with any unhandled generic opcodes.
3856 if (!isTargetSpecificOpcode(MI.getOpcode()))
3857 return true;
3858
3859 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
3860}
3861
3862bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
3863 switch (Imm.getBitWidth()) {
3864 case 1: // This likely will be a condition code mask.
3865 return true;
3866
3867 case 32:
3868 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
3869 ST.hasInv2PiInlineImm());
3870 case 64:
3871 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
3872 ST.hasInv2PiInlineImm());
3873 case 16:
3874 return ST.has16BitInsts() &&
3875 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
3876 ST.hasInv2PiInlineImm());
3877 default:
3878 llvm_unreachable("invalid bitwidth");
3879 }
3880}
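// Examples (illustrative): for a 32-bit value, the small integers in [-16, 64],
// +-0.5, +-1.0, +-2.0, +-4.0, 0.0 and, when ST.hasInv2PiInlineImm() holds,
// 0x3e22f983 (1/(2*pi)) are accepted as inline constants, whereas an arbitrary
// bit pattern such as 0x3f800001 is not.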
3881
3882bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
3883 uint8_t OperandType) const {
3884 assert(!MO.isReg() && "isInlineConstant called on register operand!");
3885 if (!MO.isImm())
3886 return false;
3887
3888 // MachineOperand provides no way to tell the true operand size, since it only
3889 // records a 64-bit value. We need to know the size to determine if a 32-bit
3890 // floating point immediate bit pattern is legal for an integer immediate. It
3891 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
3892
3893 int64_t Imm = MO.getImm();
3894 switch (OperandType) {
3906 int32_t Trunc = static_cast<int32_t>(Imm);
3907 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
3908 }
3914 return AMDGPU::isInlinableLiteral64(MO.getImm(),
3915 ST.hasInv2PiInlineImm());
3919 // We would expect inline immediates to not be concerned with an integer/fp
3920 // distinction. However, in the case of 16-bit integer operations, the
3921 // "floating point" values appear to not work. It seems read the low 16-bits
3922 // of 32-bit immediates, which happens to always work for the integer
3923 // values.
3924 //
3925 // See llvm bugzilla 46302.
3926 //
3927 // TODO: Theoretically we could use op-sel to use the high bits of the
3928 // 32-bit FP values.
3933 // This suffers the same problem as the scalar 16-bit cases.
3939 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
3940 // A few special case instructions have 16-bit operands on subtargets
3941 // where 16-bit instructions are not legal.
3942 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
3943 // constants in these cases
3944 int16_t Trunc = static_cast<int16_t>(Imm);
3945 return ST.has16BitInsts() &&
3946 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
3947 }
3948
3949 return false;
3950 }
3954 uint32_t Trunc = static_cast<uint32_t>(Imm);
3956 }
3959 return false;
3962 // Always embedded in the instruction for free.
3963 return true;
3973 // Just ignore anything else.
3974 return true;
3975 default:
3976 llvm_unreachable("invalid operand type");
3977 }
3978}
3979
3980static bool compareMachineOp(const MachineOperand &Op0,
3981 const MachineOperand &Op1) {
3982 if (Op0.getType() != Op1.getType())
3983 return false;
3984
3985 switch (Op0.getType()) {
3986 case MachineOperand::MO_Register:
3987 return Op0.getReg() == Op1.getReg();
3988 case MachineOperand::MO_Immediate:
3989 return Op0.getImm() == Op1.getImm();
3990 default:
3991 llvm_unreachable("Didn't expect to be comparing these operand types");
3992 }
3993}
3994
3995bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
3996 const MachineOperand &MO) const {
3997 const MCInstrDesc &InstDesc = MI.getDesc();
3998 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
3999
4000 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4001
4002 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4003 return true;
4004
4005 if (OpInfo.RegClass < 0)
4006 return false;
4007
4008 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4009 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4010 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4011 AMDGPU::OpName::src2))
4012 return false;
4013 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4014 }
4015
4016 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4017 return false;
4018
4019 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4020 return true;
4021
4022 return ST.hasVOP3Literal();
4023}
4024
4025bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4026 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4027 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4028 return false;
4029
4030 int Op32 = AMDGPU::getVOPe32(Opcode);
4031 if (Op32 == -1)
4032 return false;
4033
4034 return pseudoToMCOpcode(Op32) != -1;
4035}
4036
4037bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4038 // The src0_modifier operand is present on all instructions
4039 // that have modifiers.
4040
4041 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4042}
4043
4044bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4045 unsigned OpName) const {
4046 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4047 return Mods && Mods->getImm();
4048}
4049
4050bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4051 return any_of(ModifierOpNames,
4052 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4053}
4054
4055bool SIInstrInfo::canShrink(const MachineInstr &MI,
4056 const MachineRegisterInfo &MRI) const {
4057 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4058 // Can't shrink instruction with three operands.
4059 if (Src2) {
4060 switch (MI.getOpcode()) {
4061 default: return false;
4062
4063 case AMDGPU::V_ADDC_U32_e64:
4064 case AMDGPU::V_SUBB_U32_e64:
4065 case AMDGPU::V_SUBBREV_U32_e64: {
4066 const MachineOperand *Src1
4067 = getNamedOperand(MI, AMDGPU::OpName::src1);
4068 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4069 return false;
4070 // Additional verification is needed for sdst/src2.
4071 return true;
4072 }
4073 case AMDGPU::V_MAC_F16_e64:
4074 case AMDGPU::V_MAC_F32_e64:
4075 case AMDGPU::V_MAC_LEGACY_F32_e64:
4076 case AMDGPU::V_FMAC_F16_e64:
4077 case AMDGPU::V_FMAC_F16_t16_e64:
4078 case AMDGPU::V_FMAC_F32_e64:
4079 case AMDGPU::V_FMAC_F64_e64:
4080 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4081 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4082 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4083 return false;
4084 break;
4085
4086 case AMDGPU::V_CNDMASK_B32_e64:
4087 break;
4088 }
4089 }
4090
4091 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4092 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4093 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4094 return false;
4095
4096 // We don't need to check src0; all input types are legal, so just make sure
4097 // src0 isn't using any modifiers.
4098 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4099 return false;
4100
4101 // Can it be shrunk to a valid 32 bit opcode?
4102 if (!hasVALU32BitEncoding(MI.getOpcode()))
4103 return false;
4104
4105 // Check output modifiers
4106 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4107 !hasModifiersSet(MI, AMDGPU::OpName::clamp);
4108}
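// Illustrative example of a shrinkable candidate (operand lists approximate):
// a VOP3-encoded
//   %d = V_ADD_F32_e64 0, %a, 0, %b, 0, 0    ; no modifiers, clamp, or omod
// with a VGPR src1 passes canShrink() and can later be re-emitted in the
// 32-bit form %d = V_ADD_F32_e32 %a, %b via buildShrunkInst() below.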
4109
4110// Set VCC operand with all flags from \p Orig, except for setting it as
4111// implicit.
4112static void copyFlagsToImplicitVCC(MachineInstr &MI,
4113 const MachineOperand &Orig) {
4114
4115 for (MachineOperand &Use : MI.implicit_operands()) {
4116 if (Use.isUse() &&
4117 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4118 Use.setIsUndef(Orig.isUndef());
4119 Use.setIsKill(Orig.isKill());
4120 return;
4121 }
4122 }
4123}
4124
4125MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4126 unsigned Op32) const {
4127 MachineBasicBlock *MBB = MI.getParent();
4128 MachineInstrBuilder Inst32 =
4129 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
4130 .setMIFlags(MI.getFlags());
4131
4132 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4133 // For VOPC instructions, this is replaced by an implicit def of vcc.
4134 if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::vdst)) {
4135 // dst
4136 Inst32.add(MI.getOperand(0));
4137 } else if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::sdst)) {
4138 // VOPCX instructions won't be writing to an explicit dst, so this should
4139 // not fail for these instructions.
4140 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
4141 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
4142 "Unexpected case");
4143 }
4144
4145 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
4146
4147 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4148 if (Src1)
4149 Inst32.add(*Src1);
4150
4151 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4152
4153 if (Src2) {
4154 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
4155 if (Op32Src2Idx != -1) {
4156 Inst32.add(*Src2);
4157 } else {
4158 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4159 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4160 // of vcc was already added during the initial BuildMI, but we
4161 // 1) may need to change vcc to vcc_lo to preserve the original register
4162 // 2) have to preserve the original flags.
4163 fixImplicitOperands(*Inst32);
4164 copyFlagsToImplicitVCC(*Inst32, *Src2);
4165 }
4166 }
4167
4168 return Inst32;
4169}
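// Rough example (illustrative operand lists): shrinking
//   %d = V_CNDMASK_B32_e64 0, %a, 0, %b, %cc
// to
//   %d = V_CNDMASK_B32_e32 %a, %b, implicit $vcc
// drops the explicit src2; fixImplicitOperands/copyFlagsToImplicitVCC above
// move the carry-in to the implicit vcc (or vcc_lo in wave32) read instead.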
4170
4171bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4172 const MachineOperand &MO,
4173 const MCOperandInfo &OpInfo) const {
4174 // Literal constants use the constant bus.
4175 if (!MO.isReg())
4176 return !isInlineConstant(MO, OpInfo);
4177
4178 if (!MO.isUse())
4179 return false;
4180
4181 if (MO.getReg().isVirtual())
4182 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4183
4184 // Null is free
4185 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4186 return false;
4187
4188 // SGPRs use the constant bus
4189 if (MO.isImplicit()) {
4190 return MO.getReg() == AMDGPU::M0 ||
4191 MO.getReg() == AMDGPU::VCC ||
4192 MO.getReg() == AMDGPU::VCC_LO;
4193 } else {
4194 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4195 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4196 }
4197}
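// Example of how this is counted (illustrative): on subtargets where
// ST.getConstantBusLimit(Opc) == 1, a VOP2 such as
//   %d = V_ADD_F32_e32 %sgpr_a, %vgpr_b      ; one SGPR read -> legal
// is fine, while reading two distinct SGPRs, or an SGPR plus a literal,
// exceeds the limit and is rejected by the verifier further down.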
4198
4199Register SIInstrInfo::findImplicitSGPRRead(const MachineInstr &MI) {
4200 for (const MachineOperand &MO : MI.implicit_operands()) {
4201 // We only care about reads.
4202 if (MO.isDef())
4203 continue;
4204
4205 switch (MO.getReg()) {
4206 case AMDGPU::VCC:
4207 case AMDGPU::VCC_LO:
4208 case AMDGPU::VCC_HI:
4209 case AMDGPU::M0:
4210 case AMDGPU::FLAT_SCR:
4211 return MO.getReg();
4212
4213 default:
4214 break;
4215 }
4216 }
4217
4218 return Register();
4219}
4220
4221static bool shouldReadExec(const MachineInstr &MI) {
4222 if (SIInstrInfo::isVALU(MI)) {
4223 switch (MI.getOpcode()) {
4224 case AMDGPU::V_READLANE_B32:
4225 case AMDGPU::V_WRITELANE_B32:
4226 return false;
4227 }
4228
4229 return true;
4230 }
4231
4232 if (MI.isPreISelOpcode() ||
4233 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4234 SIInstrInfo::isSALU(MI) ||
4235 SIInstrInfo::isSMRD(MI))
4236 return false;
4237
4238 return true;
4239}
4240
4241static bool isSubRegOf(const SIRegisterInfo &TRI,
4242 const MachineOperand &SuperVec,
4243 const MachineOperand &SubReg) {
4244 if (SubReg.getReg().isPhysical())
4245 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4246
4247 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4248 SubReg.getReg() == SuperVec.getReg();
4249}
4250
4251bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4252 StringRef &ErrInfo) const {
4253 uint16_t Opcode = MI.getOpcode();
4254 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4255 return true;
4256
4257 const MachineFunction *MF = MI.getParent()->getParent();
4258 const MachineRegisterInfo &MRI = MF->getRegInfo();
4259
4260 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4261 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4262 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4263 int Src3Idx = -1;
4264 if (Src0Idx == -1) {
4265 // VOPD V_DUAL_* instructions use different operand names.
4266 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4267 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4268 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4269 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4270 }
4271
4272 // Make sure the number of operands is correct.
4273 const MCInstrDesc &Desc = get(Opcode);
4274 if (!Desc.isVariadic() &&
4275 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4276 ErrInfo = "Instruction has wrong number of operands.";
4277 return false;
4278 }
4279
4280 if (MI.isInlineAsm()) {
4281 // Verify register classes for inlineasm constraints.
4282 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4283 I != E; ++I) {
4284 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4285 if (!RC)
4286 continue;
4287
4288 const MachineOperand &Op = MI.getOperand(I);
4289 if (!Op.isReg())
4290 continue;
4291
4292 Register Reg = Op.getReg();
4293 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4294 ErrInfo = "inlineasm operand has incorrect register class.";
4295 return false;
4296 }
4297 }
4298
4299 return true;
4300 }
4301
4302 if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4303 ErrInfo = "missing memory operand from MIMG instruction.";
4304 return false;
4305 }
4306
4307 // Make sure the register classes are correct.
4308 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4309 const MachineOperand &MO = MI.getOperand(i);
4310 if (MO.isFPImm()) {
4311 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4312 "all fp values to integers.";
4313 return false;
4314 }
4315
4316 int RegClass = Desc.operands()[i].RegClass;
4317
4318 switch (Desc.operands()[i].OperandType) {
4319 case MCOI::OPERAND_REGISTER:
4320 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4321 ErrInfo = "Illegal immediate value for operand.";
4322 return false;
4323 }
4324 break;
4329 break;
4341 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4342 ErrInfo = "Illegal immediate value for operand.";
4343 return false;
4344 }
4345 break;
4346 }
4349 // Check if this operand is an immediate.
4350 // FrameIndex operands will be replaced by immediates, so they are
4351 // allowed.
4352 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4353 ErrInfo = "Expected immediate, but got non-immediate";
4354 return false;
4355 }
4356 [[fallthrough]];
4357 default:
4358 continue;
4359 }
4360
4361 if (!MO.isReg())
4362 continue;
4363 Register Reg = MO.getReg();
4364 if (!Reg)
4365 continue;
4366
4367 // FIXME: Ideally we would have separate instruction definitions with the
4368 // aligned register constraint.
4369 // FIXME: We do not verify inline asm operands, but custom inline asm
4370 // verification is broken anyway
4371 if (ST.needsAlignedVGPRs()) {
4372 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4373 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4374 const TargetRegisterClass *SubRC =
4375 RI.getSubRegisterClass(RC, MO.getSubReg());
4376 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4377 if (RC)
4378 RC = SubRC;
4379 }
4380
4381 // Check that this is the aligned version of the class.
4382 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4383 ErrInfo = "Subtarget requires even aligned vector registers";
4384 return false;
4385 }
4386 }
4387
4388 if (RegClass != -1) {
4389 if (Reg.isVirtual())
4390 continue;
4391
4392 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4393 if (!RC->contains(Reg)) {
4394 ErrInfo = "Operand has incorrect register class.";
4395 return false;
4396 }
4397 }
4398 }
4399
4400 // Verify SDWA
4401 if (isSDWA(MI)) {
4402 if (!ST.hasSDWA()) {
4403 ErrInfo = "SDWA is not supported on this target";
4404 return false;
4405 }
4406
4407 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4408
4409 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4410 if (OpIdx == -1)
4411 continue;
4412 const MachineOperand &MO = MI.getOperand(OpIdx);
4413
4414 if (!ST.hasSDWAScalar()) {
4415 // Only VGPRs on VI
4416 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4417 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4418 return false;
4419 }
4420 } else {
4421 // No immediates on GFX9
4422 if (!MO.isReg()) {
4423 ErrInfo =
4424 "Only reg allowed as operands in SDWA instructions on GFX9+";
4425 return false;
4426 }
4427 }
4428 }
4429
4430 if (!ST.hasSDWAOmod()) {
4431 // No omod allowed on VI
4432 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4433 if (OMod != nullptr &&
4434 (!OMod->isImm() || OMod->getImm() != 0)) {
4435 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4436 return false;
4437 }
4438 }
4439
4440 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4441 if (isVOPC(BasicOpcode)) {
4442 if (!ST.hasSDWASdst() && DstIdx != -1) {
4443 // Only vcc allowed as dst on VI for VOPC
4444 const MachineOperand &Dst = MI.getOperand(DstIdx);
4445 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4446 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4447 return false;
4448 }
4449 } else if (!ST.hasSDWAOutModsVOPC()) {
4450 // No clamp allowed on GFX9 for VOPC
4451 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4452 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4453 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4454 return false;
4455 }
4456
4457 // No omod allowed on GFX9 for VOPC
4458 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4459 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4460 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4461 return false;
4462 }
4463 }
4464 }
4465
4466 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4467 if (DstUnused && DstUnused->isImm() &&
4468 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4469 const MachineOperand &Dst = MI.getOperand(DstIdx);
4470 if (!Dst.isReg() || !Dst.isTied()) {
4471 ErrInfo = "Dst register should have tied register";
4472 return false;
4473 }
4474
4475 const MachineOperand &TiedMO =
4476 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4477 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4478 ErrInfo =
4479 "Dst register should be tied to implicit use of preserved register";
4480 return false;
4481 } else if (TiedMO.getReg().isPhysical() &&
4482 Dst.getReg() != TiedMO.getReg()) {
4483 ErrInfo = "Dst register should use same physical register as preserved";
4484 return false;
4485 }
4486 }
4487 }
4488
4489 // Verify MIMG
4490 if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
4491 // Ensure that the return type used is large enough for all the options
4492 // being used. TFE/LWE require an extra result register.
4493 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4494 if (DMask) {
4495 uint64_t DMaskImm = DMask->getImm();
4496 uint32_t RegCount =
4497 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4498 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4499 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4500 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4501
4502 // Adjust for packed 16-bit values
4503 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4504 RegCount = divideCeil(RegCount, 2);
4505
4506 // Adjust if using LWE or TFE
4507 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4508 RegCount += 1;
4509
4510 const uint32_t DstIdx =
4511 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4512 const MachineOperand &Dst = MI.getOperand(DstIdx);
4513 if (Dst.isReg()) {
4514 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4515 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4516 if (RegCount > DstSize) {
4517 ErrInfo = "Image instruction returns too many registers for dst "
4518 "register class";
4519 return false;
4520 }
4521 }
4522 }
4523 }
4524
4525 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4526 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4527 unsigned ConstantBusCount = 0;
4528 bool UsesLiteral = false;
4529 const MachineOperand *LiteralVal = nullptr;
4530
4531 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4532 if (ImmIdx != -1) {
4533 ++ConstantBusCount;
4534 UsesLiteral = true;
4535 LiteralVal = &MI.getOperand(ImmIdx);
4536 }
4537
4538 SmallVector<Register, 2> SGPRsUsed;
4539 Register SGPRUsed;
4540
4541 // Only look at the true operands. Only a real operand can use the constant
4542 // bus, and we don't want to check pseudo-operands like the source modifier
4543 // flags.
4544 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4545 if (OpIdx == -1)
4546 continue;
4547 const MachineOperand &MO = MI.getOperand(OpIdx);
4548 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4549 if (MO.isReg()) {
4550 SGPRUsed = MO.getReg();
4551 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4552 ++ConstantBusCount;
4553 SGPRsUsed.push_back(SGPRUsed);
4554 }
4555 } else {
4556 if (!UsesLiteral) {
4557 ++ConstantBusCount;
4558 UsesLiteral = true;
4559 LiteralVal = &MO;
4560 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4561 assert(isVOP2(MI) || isVOP3(MI));
4562 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4563 return false;
4564 }
4565 }
4566 }
4567 }
4568
4569 SGPRUsed = findImplicitSGPRRead(MI);
4570 if (SGPRUsed) {
4571 // Implicit uses may safely overlap true operands
4572 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4573 return !RI.regsOverlap(SGPRUsed, SGPR);
4574 })) {
4575 ++ConstantBusCount;
4576 SGPRsUsed.push_back(SGPRUsed);
4577 }
4578 }
4579
4580 // v_writelane_b32 is an exception to the constant bus restriction: vsrc0 can
4581 // be an SGPR, a constant, or m0, and the lane select an SGPR, m0, or an inline constant.
4582 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4583 Opcode != AMDGPU::V_WRITELANE_B32) {
4584 ErrInfo = "VOP* instruction violates constant bus restriction";
4585 return false;
4586 }
4587
4588 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4589 ErrInfo = "VOP3 instruction uses literal";
4590 return false;
4591 }
4592 }
4593
4594 // Special case for writelane - this can break the multiple constant bus rule,
4595 // but still can't use more than one SGPR register
4596 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4597 unsigned SGPRCount = 0;
4598 Register SGPRUsed;
4599
4600 for (int OpIdx : {Src0Idx, Src1Idx}) {
4601 if (OpIdx == -1)
4602 break;
4603
4604 const MachineOperand &MO = MI.getOperand(OpIdx);
4605
4606 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4607 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4608 if (MO.getReg() != SGPRUsed)
4609 ++SGPRCount;
4610 SGPRUsed = MO.getReg();
4611 }
4612 }
4613 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4614 ErrInfo = "WRITELANE instruction violates constant bus restriction";
4615 return false;
4616 }
4617 }
4618 }
4619
4620 // Verify misc. restrictions on specific instructions.
4621 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
4622 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
4623 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4624 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4625 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
4626 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
4627 if (!compareMachineOp(Src0, Src1) &&
4628 !compareMachineOp(Src0, Src2)) {
4629 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
4630 return false;
4631 }
4632 }
4633 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
4634 SISrcMods::ABS) ||
4635 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
4636 SISrcMods::ABS) ||
4637 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
4638 SISrcMods::ABS)) {
4639 ErrInfo = "ABS not allowed in VOP3B instructions";
4640 return false;
4641 }
4642 }
4643
4644 if (isSOP2(MI) || isSOPC(MI)) {
4645 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4646 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4647
4648 if (!Src0.isReg() && !Src1.isReg() &&
4649 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
4650 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
4651 !Src0.isIdenticalTo(Src1)) {
4652 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
4653 return