1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "GCNHazardRecognizer.h"
19#include "GCNSubtarget.h"
22#include "llvm/ADT/STLExtras.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/MC/MCContext.h"
38
39using namespace llvm;
40
41#define DEBUG_TYPE "si-instr-info"
42
43#define GET_INSTRINFO_CTOR_DTOR
44#include "AMDGPUGenInstrInfo.inc"
45
46namespace llvm::AMDGPU {
47#define GET_D16ImageDimIntrinsics_IMPL
48#define GET_ImageDimIntrinsicTable_IMPL
49#define GET_RsrcIntrinsics_IMPL
50#include "AMDGPUGenSearchableTables.inc"
51} // namespace llvm::AMDGPU
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
56static cl::opt<unsigned>
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
65
66SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
67 : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
68 AMDGPU::ADJCALLSTACKDOWN),
69 RI(ST), ST(ST) {
70 SchedModel.init(&ST);
71}
72
73//===----------------------------------------------------------------------===//
74// TargetInstrInfo callbacks
75//===----------------------------------------------------------------------===//
76
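/// Count the operands of \p Node, ignoring any trailing glue operands.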
77static unsigned getNumOperandsNoGlue(SDNode *Node) {
78 unsigned N = Node->getNumOperands();
79 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
80 --N;
81 return N;
82}
83
84/// Returns true if both nodes have the same value for the given
85/// operand \p Op, or if both nodes do not have this operand.
86static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1,
87 AMDGPU::OpName OpName) {
88 unsigned Opc0 = N0->getMachineOpcode();
89 unsigned Opc1 = N1->getMachineOpcode();
90
91 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
92 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
93
94 if (Op0Idx == -1 && Op1Idx == -1)
95 return true;
96
97
98 if ((Op0Idx == -1 && Op1Idx != -1) ||
99 (Op1Idx == -1 && Op0Idx != -1))
100 return false;
101
102 // getNamedOperandIdx returns the index for the MachineInstr's operands,
103 // which includes the result as the first operand. We are indexing into the
104 // MachineSDNode's operands, so we need to skip the result operand to get
105 // the real index.
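// For example, if the MachineInstr operands are (vdst, src0, src1), then
// getNamedOperandIdx(src0) == 1, but the MachineSDNode operand index is 0.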
106 --Op0Idx;
107 --Op1Idx;
108
109 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
110}
111
112static bool canRemat(const MachineInstr &MI) {
113
114 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
115 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
116 SIInstrInfo::isSALU(MI))
117 return true;
118
119 if (SIInstrInfo::isSMRD(MI)) {
120 return !MI.memoperands_empty() &&
121 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
122 return MMO->isLoad() && MMO->isInvariant();
123 });
124 }
125
126 return false;
127}
128
129bool SIInstrInfo::isReallyTriviallyReMaterializable(
130 const MachineInstr &MI) const {
131
132 if (canRemat(MI)) {
133 // Normally a VALU use of exec would block rematerialization, but an
134 // implicit exec read is OK here since all VALU instructions have one.
135 // We really want all of the generic logic for this except for that check.
136
137 // Another potential implicit use is mode register. The core logic of
138 // the RA will not attempt rematerialization if mode is set anywhere
139 // in the function, otherwise it is safe since mode is not changed.
140
141 // This differs from the generic method, which does not allow
142 // rematerialization if there are virtual register uses. We allow it,
143 // therefore this method covers SOP instructions as well.
144 if (!MI.hasImplicitDef() &&
145 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
146 !MI.mayRaiseFPException())
147 return true;
148 }
149
150 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
151}
152
153// Returns true if the result of a VALU instruction depends on exec.
154bool SIInstrInfo::resultDependsOnExec(const MachineInstr &MI) const {
155 assert(isVALU(MI));
156
157 // If it is convergent it depends on EXEC.
158 if (MI.isConvergent())
159 return true;
160
161 // If it defines an SGPR, it depends on EXEC.
162 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
163 for (const MachineOperand &Def : MI.defs()) {
164 if (!Def.isReg())
165 continue;
166
167 Register Reg = Def.getReg();
168 if (Reg && RI.isSGPRReg(MRI, Reg))
169 return true;
170 }
171
172 return false;
173}
174
175bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
176 // Any implicit use of exec by VALU is not a real register read.
177 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
178 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
179}
180
181bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
182 MachineBasicBlock *SuccToSinkTo,
183 MachineCycleInfo *CI) const {
184 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
185 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
186 return true;
187
188 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
189 // Check if sinking of MI would create temporal divergent use.
190 for (auto Op : MI.uses()) {
191 if (Op.isReg() && Op.getReg().isVirtual() &&
192 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
193 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
194
195 // SgprDef defined inside cycle
196 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
197 if (FromCycle == nullptr)
198 continue;
199
200 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
201 // Check if there is a FromCycle that contains SgprDef's basic block but
202 // does not contain SuccToSinkTo and also has a divergent exit condition.
203 while (FromCycle && !FromCycle->contains(ToCycle)) {
204 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
205 FromCycle->getExitingBlocks(ExitingBlocks);
206
207 // FromCycle has divergent exit condition.
208 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
209 if (hasDivergentBranch(ExitingBlock))
210 return false;
211 }
212
213 FromCycle = FromCycle->getParentCycle();
214 }
215 }
216 }
217
218 return true;
219}
220
221bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
222 int64_t &Offset0,
223 int64_t &Offset1) const {
224 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
225 return false;
226
227 unsigned Opc0 = Load0->getMachineOpcode();
228 unsigned Opc1 = Load1->getMachineOpcode();
229
230 // Make sure both are actually loads.
231 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
232 return false;
233
234 // A mayLoad instruction without a def is not a load. Likely a prefetch.
235 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
236 return false;
237
238 if (isDS(Opc0) && isDS(Opc1)) {
239
240 // FIXME: Handle this case:
241 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
242 return false;
243
244 // Check base reg.
245 if (Load0->getOperand(0) != Load1->getOperand(0))
246 return false;
247
248 // Skip read2 / write2 variants for simplicity.
249 // TODO: We should report true if the used offsets are adjacent (excluding
250 // the st64 versions).
251 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
252 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
253 if (Offset0Idx == -1 || Offset1Idx == -1)
254 return false;
255
256 // XXX - be careful of dataless loads
257 // getNamedOperandIdx returns the index for MachineInstrs. Since they
258 // include the output in the operand list, but SDNodes don't, we need to
259 // subtract the index by one.
260 Offset0Idx -= get(Opc0).NumDefs;
261 Offset1Idx -= get(Opc1).NumDefs;
262 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
263 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
264 return true;
265 }
266
267 if (isSMRD(Opc0) && isSMRD(Opc1)) {
268 // Skip time and cache invalidation instructions.
269 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
270 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
271 return false;
272
273 unsigned NumOps = getNumOperandsNoGlue(Load0);
274 if (NumOps != getNumOperandsNoGlue(Load1))
275 return false;
276
277 // Check base reg.
278 if (Load0->getOperand(0) != Load1->getOperand(0))
279 return false;
280
281 // Match register offsets, if both register and immediate offsets are present.
282 assert(NumOps == 4 || NumOps == 5);
283 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
284 return false;
285
286 const ConstantSDNode *Load0Offset =
287 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
288 const ConstantSDNode *Load1Offset =
289 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
290
291 if (!Load0Offset || !Load1Offset)
292 return false;
293
294 Offset0 = Load0Offset->getZExtValue();
295 Offset1 = Load1Offset->getZExtValue();
296 return true;
297 }
298
299 // MUBUF and MTBUF can access the same addresses.
300 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
301
302 // MUBUF and MTBUF have vaddr at different indices.
303 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
304 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
305 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
306 return false;
307
308 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
309 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
310
311 if (OffIdx0 == -1 || OffIdx1 == -1)
312 return false;
313
314 // getNamedOperandIdx returns the index for MachineInstrs. Since they
315 // include the output in the operand list, but SDNodes don't, we need to
316 // subtract the index by one.
317 OffIdx0 -= get(Opc0).NumDefs;
318 OffIdx1 -= get(Opc1).NumDefs;
319
320 SDValue Off0 = Load0->getOperand(OffIdx0);
321 SDValue Off1 = Load1->getOperand(OffIdx1);
322
323 // The offset might be a FrameIndexSDNode.
324 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
325 return false;
326
327 Offset0 = Off0->getAsZExtVal();
328 Offset1 = Off1->getAsZExtVal();
329 return true;
330 }
331
332 return false;
333}
334
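// Returns true for the DS read2st64 / write2st64 opcodes, whose paired
// offsets are scaled by a 64-element stride.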
335static bool isStride64(unsigned Opc) {
336 switch (Opc) {
337 case AMDGPU::DS_READ2ST64_B32:
338 case AMDGPU::DS_READ2ST64_B64:
339 case AMDGPU::DS_WRITE2ST64_B32:
340 case AMDGPU::DS_WRITE2ST64_B64:
341 return true;
342 default:
343 return false;
344 }
345}
346
347bool SIInstrInfo::getMemOperandsWithOffsetWidth(
348 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
349 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
350 const TargetRegisterInfo *TRI) const {
351 if (!LdSt.mayLoadOrStore())
352 return false;
353
354 unsigned Opc = LdSt.getOpcode();
355 OffsetIsScalable = false;
356 const MachineOperand *BaseOp, *OffsetOp;
357 int DataOpIdx;
358
359 if (isDS(LdSt)) {
360 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
361 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
362 if (OffsetOp) {
363 // Normal, single offset LDS instruction.
364 if (!BaseOp) {
365 // DS_CONSUME/DS_APPEND use M0 for the base address.
366 // TODO: find the implicit use operand for M0 and use that as BaseOp?
367 return false;
368 }
369 BaseOps.push_back(BaseOp);
370 Offset = OffsetOp->getImm();
371 // Get appropriate operand, and compute width accordingly.
372 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
373 if (DataOpIdx == -1)
374 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
375 if (Opc == AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
376 Width = LocationSize::precise(64);
377 else
378 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
379 } else {
380 // The 2 offset instructions use offset0 and offset1 instead. We can treat
381 // these as a load with a single offset if the 2 offsets are consecutive.
382 // We will use this for some partially aligned loads.
383 const MachineOperand *Offset0Op =
384 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
385 const MachineOperand *Offset1Op =
386 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
387
388 unsigned Offset0 = Offset0Op->getImm() & 0xff;
389 unsigned Offset1 = Offset1Op->getImm() & 0xff;
390 if (Offset0 + 1 != Offset1)
391 return false;
392
393 // Each of these offsets is in element-sized units, so we need to convert
394 // to the byte offsets of the individual reads.
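// For example, a b32 read2 with offset0=2 and offset1=3 has EltSize=4, so it
// is treated as a single access at byte offset 8.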
395
396 unsigned EltSize;
397 if (LdSt.mayLoad())
398 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
399 else {
400 assert(LdSt.mayStore());
401 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
402 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
403 }
404
405 if (isStride64(Opc))
406 EltSize *= 64;
407
408 BaseOps.push_back(BaseOp);
409 Offset = EltSize * Offset0;
410 // Get appropriate operand(s), and compute width accordingly.
411 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
412 if (DataOpIdx == -1) {
413 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
414 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
415 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
416 Width = LocationSize::precise(
417 Width.getValue() + TypeSize::getFixed(getOpSize(LdSt, DataOpIdx)));
418 } else {
419 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
420 }
421 }
422 return true;
423 }
424
425 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
426 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
427 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
428 return false;
429 BaseOps.push_back(RSrc);
430 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
431 if (BaseOp && !BaseOp->isFI())
432 BaseOps.push_back(BaseOp);
433 const MachineOperand *OffsetImm =
434 getNamedOperand(LdSt, AMDGPU::OpName::offset);
435 Offset = OffsetImm->getImm();
436 const MachineOperand *SOffset =
437 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
438 if (SOffset) {
439 if (SOffset->isReg())
440 BaseOps.push_back(SOffset);
441 else
442 Offset += SOffset->getImm();
443 }
444 // Get appropriate operand, and compute width accordingly.
445 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
446 if (DataOpIdx == -1)
447 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
448 if (DataOpIdx == -1) // LDS DMA
449 return false;
450 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
451 return true;
452 }
453
454 if (isImage(LdSt)) {
455 auto RsrcOpName =
456 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
457 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
458 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
459 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
460 if (VAddr0Idx >= 0) {
461 // GFX10 possible NSA encoding.
462 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
463 BaseOps.push_back(&LdSt.getOperand(I));
464 } else {
465 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
466 }
467 Offset = 0;
468 // Get appropriate operand, and compute width accordingly.
469 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
470 if (DataOpIdx == -1)
471 return false; // no return sampler
472 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
473 return true;
474 }
475
476 if (isSMRD(LdSt)) {
477 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
478 if (!BaseOp) // e.g. S_MEMTIME
479 return false;
480 BaseOps.push_back(BaseOp);
481 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
482 Offset = OffsetOp ? OffsetOp->getImm() : 0;
483 // Get appropriate operand, and compute width accordingly.
484 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
485 if (DataOpIdx == -1)
486 return false;
487 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
488 return true;
489 }
490
491 if (isFLAT(LdSt)) {
492 // Instructions have either vaddr or saddr or both or none.
493 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
494 if (BaseOp)
495 BaseOps.push_back(BaseOp);
496 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
497 if (BaseOp)
498 BaseOps.push_back(BaseOp);
499 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
500 // Get appropriate operand, and compute width accordingly.
501 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
502 if (DataOpIdx == -1)
503 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
504 if (DataOpIdx == -1) // LDS DMA
505 return false;
506 Width = LocationSize::precise(getOpSize(LdSt, DataOpIdx));
507 return true;
508 }
509
510 return false;
511}
512
513static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
514 ArrayRef<const MachineOperand *> BaseOps1,
515 const MachineInstr &MI2,
516 ArrayRef<const MachineOperand *> BaseOps2) {
517 // Only examine the first "base" operand of each instruction, on the
518 // assumption that it represents the real base address of the memory access.
519 // Other operands are typically offsets or indices from this base address.
520 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
521 return true;
522
523 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
524 return false;
525
526 auto *MO1 = *MI1.memoperands_begin();
527 auto *MO2 = *MI2.memoperands_begin();
528 if (MO1->getAddrSpace() != MO2->getAddrSpace())
529 return false;
530
531 const auto *Base1 = MO1->getValue();
532 const auto *Base2 = MO2->getValue();
533 if (!Base1 || !Base2)
534 return false;
535 Base1 = getUnderlyingObject(Base1);
536 Base2 = getUnderlyingObject(Base2);
537
538 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
539 return false;
540
541 return Base1 == Base2;
542}
543
544bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
545 int64_t Offset1, bool OffsetIsScalable1,
546 ArrayRef<const MachineOperand *> BaseOps2,
547 int64_t Offset2, bool OffsetIsScalable2,
548 unsigned ClusterSize,
549 unsigned NumBytes) const {
550 // If the mem ops (to be clustered) do not have the same base ptr, then they
551 // should not be clustered
552 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
553 if (!BaseOps1.empty() && !BaseOps2.empty()) {
554 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
555 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
556 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
557 return false;
558
559 const SIMachineFunctionInfo *MFI =
560 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
561 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
562 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
563 // If only one base op is empty, they do not have the same base ptr
564 return false;
565 }
566
567 // In order to avoid register pressure, on average, the number of DWORDs
568 // loaded together by all clustered mem ops should not exceed
569 // MaxMemoryClusterDWords. This is an empirical value based on certain
570 // observations and performance related experiments.
571 // The good thing about this heuristic is that it avoids clustering of too
572 // many sub-word loads, and also avoids clustering of wide loads. Below is a
573 // brief summary of how the heuristic behaves for various `LoadSize` when
574 // MaxMemoryClusterDWords is 8.
575 //
576 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
577 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
578 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
579 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
580 // (5) LoadSize >= 17: do not cluster
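// For example, clustering four 8-byte loads gives LoadSize = 8 and
// NumDWords = 8, which fits the default limit of 8; adding a fifth such load
// would raise NumDWords to 10 and stop the clustering.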
581 const unsigned LoadSize = NumBytes / ClusterSize;
582 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
583 return NumDWords <= MaxMemoryClusterDWords;
584}
585
586// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
587// the first 16 loads will be interleaved with the stores, and the next 16 will
588// be clustered as expected. It should really split into two batches of 16 stores.
589//
590// Loads are clustered until this returns false, rather than trying to schedule
591// groups of stores. This also means we have to deal with saying different
592// address space loads should be clustered, and ones which might cause bank
593// conflicts.
594//
595// This might be deprecated so it might not be worth that much effort to fix.
596bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
597 int64_t Offset0, int64_t Offset1,
598 unsigned NumLoads) const {
599 assert(Offset1 > Offset0 &&
600 "Second offset should be larger than first offset!");
601 // If we have fewer than 16 loads in a row, and the offsets are within 64
602 // bytes, then schedule together.
603
604 // A cacheline is 64 bytes (for global memory).
605 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
606}
607
608static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
609 MachineBasicBlock::iterator MI,
610 const DebugLoc &DL, MCRegister DestReg,
611 MCRegister SrcReg, bool KillSrc,
612 const char *Msg = "illegal VGPR to SGPR copy") {
613 MachineFunction *MF = MBB.getParent();
614
615 LLVMContext &C = MF->getFunction().getContext();
616 C.diagnose(DiagnosticInfoUnsupported(MF->getFunction(), Msg, DL, DS_Error));
617
618 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
619 .addReg(SrcReg, getKillRegState(KillSrc));
620}
621
622/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
623/// possible to have a direct copy in these cases on GFX908, so an intermediate
624/// VGPR copy is required.
625static void indirectCopyToAGPR(const SIInstrInfo &TII,
626 MachineBasicBlock &MBB,
627 MachineBasicBlock::iterator MI,
628 const DebugLoc &DL, MCRegister DestReg,
629 MCRegister SrcReg, bool KillSrc,
630 RegScavenger &RS, bool RegsOverlap,
631 Register ImpDefSuperReg = Register(),
632 Register ImpUseSuperReg = Register()) {
633 assert((TII.getSubtarget().hasMAIInsts() &&
634 !TII.getSubtarget().hasGFX90AInsts()) &&
635 "Expected GFX908 subtarget.");
636
637 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
638 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
639 "Source register of the copy should be either an SGPR or an AGPR.");
640
641 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
642 "Destination register of the copy should be an AGPR.");
643
644 const SIRegisterInfo &RI = TII.getRegisterInfo();
645
646 // First try to find defining accvgpr_write to avoid temporary registers.
647 // In the case of copies of overlapping AGPRs, we conservatively do not
648 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
649 // an accvgpr_write used for this same copy due to implicit-defs
650 if (!RegsOverlap) {
651 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
652 --Def;
653
654 if (!Def->modifiesRegister(SrcReg, &RI))
655 continue;
656
657 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
658 Def->getOperand(0).getReg() != SrcReg)
659 break;
660
661 MachineOperand &DefOp = Def->getOperand(1);
662 assert(DefOp.isReg() || DefOp.isImm());
663
664 if (DefOp.isReg()) {
665 bool SafeToPropagate = true;
666 // Check that register source operand is not clobbered before MI.
667 // Immediate operands are always safe to propagate.
668 for (auto I = Def; I != MI && SafeToPropagate; ++I)
669 if (I->modifiesRegister(DefOp.getReg(), &RI))
670 SafeToPropagate = false;
671
672 if (!SafeToPropagate)
673 break;
674
675 for (auto I = Def; I != MI; ++I)
676 I->clearRegisterKills(DefOp.getReg(), &RI);
677 }
678
679 MachineInstrBuilder Builder =
680 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
681 .add(DefOp);
682 if (ImpDefSuperReg)
683 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
684
685 if (ImpUseSuperReg) {
686 Builder.addReg(ImpUseSuperReg,
687 getKillRegState(KillSrc) | RegState::Implicit);
688 }
689
690 return;
691 }
692 }
693
694 RS.enterBasicBlockEnd(MBB);
695 RS.backward(std::next(MI));
696
697 // Ideally we want to have three registers for a long reg_sequence copy
698 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
699 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
700 *MBB.getParent());
701
702 // Registers in the sequence are allocated contiguously so we can just
703 // use register number to pick one of three round-robin temps.
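// e.g. a copy to AGPR0 uses the reserved VGPR directly, while copies to
// AGPR1 and AGPR2 try to scavenge one or two further temporaries first.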
704 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
705 Register Tmp =
706 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
707 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
708 "VGPR used for an intermediate copy should have been reserved.");
709
710 // Only loop through if there are any free registers left. We don't want to
711 // spill.
712 while (RegNo--) {
713 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
714 /* RestoreAfter */ false, 0,
715 /* AllowSpill */ false);
716 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
717 break;
718 Tmp = Tmp2;
719 RS.setRegUsed(Tmp);
720 }
721
722 // Insert copy to temporary VGPR.
723 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
724 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
725 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
726 } else {
727 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
728 }
729
730 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
731 .addReg(SrcReg, getKillRegState(KillSrc));
732 if (ImpUseSuperReg) {
733 UseBuilder.addReg(ImpUseSuperReg,
734 getKillRegState(KillSrc) | RegState::Implicit);
735 }
736
737 MachineInstrBuilder DefBuilder
738 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
739 .addReg(Tmp, RegState::Kill);
740
741 if (ImpDefSuperReg)
742 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
743}
744
745static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
746 MachineBasicBlock::iterator MI, const DebugLoc &DL,
747 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
748 const TargetRegisterClass *RC, bool Forward) {
749 const SIRegisterInfo &RI = TII.getRegisterInfo();
750 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
751 MachineBasicBlock::iterator I = MI;
752 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
753
754 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
755 int16_t SubIdx = BaseIndices[Idx];
756 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
757 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
758 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
759 unsigned Opcode = AMDGPU::S_MOV_B32;
760
761 // Is SGPR aligned? If so try to combine with next.
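// e.g. an aligned 128-bit SGPR copy is emitted as two s_mov_b64 instead of
// four s_mov_b32.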
762 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
763 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
764 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
765 // Can use SGPR64 copy
766 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
767 SubIdx = RI.getSubRegFromChannel(Channel, 2);
768 DestSubReg = RI.getSubReg(DestReg, SubIdx);
769 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
770 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
771 Opcode = AMDGPU::S_MOV_B64;
772 Idx++;
773 }
774
775 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
776 .addReg(SrcSubReg)
777 .addReg(SrcReg, RegState::Implicit);
778
779 if (!FirstMI)
780 FirstMI = LastMI;
781
782 if (!Forward)
783 I--;
784 }
785
786 assert(FirstMI && LastMI);
787 if (!Forward)
788 std::swap(FirstMI, LastMI);
789
790 FirstMI->addOperand(
791 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
792
793 if (KillSrc)
794 LastMI->addRegisterKilled(SrcReg, &RI);
795}
796
797void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
798 MachineBasicBlock::iterator MI,
799 const DebugLoc &DL, Register DestReg,
800 Register SrcReg, bool KillSrc, bool RenamableDest,
801 bool RenamableSrc) const {
802 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
803 unsigned Size = RI.getRegSizeInBits(*RC);
804 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
805 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
806
807 // The rest of copyPhysReg assumes Src and Dst size are the same size.
808 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
809 // we remove Fix16BitCopies and this code block?
810 if (Fix16BitCopies) {
811 if (((Size == 16) != (SrcSize == 16))) {
812 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
813 assert(ST.useRealTrue16Insts());
814 Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
815 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
816 RegToFix = SubReg;
817
818 if (DestReg == SrcReg) {
819 // Identity copy. Insert empty bundle since ExpandPostRA expects an
820 // instruction here.
821 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
822 return;
823 }
824 RC = RI.getPhysRegBaseClass(DestReg);
825 Size = RI.getRegSizeInBits(*RC);
826 SrcRC = RI.getPhysRegBaseClass(SrcReg);
827 SrcSize = RI.getRegSizeInBits(*SrcRC);
828 }
829 }
830
831 if (RC == &AMDGPU::VGPR_32RegClass) {
832 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
833 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
834 AMDGPU::AGPR_32RegClass.contains(SrcReg));
835 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
836 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
837 BuildMI(MBB, MI, DL, get(Opc), DestReg)
838 .addReg(SrcReg, getKillRegState(KillSrc));
839 return;
840 }
841
842 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
843 RC == &AMDGPU::SReg_32RegClass) {
844 if (SrcReg == AMDGPU::SCC) {
845 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
846 .addImm(1)
847 .addImm(0);
848 return;
849 }
850
851 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
852 if (DestReg == AMDGPU::VCC_LO) {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
858 return;
859 }
860
861 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
862 return;
863 }
864
865 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
866 .addReg(SrcReg, getKillRegState(KillSrc));
867 return;
868 }
869
870 if (RC == &AMDGPU::SReg_64RegClass) {
871 if (SrcReg == AMDGPU::SCC) {
872 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
873 .addImm(1)
874 .addImm(0);
875 return;
876 }
877
878 if (!AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
879 if (DestReg == AMDGPU::VCC) {
880 // FIXME: Hack until VReg_1 removed.
881 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
882 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
883 .addImm(0)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 return;
886 }
887
888 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
889 return;
890 }
891
892 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
893 .addReg(SrcReg, getKillRegState(KillSrc));
894 return;
895 }
896
897 if (DestReg == AMDGPU::SCC) {
898 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
899 // but SelectionDAG emits such copies for i1 sources.
900 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
901 // This copy can only be produced by patterns
902 // with explicit SCC, which are known to be enabled
903 // only for subtargets with S_CMP_LG_U64 present.
904 assert(ST.hasScalarCompareEq64());
905 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
906 .addReg(SrcReg, getKillRegState(KillSrc))
907 .addImm(0);
908 } else {
909 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
910 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
911 .addReg(SrcReg, getKillRegState(KillSrc))
912 .addImm(0);
913 }
914
915 return;
916 }
917
918 if (RC == &AMDGPU::AGPR_32RegClass) {
919 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
920 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
921 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
922 .addReg(SrcReg, getKillRegState(KillSrc));
923 return;
924 }
925
926 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
927 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
928 .addReg(SrcReg, getKillRegState(KillSrc));
929 return;
930 }
931
932 // FIXME: Pass should maintain scavenger to avoid scan through the block on
933 // every AGPR spill.
934 RegScavenger RS;
935 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
936 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
937 return;
938 }
939
940 if (Size == 16) {
941 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
942 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
943 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
944
945 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
946 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
947 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
948 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
949 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
950 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
951 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
952 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
953
954 if (IsSGPRDst) {
955 if (!IsSGPRSrc) {
956 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
957 return;
958 }
959
960 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
961 .addReg(NewSrcReg, getKillRegState(KillSrc));
962 return;
963 }
964
965 if (IsAGPRDst || IsAGPRSrc) {
966 if (!DstLow || !SrcLow) {
967 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
968 "Cannot use hi16 subreg with an AGPR!");
969 }
970
971 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
972 return;
973 }
974
975 if (ST.useRealTrue16Insts()) {
976 if (IsSGPRSrc) {
977 assert(SrcLow);
978 SrcReg = NewSrcReg;
979 }
980 // Use the smaller instruction encoding if possible.
981 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
982 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
983 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
984 .addReg(SrcReg);
985 } else {
986 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
987 .addImm(0) // src0_modifiers
988 .addReg(SrcReg)
989 .addImm(0); // op_sel
990 }
991 return;
992 }
993
994 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
995 if (!DstLow || !SrcLow) {
996 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
997 "Cannot use hi16 subreg on VI!");
998 }
999
1000 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1001 .addReg(NewSrcReg, getKillRegState(KillSrc));
1002 return;
1003 }
1004
1005 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1006 .addImm(0) // src0_modifiers
1007 .addReg(NewSrcReg)
1008 .addImm(0) // clamp
1009 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1010 : AMDGPU::SDWA::SdwaSel::WORD_1)
1011 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1012 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1013 : AMDGPU::SDWA::SdwaSel::WORD_1)
1014 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1015 // First implicit operand is $exec.
1016 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1017 return;
1018 }
1019
1020 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1021 if (ST.hasVMovB64Inst()) {
1022 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1023 .addReg(SrcReg, getKillRegState(KillSrc));
1024 return;
1025 }
1026 if (ST.hasPkMovB32()) {
1027 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1028 .addImm(SISrcMods::OP_SEL_1)
1029 .addReg(SrcReg)
1030 .addImm(SISrcMods::OP_SEL_1)
1031 .addReg(SrcReg)
1032 .addImm(0) // op_sel_lo
1033 .addImm(0) // op_sel_hi
1034 .addImm(0) // neg_lo
1035 .addImm(0) // neg_hi
1036 .addImm(0) // clamp
1037 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1038 return;
1039 }
1040 }
1041
1042 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1043 if (RI.isSGPRClass(RC)) {
1044 if (!RI.isSGPRClass(SrcRC)) {
1045 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1046 return;
1047 }
1048 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1049 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1050 Forward);
1051 return;
1052 }
1053
1054 unsigned EltSize = 4;
1055 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1056 if (RI.isAGPRClass(RC)) {
1057 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1058 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1059 else if (RI.hasVGPRs(SrcRC) ||
1060 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1061 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1062 else
1063 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1064 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1065 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1066 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1067 (RI.isProperlyAlignedRC(*RC) &&
1068 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1069 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1070 if (ST.hasVMovB64Inst()) {
1071 Opcode = AMDGPU::V_MOV_B64_e32;
1072 EltSize = 8;
1073 } else if (ST.hasPkMovB32()) {
1074 Opcode = AMDGPU::V_PK_MOV_B32;
1075 EltSize = 8;
1076 }
1077 }
1078
1079 // For the cases where we need an intermediate instruction/temporary register
1080 // (destination is an AGPR), we need a scavenger.
1081 //
1082 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1083 // whole block for every handled copy.
1084 std::unique_ptr<RegScavenger> RS;
1085 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1086 RS = std::make_unique<RegScavenger>();
1087
1088 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1089
1090 // If there is an overlap, we can't kill the super-register on the last
1091 // instruction, since it will also kill the components made live by this def.
1092 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1093 const bool CanKillSuperReg = KillSrc && !Overlap;
1094
1095 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1096 unsigned SubIdx;
1097 if (Forward)
1098 SubIdx = SubIndices[Idx];
1099 else
1100 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1101 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1102 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1103 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1104
1105 bool IsFirstSubreg = Idx == 0;
1106 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1107
1108 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1109 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1110 Register ImpUseSuper = SrcReg;
1111 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1112 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1113 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1114 MachineInstrBuilder MIB =
1115 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1116 .addImm(SISrcMods::OP_SEL_1)
1117 .addReg(SrcSubReg)
1118 .addImm(SISrcMods::OP_SEL_1)
1119 .addReg(SrcSubReg)
1120 .addImm(0) // op_sel_lo
1121 .addImm(0) // op_sel_hi
1122 .addImm(0) // neg_lo
1123 .addImm(0) // neg_hi
1124 .addImm(0) // clamp
1125 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1126 if (IsFirstSubreg)
1127 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1128 } else {
1129 MachineInstrBuilder Builder =
1130 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1131 if (IsFirstSubreg)
1132 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1133
1134 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 }
1136 }
1137}
1138
1139int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1140 int32_t NewOpc;
1141
1142 // Try to map original to commuted opcode
1143 NewOpc = AMDGPU::getCommuteRev(Opcode);
1144 if (NewOpc != -1)
1145 // Check if the commuted (REV) opcode exists on the target.
1146 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1147
1148 // Try to map commuted to original opcode
1149 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1150 if (NewOpc != -1)
1151 // Check if the original (non-REV) opcode exists on the target.
1152 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1153
1154 return Opcode;
1155}
1156
1157const TargetRegisterClass *
1159 return &AMDGPU::VGPR_32RegClass;
1160}
1161
1162void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1163 MachineBasicBlock::iterator I,
1164 const DebugLoc &DL, Register DstReg,
1165 ArrayRef<MachineOperand> Cond,
1166 Register TrueReg,
1167 Register FalseReg) const {
1168 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1169 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1171 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1172 "Not a VGPR32 reg");
1173
1174 if (Cond.size() == 1) {
1175 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1176 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1177 .add(Cond[0]);
1178 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1179 .addImm(0)
1180 .addReg(FalseReg)
1181 .addImm(0)
1182 .addReg(TrueReg)
1183 .addReg(SReg);
1184 } else if (Cond.size() == 2) {
1185 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1186 switch (Cond[0].getImm()) {
1187 case SIInstrInfo::SCC_TRUE: {
1188 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1189 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1190 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1191 .addImm(0)
1192 .addReg(FalseReg)
1193 .addImm(0)
1194 .addReg(TrueReg)
1195 .addReg(SReg);
1196 break;
1197 }
1198 case SIInstrInfo::SCC_FALSE: {
1199 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1200 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1201 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1202 .addImm(0)
1203 .addReg(FalseReg)
1204 .addImm(0)
1205 .addReg(TrueReg)
1206 .addReg(SReg);
1207 break;
1208 }
1209 case SIInstrInfo::VCCNZ: {
1210 MachineOperand RegOp = Cond[1];
1211 RegOp.setImplicit(false);
1212 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1213 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1214 .add(RegOp);
1215 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1216 .addImm(0)
1217 .addReg(FalseReg)
1218 .addImm(0)
1219 .addReg(TrueReg)
1220 .addReg(SReg);
1221 break;
1222 }
1223 case SIInstrInfo::VCCZ: {
1224 MachineOperand RegOp = Cond[1];
1225 RegOp.setImplicit(false);
1226 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1227 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1228 .add(RegOp);
1229 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1230 .addImm(0)
1231 .addReg(TrueReg)
1232 .addImm(0)
1233 .addReg(FalseReg)
1234 .addReg(SReg);
1235 break;
1236 }
1237 case SIInstrInfo::EXECNZ: {
1238 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1239 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1240 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1241 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(1).addImm(0);
1242 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1243 .addImm(0)
1244 .addReg(FalseReg)
1245 .addImm(0)
1246 .addReg(TrueReg)
1247 .addReg(SReg);
1248 break;
1249 }
1250 case SIInstrInfo::EXECZ: {
1251 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1252 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1253 BuildMI(MBB, I, DL, get(LMC.OrSaveExecOpc), SReg2).addImm(0);
1254 BuildMI(MBB, I, DL, get(LMC.CSelectOpc), SReg).addImm(0).addImm(1);
1255 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1256 .addImm(0)
1257 .addReg(FalseReg)
1258 .addImm(0)
1259 .addReg(TrueReg)
1260 .addReg(SReg);
1261 llvm_unreachable("Unhandled branch predicate EXECZ");
1262 break;
1263 }
1264 default:
1265 llvm_unreachable("invalid branch predicate");
1266 }
1267 } else {
1268 llvm_unreachable("Can only handle Cond size 1 or 2");
1269 }
1270}
1271
1272Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1273 MachineBasicBlock::iterator I,
1274 const DebugLoc &DL,
1275 Register SrcReg, int Value) const {
1276 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1277 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1278 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1279 .addImm(Value)
1280 .addReg(SrcReg);
1281
1282 return Reg;
1283}
1284
1285Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1286 MachineBasicBlock::iterator I,
1287 const DebugLoc &DL,
1288 Register SrcReg, int Value) const {
1289 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1290 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1291 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1292 .addImm(Value)
1293 .addReg(SrcReg);
1294
1295 return Reg;
1296}
1297
1298bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
1299 const Register Reg,
1300 int64_t &ImmVal) const {
1301 switch (MI.getOpcode()) {
1302 case AMDGPU::V_MOV_B32_e32:
1303 case AMDGPU::S_MOV_B32:
1304 case AMDGPU::S_MOVK_I32:
1305 case AMDGPU::S_MOV_B64:
1306 case AMDGPU::V_MOV_B64_e32:
1307 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
1308 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
1309 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
1310 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
1311 case AMDGPU::V_MOV_B64_PSEUDO:
1312 case AMDGPU::V_MOV_B16_t16_e32: {
1313 const MachineOperand &Src0 = MI.getOperand(1);
1314 if (Src0.isImm()) {
1315 ImmVal = Src0.getImm();
1316 return MI.getOperand(0).getReg() == Reg;
1317 }
1318
1319 return false;
1320 }
1321 case AMDGPU::V_MOV_B16_t16_e64: {
1322 const MachineOperand &Src0 = MI.getOperand(2);
1323 if (Src0.isImm() && !MI.getOperand(1).getImm()) {
1324 ImmVal = Src0.getImm();
1325 return MI.getOperand(0).getReg() == Reg;
1326 }
1327
1328 return false;
1329 }
1330 case AMDGPU::S_BREV_B32:
1331 case AMDGPU::V_BFREV_B32_e32:
1332 case AMDGPU::V_BFREV_B32_e64: {
1333 const MachineOperand &Src0 = MI.getOperand(1);
1334 if (Src0.isImm()) {
1335 ImmVal = static_cast<int64_t>(reverseBits<int32_t>(Src0.getImm()));
1336 return MI.getOperand(0).getReg() == Reg;
1337 }
1338
1339 return false;
1340 }
1341 case AMDGPU::S_NOT_B32:
1342 case AMDGPU::V_NOT_B32_e32:
1343 case AMDGPU::V_NOT_B32_e64: {
1344 const MachineOperand &Src0 = MI.getOperand(1);
1345 if (Src0.isImm()) {
1346 ImmVal = static_cast<int64_t>(~static_cast<int32_t>(Src0.getImm()));
1347 return MI.getOperand(0).getReg() == Reg;
1348 }
1349
1350 return false;
1351 }
1352 default:
1353 return false;
1354 }
1355}
1356
1357std::optional<int64_t>
1359 if (Op.isImm())
1360 return Op.getImm();
1361
1362 if (!Op.isReg() || !Op.getReg().isVirtual())
1363 return std::nullopt;
1364 MachineRegisterInfo &MRI = Op.getParent()->getMF()->getRegInfo();
1365 const MachineInstr *Def = MRI.getVRegDef(Op.getReg());
1366 if (Def && Def->isMoveImmediate()) {
1367 const MachineOperand &ImmSrc = Def->getOperand(1);
1368 if (ImmSrc.isImm())
1369 return extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1370 }
1371
1372 return std::nullopt;
1373}
1374
1375unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1376
1377 if (RI.isAGPRClass(DstRC))
1378 return AMDGPU::COPY;
1379 if (RI.getRegSizeInBits(*DstRC) == 16) {
1380 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1381 // before RA.
1382 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1383 }
1384 if (RI.getRegSizeInBits(*DstRC) == 32)
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1386 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1387 return AMDGPU::S_MOV_B64;
1388 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1389 return AMDGPU::V_MOV_B64_PSEUDO;
1390 return AMDGPU::COPY;
1391}
1392
1393const MCInstrDesc &
1394SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1395 bool IsIndirectSrc) const {
1396 if (IsIndirectSrc) {
1397 if (VecSize <= 32) // 4 bytes
1398 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1399 if (VecSize <= 64) // 8 bytes
1400 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1401 if (VecSize <= 96) // 12 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1403 if (VecSize <= 128) // 16 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1405 if (VecSize <= 160) // 20 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1407 if (VecSize <= 192) // 24 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6);
1409 if (VecSize <= 224) // 28 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 192) // 24 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6);
1441 if (VecSize <= 224) // 28 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7);
1443 if (VecSize <= 256) // 32 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1445 if (VecSize <= 288) // 36 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1447 if (VecSize <= 320) // 40 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1449 if (VecSize <= 352) // 44 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1451 if (VecSize <= 384) // 48 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1453 if (VecSize <= 512) // 64 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1455 if (VecSize <= 1024) // 128 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1457
1458 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1459}
1460
1461static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1462 if (VecSize <= 32) // 4 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1464 if (VecSize <= 64) // 8 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1466 if (VecSize <= 96) // 12 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1468 if (VecSize <= 128) // 16 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1470 if (VecSize <= 160) // 20 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1472 if (VecSize <= 192) // 24 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1474 if (VecSize <= 224) // 28 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1476 if (VecSize <= 256) // 32 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1478 if (VecSize <= 288) // 36 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1480 if (VecSize <= 320) // 40 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1482 if (VecSize <= 352) // 44 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1484 if (VecSize <= 384) // 48 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1486 if (VecSize <= 512) // 64 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1488 if (VecSize <= 1024) // 128 bytes
1489 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1490
1491 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1492}
1493
1494static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1495 if (VecSize <= 32) // 4 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1497 if (VecSize <= 64) // 8 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1499 if (VecSize <= 96) // 12 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1501 if (VecSize <= 128) // 16 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1503 if (VecSize <= 160) // 20 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1505 if (VecSize <= 192) // 24 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6;
1507 if (VecSize <= 224) // 28 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7;
1509 if (VecSize <= 256) // 32 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1511 if (VecSize <= 288) // 36 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1513 if (VecSize <= 320) // 40 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1515 if (VecSize <= 352) // 44 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1517 if (VecSize <= 384) // 48 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1519 if (VecSize <= 512) // 64 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1521 if (VecSize <= 1024) // 128 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1523
1524 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1525}
1526
1527static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1528 if (VecSize <= 64) // 8 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1530 if (VecSize <= 128) // 16 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1532 if (VecSize <= 256) // 32 bytes
1533 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1534 if (VecSize <= 512) // 64 bytes
1535 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1536 if (VecSize <= 1024) // 128 bytes
1537 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1538
1539 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1540}
1541
1542const MCInstrDesc &
1543SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1544 bool IsSGPR) const {
1545 if (IsSGPR) {
1546 switch (EltSize) {
1547 case 32:
1548 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1549 case 64:
1550 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1551 default:
1552 llvm_unreachable("invalid reg indexing elt size");
1553 }
1554 }
1555
1556 assert(EltSize == 32 && "invalid reg indexing elt size");
1557 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1558}
1559
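// The following helpers map a spill size in bytes to the matching spill
// pseudo opcode for each register class.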
1560static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1561 switch (Size) {
1562 case 4:
1563 return AMDGPU::SI_SPILL_S32_SAVE;
1564 case 8:
1565 return AMDGPU::SI_SPILL_S64_SAVE;
1566 case 12:
1567 return AMDGPU::SI_SPILL_S96_SAVE;
1568 case 16:
1569 return AMDGPU::SI_SPILL_S128_SAVE;
1570 case 20:
1571 return AMDGPU::SI_SPILL_S160_SAVE;
1572 case 24:
1573 return AMDGPU::SI_SPILL_S192_SAVE;
1574 case 28:
1575 return AMDGPU::SI_SPILL_S224_SAVE;
1576 case 32:
1577 return AMDGPU::SI_SPILL_S256_SAVE;
1578 case 36:
1579 return AMDGPU::SI_SPILL_S288_SAVE;
1580 case 40:
1581 return AMDGPU::SI_SPILL_S320_SAVE;
1582 case 44:
1583 return AMDGPU::SI_SPILL_S352_SAVE;
1584 case 48:
1585 return AMDGPU::SI_SPILL_S384_SAVE;
1586 case 64:
1587 return AMDGPU::SI_SPILL_S512_SAVE;
1588 case 128:
1589 return AMDGPU::SI_SPILL_S1024_SAVE;
1590 default:
1591 llvm_unreachable("unknown register size");
1592 }
1593}
1594
1595static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1596 switch (Size) {
1597 case 2:
1598 return AMDGPU::SI_SPILL_V16_SAVE;
1599 case 4:
1600 return AMDGPU::SI_SPILL_V32_SAVE;
1601 case 8:
1602 return AMDGPU::SI_SPILL_V64_SAVE;
1603 case 12:
1604 return AMDGPU::SI_SPILL_V96_SAVE;
1605 case 16:
1606 return AMDGPU::SI_SPILL_V128_SAVE;
1607 case 20:
1608 return AMDGPU::SI_SPILL_V160_SAVE;
1609 case 24:
1610 return AMDGPU::SI_SPILL_V192_SAVE;
1611 case 28:
1612 return AMDGPU::SI_SPILL_V224_SAVE;
1613 case 32:
1614 return AMDGPU::SI_SPILL_V256_SAVE;
1615 case 36:
1616 return AMDGPU::SI_SPILL_V288_SAVE;
1617 case 40:
1618 return AMDGPU::SI_SPILL_V320_SAVE;
1619 case 44:
1620 return AMDGPU::SI_SPILL_V352_SAVE;
1621 case 48:
1622 return AMDGPU::SI_SPILL_V384_SAVE;
1623 case 64:
1624 return AMDGPU::SI_SPILL_V512_SAVE;
1625 case 128:
1626 return AMDGPU::SI_SPILL_V1024_SAVE;
1627 default:
1628 llvm_unreachable("unknown register size");
1629 }
1630}
1631
1632static unsigned getAVSpillSaveOpcode(unsigned Size) {
1633 switch (Size) {
1634 case 4:
1635 return AMDGPU::SI_SPILL_AV32_SAVE;
1636 case 8:
1637 return AMDGPU::SI_SPILL_AV64_SAVE;
1638 case 12:
1639 return AMDGPU::SI_SPILL_AV96_SAVE;
1640 case 16:
1641 return AMDGPU::SI_SPILL_AV128_SAVE;
1642 case 20:
1643 return AMDGPU::SI_SPILL_AV160_SAVE;
1644 case 24:
1645 return AMDGPU::SI_SPILL_AV192_SAVE;
1646 case 28:
1647 return AMDGPU::SI_SPILL_AV224_SAVE;
1648 case 32:
1649 return AMDGPU::SI_SPILL_AV256_SAVE;
1650 case 36:
1651 return AMDGPU::SI_SPILL_AV288_SAVE;
1652 case 40:
1653 return AMDGPU::SI_SPILL_AV320_SAVE;
1654 case 44:
1655 return AMDGPU::SI_SPILL_AV352_SAVE;
1656 case 48:
1657 return AMDGPU::SI_SPILL_AV384_SAVE;
1658 case 64:
1659 return AMDGPU::SI_SPILL_AV512_SAVE;
1660 case 128:
1661 return AMDGPU::SI_SPILL_AV1024_SAVE;
1662 default:
1663 llvm_unreachable("unknown register size");
1664 }
1665}
1666
1667static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1668 bool IsVectorSuperClass) {
1669 // Currently, only 32-bit WWM register spills are needed.
1670 if (Size != 4)
1671 llvm_unreachable("unknown wwm register spill size");
1672
1673 if (IsVectorSuperClass)
1674 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1675
1676 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1677}
1678
1679unsigned SIInstrInfo::getVectorRegSpillSaveOpcode(
1680 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1681 const SIMachineFunctionInfo &MFI) const {
1682 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1683
1684 // Choose the right opcode if spilling a WWM register.
1685 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1686 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1687
1688 // TODO: Check if AGPRs are available
1689 if (ST.hasMAIInsts())
1690 return getAVSpillSaveOpcode(Size);
1691
1692 return getVGPRSpillSaveOpcode(Size);
1693}
1694
1695void SIInstrInfo::storeRegToStackSlot(
1696 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1697 bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg,
1698 MachineInstr::MIFlag Flags) const {
1699 MachineFunction *MF = MBB.getParent();
1700 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1701 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1702 const DebugLoc &DL = MBB.findDebugLoc(MI);
1703
1704 MachinePointerInfo PtrInfo
1705 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1706 MachineMemOperand *MMO = MF->getMachineMemOperand(
1707 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1708 FrameInfo.getObjectAlign(FrameIndex));
1709 unsigned SpillSize = RI.getSpillSize(*RC);
1710
1711 MachineRegisterInfo &MRI = MF->getRegInfo();
1712 if (RI.isSGPRClass(RC)) {
1713 MFI->setHasSpilledSGPRs();
1714 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1715 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1716 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1717
1718 // We are only allowed to create one new instruction when spilling
1719 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1720 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1721
1722 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1723 // to make sure we are using the correct register class.
1724 if (SrcReg.isVirtual() && SpillSize == 4) {
1725 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1726 }
1727
1728 BuildMI(MBB, MI, DL, OpDesc)
1729 .addReg(SrcReg, getKillRegState(isKill)) // data
1730 .addFrameIndex(FrameIndex) // addr
1731 .addMemOperand(MMO)
1732 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1733
1734 if (RI.spillSGPRToVGPR())
1735 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1736 return;
1737 }
1738
1739 unsigned Opcode =
1740 getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, SpillSize, *MFI);
1741 MFI->setHasSpilledVGPRs();
1742
1743 BuildMI(MBB, MI, DL, get(Opcode))
1744 .addReg(SrcReg, getKillRegState(isKill)) // data
1745 .addFrameIndex(FrameIndex) // addr
1746 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1747 .addImm(0) // offset
1748 .addMemOperand(MMO);
1749}
1750
1751static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1752 switch (Size) {
1753 case 4:
1754 return AMDGPU::SI_SPILL_S32_RESTORE;
1755 case 8:
1756 return AMDGPU::SI_SPILL_S64_RESTORE;
1757 case 12:
1758 return AMDGPU::SI_SPILL_S96_RESTORE;
1759 case 16:
1760 return AMDGPU::SI_SPILL_S128_RESTORE;
1761 case 20:
1762 return AMDGPU::SI_SPILL_S160_RESTORE;
1763 case 24:
1764 return AMDGPU::SI_SPILL_S192_RESTORE;
1765 case 28:
1766 return AMDGPU::SI_SPILL_S224_RESTORE;
1767 case 32:
1768 return AMDGPU::SI_SPILL_S256_RESTORE;
1769 case 36:
1770 return AMDGPU::SI_SPILL_S288_RESTORE;
1771 case 40:
1772 return AMDGPU::SI_SPILL_S320_RESTORE;
1773 case 44:
1774 return AMDGPU::SI_SPILL_S352_RESTORE;
1775 case 48:
1776 return AMDGPU::SI_SPILL_S384_RESTORE;
1777 case 64:
1778 return AMDGPU::SI_SPILL_S512_RESTORE;
1779 case 128:
1780 return AMDGPU::SI_SPILL_S1024_RESTORE;
1781 default:
1782 llvm_unreachable("unknown register size");
1783 }
1784}
1785
1786static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1787 switch (Size) {
1788 case 2:
1789 return AMDGPU::SI_SPILL_V16_RESTORE;
1790 case 4:
1791 return AMDGPU::SI_SPILL_V32_RESTORE;
1792 case 8:
1793 return AMDGPU::SI_SPILL_V64_RESTORE;
1794 case 12:
1795 return AMDGPU::SI_SPILL_V96_RESTORE;
1796 case 16:
1797 return AMDGPU::SI_SPILL_V128_RESTORE;
1798 case 20:
1799 return AMDGPU::SI_SPILL_V160_RESTORE;
1800 case 24:
1801 return AMDGPU::SI_SPILL_V192_RESTORE;
1802 case 28:
1803 return AMDGPU::SI_SPILL_V224_RESTORE;
1804 case 32:
1805 return AMDGPU::SI_SPILL_V256_RESTORE;
1806 case 36:
1807 return AMDGPU::SI_SPILL_V288_RESTORE;
1808 case 40:
1809 return AMDGPU::SI_SPILL_V320_RESTORE;
1810 case 44:
1811 return AMDGPU::SI_SPILL_V352_RESTORE;
1812 case 48:
1813 return AMDGPU::SI_SPILL_V384_RESTORE;
1814 case 64:
1815 return AMDGPU::SI_SPILL_V512_RESTORE;
1816 case 128:
1817 return AMDGPU::SI_SPILL_V1024_RESTORE;
1818 default:
1819 llvm_unreachable("unknown register size");
1820 }
1821}
1822
1823static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1824 switch (Size) {
1825 case 4:
1826 return AMDGPU::SI_SPILL_AV32_RESTORE;
1827 case 8:
1828 return AMDGPU::SI_SPILL_AV64_RESTORE;
1829 case 12:
1830 return AMDGPU::SI_SPILL_AV96_RESTORE;
1831 case 16:
1832 return AMDGPU::SI_SPILL_AV128_RESTORE;
1833 case 20:
1834 return AMDGPU::SI_SPILL_AV160_RESTORE;
1835 case 24:
1836 return AMDGPU::SI_SPILL_AV192_RESTORE;
1837 case 28:
1838 return AMDGPU::SI_SPILL_AV224_RESTORE;
1839 case 32:
1840 return AMDGPU::SI_SPILL_AV256_RESTORE;
1841 case 36:
1842 return AMDGPU::SI_SPILL_AV288_RESTORE;
1843 case 40:
1844 return AMDGPU::SI_SPILL_AV320_RESTORE;
1845 case 44:
1846 return AMDGPU::SI_SPILL_AV352_RESTORE;
1847 case 48:
1848 return AMDGPU::SI_SPILL_AV384_RESTORE;
1849 case 64:
1850 return AMDGPU::SI_SPILL_AV512_RESTORE;
1851 case 128:
1852 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1853 default:
1854 llvm_unreachable("unknown register size");
1855 }
1856}
1857
1858static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1859 bool IsVectorSuperClass) {
1860 // Currently, only 32-bit WWM register spills are needed.
1861 if (Size != 4)
1862 llvm_unreachable("unknown wwm register spill size");
1863
1864 if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs
1865 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1866
1867 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1868}
1869
1870 unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode(
1871 Register Reg, const TargetRegisterClass *RC, unsigned Size,
1872 const SIMachineFunctionInfo &MFI) const {
1873 bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
1874
1875 // Choose the right opcode if restoring a WWM register.
1876 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1877 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1878
1879 // TODO: Check if AGPRs are available
1880 if (ST.hasMAIInsts())
1881 return getAVSpillRestoreOpcode(Size);
1882
1883 assert(!RI.isAGPRClass(RC));
1884 return getVGPRSpillRestoreOpcode(Size);
1885}
1886
1887 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1888 MachineBasicBlock::iterator MI,
1889 Register DestReg, int FrameIndex,
1890 const TargetRegisterClass *RC,
1891 Register VReg, unsigned SubReg,
1892 MachineInstr::MIFlag Flags) const {
1893 MachineFunction *MF = MBB.getParent();
1894 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1895 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1896 const DebugLoc &DL = MBB.findDebugLoc(MI);
1897 unsigned SpillSize = RI.getSpillSize(*RC);
1898
1899 MachinePointerInfo PtrInfo
1900 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1901
1902 MachineMemOperand *MMO = MF->getMachineMemOperand(
1903 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1904 FrameInfo.getObjectAlign(FrameIndex));
1905
1906 if (RI.isSGPRClass(RC)) {
1907 MFI->setHasSpilledSGPRs();
1908 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1909 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1910 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1911
1912 // FIXME: Maybe this should not include a memoperand because it will be
1913 // lowered to non-memory instructions.
1914 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1915 if (DestReg.isVirtual() && SpillSize == 4) {
1916 MachineRegisterInfo &MRI = MF->getRegInfo();
1917 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1918 }
1919
1920 if (RI.spillSGPRToVGPR())
1921 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1922 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1923 .addFrameIndex(FrameIndex) // addr
1924 .addMemOperand(MMO)
1925 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1926
1927 return;
1928 }
1929
1930 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1931 SpillSize, *MFI);
1932 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1933 .addFrameIndex(FrameIndex) // vaddr
1934 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1935 .addImm(0) // offset
1936 .addMemOperand(MMO);
1937}
1938
1943
1944 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1945 MachineBasicBlock::iterator MI,
1946 unsigned Quantity) const {
1947 DebugLoc DL = MBB.findDebugLoc(MI);
1948 unsigned MaxSNopCount = 1u << ST.getSNopBits();
1949 while (Quantity > 0) {
1950 unsigned Arg = std::min(Quantity, MaxSNopCount);
1951 Quantity -= Arg;
1952 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1953 }
1954}
1955
1956 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1957 auto *MF = MBB.getParent();
1958 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1959
1960 assert(Info->isEntryFunction());
1961
1962 if (MBB.succ_empty()) {
1963 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1964 if (HasNoTerminator) {
1965 if (Info->returnsVoid()) {
1966 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1967 } else {
1968 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1969 }
1970 }
1971 }
1972}
1973
1974 MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
1975 MachineBasicBlock &MBB,
1976 MachineInstr &MI,
1977 const DebugLoc &DL) const {
1978 MachineFunction *MF = MBB.getParent();
1979 constexpr unsigned DoorbellIDMask = 0x3ff;
1980 constexpr unsigned ECQueueWaveAbort = 0x400;
1981
1982 MachineBasicBlock *TrapBB = &MBB;
1983 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
1984
1985 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
1986 MBB.splitAt(MI, /*UpdateLiveIns=*/false);
1987 TrapBB = MF->CreateMachineBasicBlock();
1988 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
1989 MF->push_back(TrapBB);
1990 MBB.addSuccessor(TrapBB);
1991 }
1992 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
1993 // this will be a nop.
1994 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
1995 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
1996 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1997 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
1998 DoorbellReg)
1999 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2000 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2001 .addUse(AMDGPU::M0);
2002 Register DoorbellRegMasked =
2003 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2004 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2005 .addUse(DoorbellReg)
2006 .addImm(DoorbellIDMask);
2007 Register SetWaveAbortBit =
2008 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2009 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2010 .addUse(DoorbellRegMasked)
2011 .addImm(ECQueueWaveAbort);
2012 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2013 .addUse(SetWaveAbortBit);
2014 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2015 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2016 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2017 .addUse(AMDGPU::TTMP2);
2018 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2019 TrapBB->addSuccessor(HaltLoopBB);
2020
2021 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2022 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2023 .addMBB(HaltLoopBB);
2024 MF->push_back(HaltLoopBB);
2025 HaltLoopBB->addSuccessor(HaltLoopBB);
2026
2027 return MBB.getNextNode();
2028}
2029
2030 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2031 switch (MI.getOpcode()) {
2032 default:
2033 if (MI.isMetaInstruction())
2034 return 0;
2035 return 1; // FIXME: Do wait states equal cycles?
2036
2037 case AMDGPU::S_NOP:
2038 return MI.getOperand(0).getImm() + 1;
2039 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2040 // hazard, even if one exists, won't really be visible. Should we handle it?
2041 }
2042}
2043
2044 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2045 MachineBasicBlock &MBB = *MI.getParent();
2046 DebugLoc DL = MBB.findDebugLoc(MI);
2048 switch (MI.getOpcode()) {
2049 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2050 case AMDGPU::S_MOV_B64_term:
2051 // This is only a terminator to get the correct spill code placement during
2052 // register allocation.
2053 MI.setDesc(get(AMDGPU::S_MOV_B64));
2054 break;
2055
2056 case AMDGPU::S_MOV_B32_term:
2057 // This is only a terminator to get the correct spill code placement during
2058 // register allocation.
2059 MI.setDesc(get(AMDGPU::S_MOV_B32));
2060 break;
2061
2062 case AMDGPU::S_XOR_B64_term:
2063 // This is only a terminator to get the correct spill code placement during
2064 // register allocation.
2065 MI.setDesc(get(AMDGPU::S_XOR_B64));
2066 break;
2067
2068 case AMDGPU::S_XOR_B32_term:
2069 // This is only a terminator to get the correct spill code placement during
2070 // register allocation.
2071 MI.setDesc(get(AMDGPU::S_XOR_B32));
2072 break;
2073 case AMDGPU::S_OR_B64_term:
2074 // This is only a terminator to get the correct spill code placement during
2075 // register allocation.
2076 MI.setDesc(get(AMDGPU::S_OR_B64));
2077 break;
2078 case AMDGPU::S_OR_B32_term:
2079 // This is only a terminator to get the correct spill code placement during
2080 // register allocation.
2081 MI.setDesc(get(AMDGPU::S_OR_B32));
2082 break;
2083
2084 case AMDGPU::S_ANDN2_B64_term:
2085 // This is only a terminator to get the correct spill code placement during
2086 // register allocation.
2087 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2088 break;
2089
2090 case AMDGPU::S_ANDN2_B32_term:
2091 // This is only a terminator to get the correct spill code placement during
2092 // register allocation.
2093 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2094 break;
2095
2096 case AMDGPU::S_AND_B64_term:
2097 // This is only a terminator to get the correct spill code placement during
2098 // register allocation.
2099 MI.setDesc(get(AMDGPU::S_AND_B64));
2100 break;
2101
2102 case AMDGPU::S_AND_B32_term:
2103 // This is only a terminator to get the correct spill code placement during
2104 // register allocation.
2105 MI.setDesc(get(AMDGPU::S_AND_B32));
2106 break;
2107
2108 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2109 // This is only a terminator to get the correct spill code placement during
2110 // register allocation.
2111 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2112 break;
2113
2114 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2118 break;
2119
2120 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2121 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2122 break;
2123
2124 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2125 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2126 break;
2127 case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
2128 Register Dst = MI.getOperand(0).getReg();
2129 bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
2130 MI.setDesc(
2131 get(IsAGPR ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::V_MOV_B32_e32));
2132 break;
2133 }
2134 case AMDGPU::AV_MOV_B64_IMM_PSEUDO: {
2135 Register Dst = MI.getOperand(0).getReg();
2136 if (SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst))) {
2137 int64_t Imm = MI.getOperand(1).getImm();
2138
2139 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2140 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2141 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstLo)
2142 .addImm(SignExtend64<32>(Imm));
2143 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DstHi)
2144 .addImm(SignExtend64<32>(Imm >> 32));
2145 MI.eraseFromParent();
2146 break;
2147 }
2148
2149 [[fallthrough]];
2150 }
2151 case AMDGPU::V_MOV_B64_PSEUDO: {
2152 Register Dst = MI.getOperand(0).getReg();
2153 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2154 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2155
2156 const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
2157 const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
2158
2159 const MachineOperand &SrcOp = MI.getOperand(1);
2160 // FIXME: Will this work for 64-bit floating point immediates?
2161 assert(!SrcOp.isFPImm());
2162 if (ST.hasVMovB64Inst() && Mov64RC->contains(Dst)) {
2163 MI.setDesc(Mov64Desc);
2164 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2165 isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
2166 break;
2167 }
2168 if (SrcOp.isImm()) {
2169 APInt Imm(64, SrcOp.getImm());
2170 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2171 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2172 const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
2173 const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
2174
2175 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
2176 PkMovRC->contains(Dst)) {
2177 BuildMI(MBB, MI, DL, PkMovDesc, Dst)
2178 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2179 .addImm(Lo.getSExtValue())
2180 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2181 .addImm(Lo.getSExtValue())
2182 .addImm(0) // op_sel_lo
2183 .addImm(0) // op_sel_hi
2184 .addImm(0) // neg_lo
2185 .addImm(0) // neg_hi
2186 .addImm(0); // clamp
2187 } else {
2188 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2189 .addImm(Lo.getSExtValue());
2190 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2191 .addImm(Hi.getSExtValue());
2192 }
2193 } else {
2194 assert(SrcOp.isReg());
2195 if (ST.hasPkMovB32() &&
2196 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2198 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2199 .addReg(SrcOp.getReg())
2200 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2201 .addReg(SrcOp.getReg())
2202 .addImm(0) // op_sel_lo
2203 .addImm(0) // op_sel_hi
2204 .addImm(0) // neg_lo
2205 .addImm(0) // neg_hi
2206 .addImm(0); // clamp
2207 } else {
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2209 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0));
2210 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2211 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1));
2212 }
2213 }
2214 MI.eraseFromParent();
2215 break;
2216 }
2217 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2218 expandMovDPP64(MI);
2219 break;
2220 }
2221 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2222 const MachineOperand &SrcOp = MI.getOperand(1);
2223 assert(!SrcOp.isFPImm());
2224
2225 if (ST.has64BitLiterals()) {
2226 MI.setDesc(get(AMDGPU::S_MOV_B64));
2227 break;
2228 }
2229
2230 APInt Imm(64, SrcOp.getImm());
2231 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2232 MI.setDesc(get(AMDGPU::S_MOV_B64));
2233 break;
2234 }
2235
2236 Register Dst = MI.getOperand(0).getReg();
2237 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2238 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2239
2240 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2241 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2242 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2243 .addImm(Lo.getSExtValue());
2244 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2245 .addImm(Hi.getSExtValue());
2246 MI.eraseFromParent();
2247 break;
2248 }
2249 case AMDGPU::V_SET_INACTIVE_B32: {
2250 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2251 Register DstReg = MI.getOperand(0).getReg();
2252 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2253 .add(MI.getOperand(3))
2254 .add(MI.getOperand(4))
2255 .add(MI.getOperand(1))
2256 .add(MI.getOperand(2))
2257 .add(MI.getOperand(5));
2258 MI.eraseFromParent();
2259 break;
2260 }
2261 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2262 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2263 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2264 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2265 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2266 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2267 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2268 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2269 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2270 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2271 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2272 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2273 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2274 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2275 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2276 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2277 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2278 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2279 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2280 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V6:
2281 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V7:
2282 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2283 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2284 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2285 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2286 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2287 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2288 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2289 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2290 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2291 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2292 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2293 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2294 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2295
2296 unsigned Opc;
2297 if (RI.hasVGPRs(EltRC)) {
2298 Opc = AMDGPU::V_MOVRELD_B32_e32;
2299 } else {
2300 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2301 : AMDGPU::S_MOVRELD_B32;
2302 }
2303
2304 const MCInstrDesc &OpDesc = get(Opc);
2305 Register VecReg = MI.getOperand(0).getReg();
2306 bool IsUndef = MI.getOperand(1).isUndef();
2307 unsigned SubReg = MI.getOperand(3).getImm();
2308 assert(VecReg == MI.getOperand(1).getReg());
2309
2311 BuildMI(MBB, MI, DL, OpDesc)
2312 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2313 .add(MI.getOperand(2))
2314 .addReg(VecReg, RegState::ImplicitDefine)
2315 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2316
2317 const int ImpDefIdx =
2318 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2319 const int ImpUseIdx = ImpDefIdx + 1;
2320 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2321 MI.eraseFromParent();
2322 break;
2323 }
2324 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2325 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2326 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2327 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2328 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2329 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V6:
2330 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V7:
2331 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2332 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2333 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2334 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2335 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2336 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2337 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2338 assert(ST.useVGPRIndexMode());
2339 Register VecReg = MI.getOperand(0).getReg();
2340 bool IsUndef = MI.getOperand(1).isUndef();
2341 MachineOperand &Idx = MI.getOperand(3);
2342 Register SubReg = MI.getOperand(4).getImm();
2343
2344 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2345 .add(Idx)
2346 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2347 SetOn->getOperand(3).setIsUndef();
2348
2349 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2350 MachineInstrBuilder MIB =
2351 BuildMI(MBB, MI, DL, OpDesc)
2352 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2353 .add(MI.getOperand(2))
2354 .addReg(VecReg, RegState::ImplicitDefine)
2355 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2356
2357 const int ImpDefIdx =
2358 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2359 const int ImpUseIdx = ImpDefIdx + 1;
2360 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2361
2362 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2363
2364 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2365
2366 MI.eraseFromParent();
2367 break;
2368 }
2369 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2370 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2371 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2372 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2373 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2374 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V6:
2375 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V7:
2376 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2377 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2378 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2379 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2380 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2381 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2382 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2383 assert(ST.useVGPRIndexMode());
2384 Register Dst = MI.getOperand(0).getReg();
2385 Register VecReg = MI.getOperand(1).getReg();
2386 bool IsUndef = MI.getOperand(1).isUndef();
2387 Register SubReg = MI.getOperand(3).getImm();
2388
2389 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2390 .add(MI.getOperand(2))
2391 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2392 SetOn->getOperand(3).setIsUndef();
2393
2394 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2395 .addDef(Dst)
2396 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2397 .addReg(VecReg, RegState::Implicit | getUndefRegState(IsUndef));
2398
2399 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2400
2401 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2402
2403 MI.eraseFromParent();
2404 break;
2405 }
2406 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2407 MachineFunction &MF = *MBB.getParent();
2408 Register Reg = MI.getOperand(0).getReg();
2409 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2410 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2411 MachineOperand OpLo = MI.getOperand(1);
2412 MachineOperand OpHi = MI.getOperand(2);
2413
2414 // Create a bundle so these instructions won't be re-ordered by the
2415 // post-RA scheduler.
2416 MIBundleBuilder Bundler(MBB, MI);
2417 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2418
2419 // What we want here is an offset from the value returned by s_getpc (which
2420 // is the address of the s_add_u32 instruction) to the global variable, but
2421 // since the encoding of $symbol starts 4 bytes after the start of the
2422 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2423 // small. This requires us to add 4 to the global variable offset in order
2424 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2425 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2426 // instruction.
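  // Illustrative sketch (the concrete addresses below are invented, not taken
  // from the source): if s_getpc_b64 returns address A (the address of the
  // following s_add_u32), then the $symbol literal of s_add_u32 is encoded at
  // A + 4 and that of s_addc_u32 at A + 12, so emitting the relocations with
  // +4 and +12 added to the global's offset yields the correct PC-relative
  // address.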
2427
2428 int64_t Adjust = 0;
2429 if (ST.hasGetPCZeroExtension()) {
2430 // Fix up hardware that does not sign-extend the 48-bit PC value by
2431 // inserting: s_sext_i32_i16 reghi, reghi
2432 Bundler.append(
2433 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2434 Adjust += 4;
2435 }
2436
2437 if (OpLo.isGlobal())
2438 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2439 Bundler.append(
2440 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2441
2442 if (OpHi.isGlobal())
2443 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2444 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2445 .addReg(RegHi)
2446 .add(OpHi));
2447
2448 finalizeBundle(MBB, Bundler.begin());
2449
2450 MI.eraseFromParent();
2451 break;
2452 }
2453 case AMDGPU::SI_PC_ADD_REL_OFFSET64: {
2454 MachineFunction &MF = *MBB.getParent();
2455 Register Reg = MI.getOperand(0).getReg();
2456 MachineOperand Op = MI.getOperand(1);
2457
2458 // Create a bundle so these instructions won't be re-ordered by the
2459 // post-RA scheduler.
2460 MIBundleBuilder Bundler(MBB, MI);
2461 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2462 if (Op.isGlobal())
2463 Op.setOffset(Op.getOffset() + 4);
2464 Bundler.append(
2465 BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op));
2466
2467 finalizeBundle(MBB, Bundler.begin());
2468
2469 MI.eraseFromParent();
2470 break;
2471 }
2472 case AMDGPU::ENTER_STRICT_WWM: {
2473 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2474 // Whole Wave Mode is entered.
2475 MI.setDesc(get(LMC.OrSaveExecOpc));
2476 break;
2477 }
2478 case AMDGPU::ENTER_STRICT_WQM: {
2479 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2480 // STRICT_WQM is entered.
2481 BuildMI(MBB, MI, DL, get(LMC.MovOpc), MI.getOperand(0).getReg())
2482 .addReg(LMC.ExecReg);
2483 BuildMI(MBB, MI, DL, get(LMC.WQMOpc), LMC.ExecReg).addReg(LMC.ExecReg);
2484
2485 MI.eraseFromParent();
2486 break;
2487 }
2488 case AMDGPU::EXIT_STRICT_WWM:
2489 case AMDGPU::EXIT_STRICT_WQM: {
2490 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2491 // WWM/STRICT_WQM is exited.
2492 MI.setDesc(get(LMC.MovOpc));
2493 break;
2494 }
2495 case AMDGPU::SI_RETURN: {
2496 const MachineFunction *MF = MBB.getParent();
2497 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2498 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2499 // Hiding the return address use with SI_RETURN may lead to extra kills in
2500 // the function and missing live-ins. We are fine in practice because callee
2501 // saved register handling ensures the register value is restored before
2502 // RET, but we need the undef flag here to appease the MachineVerifier
2503 // liveness checks.
2504 MachineInstrBuilder MIB =
2505 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2506 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2507
2508 MIB.copyImplicitOps(MI);
2509 MI.eraseFromParent();
2510 break;
2511 }
2512
2513 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2514 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2515 MI.setDesc(get(AMDGPU::S_MUL_U64));
2516 break;
2517
2518 case AMDGPU::S_GETPC_B64_pseudo:
2519 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2520 if (ST.hasGetPCZeroExtension()) {
2521 Register Dst = MI.getOperand(0).getReg();
2522 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2523 // Fix up hardware that does not sign-extend the 48-bit PC value by
2524 // inserting: s_sext_i32_i16 dsthi, dsthi
2525 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2526 DstHi)
2527 .addReg(DstHi);
2528 }
2529 break;
2530
2531 case AMDGPU::V_MAX_BF16_PSEUDO_e64: {
2532 assert(ST.hasBF16PackedInsts());
2533 MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2534 MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2535 MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2536 MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2537 auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2538 Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2539 auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2540 Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2541 break;
2542 }
2543
2544 case AMDGPU::GET_STACK_BASE:
2545 // The stack starts at offset 0 unless we need to reserve some space at the
2546 // bottom.
2547 if (ST.getFrameLowering()->mayReserveScratchForCWSR(*MBB.getParent())) {
2548 // When CWSR is used in dynamic VGPR mode, the trap handler needs to save
2549 // some of the VGPRs. The size of the required scratch space has already
2550 // been computed by prolog epilog insertion.
2551 const SIMachineFunctionInfo *MFI =
2552 MBB.getParent()->getInfo<SIMachineFunctionInfo>();
2553 unsigned VGPRSize = MFI->getScratchReservedForDynamicVGPRs();
2554 Register DestReg = MI.getOperand(0).getReg();
2555 BuildMI(MBB, MI, DL, get(AMDGPU::S_GETREG_B32), DestReg)
2558 // The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
2559 // (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
2560 // SCC, so we need to check for 0 manually.
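    // A rough sketch of the resulting expansion (operand details and the
    // hwreg field encoding are elided here):
    //   s_getreg_b32  dest, <ME_ID field of the hardware ID register>
    //   s_cmp_lg_u32  0, dest
    //   s_cmovk_i32   dest, <scratch reserved for dynamic VGPRs>
    // so dest ends up 0 on the graphics queue and the reserved size on compute.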
2561 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(DestReg);
2562 // Change the implicit-def of SCC to an explicit use (but first remove
2563 // the dead flag if present).
2564 MI.getOperand(MI.getNumExplicitOperands()).setIsDead(false);
2565 MI.getOperand(MI.getNumExplicitOperands()).setIsUse();
2566 MI.setDesc(get(AMDGPU::S_CMOVK_I32));
2567 MI.addOperand(MachineOperand::CreateImm(VGPRSize));
2568 } else {
2569 MI.setDesc(get(AMDGPU::S_MOV_B32));
2570 MI.addOperand(MachineOperand::CreateImm(0));
2571 MI.removeOperand(
2572 MI.getNumExplicitOperands()); // Drop implicit def of SCC.
2573 }
2574 break;
2575 }
2576
2577 return true;
2578}
2579
2580 void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2581 MachineBasicBlock::iterator I, Register DestReg,
2582 unsigned SubIdx, const MachineInstr &Orig,
2583 LaneBitmask UsedLanes) const {
2584
2585 // Try shrinking the instruction to remat only the part needed for the
2586 // current context.
2587 // TODO: Handle more cases.
2588 unsigned Opcode = Orig.getOpcode();
2589 switch (Opcode) {
2590 case AMDGPU::S_MOV_B64:
2591 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2592 if (SubIdx != 0)
2593 break;
2594
2595 if (!Orig.getOperand(1).isImm())
2596 break;
2597
2598 // Shrink S_MOV_B64 to S_MOV_B32 when UsedLanes indicates only a single
2599 // 32-bit lane of the 64-bit value is live at the rematerialization point.
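    // Illustrative example (values invented for exposition): rematerializing
    // S_MOV_B64 %dst, 0x0000000100000002 at a point where only %dst.sub0 is
    // live can be shrunk to S_MOV_B32 %dst.sub0, 0x00000002, since the high
    // half of the immediate is never read there.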
2600 if (UsedLanes.all())
2601 break;
2602
2603 // Determine which half of the 64-bit immediate corresponds to the use.
2604 unsigned OrigSubReg = Orig.getOperand(0).getSubReg();
2605 unsigned LoSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub0);
2606 unsigned HiSubReg = RI.composeSubRegIndices(OrigSubReg, AMDGPU::sub1);
2607
2608 bool NeedLo = (UsedLanes & RI.getSubRegIndexLaneMask(LoSubReg)).any();
2609 bool NeedHi = (UsedLanes & RI.getSubRegIndexLaneMask(HiSubReg)).any();
2610
2611 if (NeedLo && NeedHi)
2612 break;
2613
2614 int64_t Imm64 = Orig.getOperand(1).getImm();
2615 int32_t Imm32 = NeedLo ? Lo_32(Imm64) : Hi_32(Imm64);
2616
2617 unsigned UseSubReg = NeedLo ? LoSubReg : HiSubReg;
2618
2619 // Emit S_MOV_B32 defining just the needed 32-bit subreg of DestReg.
2620 BuildMI(MBB, I, Orig.getDebugLoc(), get(AMDGPU::S_MOV_B32))
2621 .addReg(DestReg, RegState::Define | RegState::Undef, UseSubReg)
2622 .addImm(Imm32);
2623 return;
2624 }
2625
2626 case AMDGPU::S_LOAD_DWORDX16_IMM:
2627 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2628 if (SubIdx != 0)
2629 break;
2630
2631 if (I == MBB.end())
2632 break;
2633
2634 if (I->isBundled())
2635 break;
2636
2637 // Look for a single use of the register that is also a subreg.
2638 Register RegToFind = Orig.getOperand(0).getReg();
2639 MachineOperand *UseMO = nullptr;
2640 for (auto &CandMO : I->operands()) {
2641 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2642 continue;
2643 if (UseMO) {
2644 UseMO = nullptr;
2645 break;
2646 }
2647 UseMO = &CandMO;
2648 }
2649 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2650 break;
2651
2652 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2653 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2654
2655 MachineFunction *MF = MBB.getParent();
2656 MachineRegisterInfo &MRI = MF->getRegInfo();
2657 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2658
2659 unsigned NewOpcode = -1;
2660 if (SubregSize == 256)
2661 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2662 else if (SubregSize == 128)
2663 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2664 else
2665 break;
2666
2667 const MCInstrDesc &TID = get(NewOpcode);
2668 const TargetRegisterClass *NewRC =
2669 RI.getAllocatableClass(getRegClass(TID, 0));
2670 MRI.setRegClass(DestReg, NewRC);
2671
2672 UseMO->setReg(DestReg);
2673 UseMO->setSubReg(AMDGPU::NoSubRegister);
2674
2675 // Use a smaller load with the desired size, possibly with updated offset.
2676 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2677 MI->setDesc(TID);
2678 MI->getOperand(0).setReg(DestReg);
2679 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2680 if (Offset) {
2681 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2682 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2683 OffsetMO->setImm(FinalOffset);
2684 }
2685 SmallVector<MachineMemOperand *> NewMMOs;
2686 for (const MachineMemOperand *MemOp : Orig.memoperands())
2687 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2688 SubregSize / 8));
2689 MI->setMemRefs(*MF, NewMMOs);
2690
2691 MBB.insert(I, MI);
2692 return;
2693 }
2694
2695 default:
2696 break;
2697 }
2698
2699 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, UsedLanes);
2700}
2701
2702std::pair<MachineInstr*, MachineInstr*>
2703 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2704 assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2705
2706 if (ST.hasVMovB64Inst() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
2707 AMDGPU::isLegalDPALU_DPPControl(
2708 ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2709 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2710 return std::pair(&MI, nullptr);
2711 }
2712
2713 MachineBasicBlock &MBB = *MI.getParent();
2714 DebugLoc DL = MBB.findDebugLoc(MI);
2715 MachineFunction *MF = MBB.getParent();
2716 MachineRegisterInfo &MRI = MF->getRegInfo();
2717 Register Dst = MI.getOperand(0).getReg();
2718 unsigned Part = 0;
2719 MachineInstr *Split[2];
2720
2721 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2722 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2723 if (Dst.isPhysical()) {
2724 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2725 } else {
2726 assert(MRI.isSSA());
2727 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2728 MovDPP.addDef(Tmp);
2729 }
2730
2731 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2732 const MachineOperand &SrcOp = MI.getOperand(I);
2733 assert(!SrcOp.isFPImm());
2734 if (SrcOp.isImm()) {
2735 APInt Imm(64, SrcOp.getImm());
2736 Imm.ashrInPlace(Part * 32);
2737 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2738 } else {
2739 assert(SrcOp.isReg());
2740 Register Src = SrcOp.getReg();
2741 if (Src.isPhysical())
2742 MovDPP.addReg(RI.getSubReg(Src, Sub));
2743 else
2744 MovDPP.addReg(Src, getUndefRegState(SrcOp.isUndef()), Sub);
2745 }
2746 }
2747
2748 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2749 MovDPP.addImm(MO.getImm());
2750
2751 Split[Part] = MovDPP;
2752 ++Part;
2753 }
2754
2755 if (Dst.isVirtual())
2756 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2757 .addReg(Split[0]->getOperand(0).getReg())
2758 .addImm(AMDGPU::sub0)
2759 .addReg(Split[1]->getOperand(0).getReg())
2760 .addImm(AMDGPU::sub1);
2761
2762 MI.eraseFromParent();
2763 return std::pair(Split[0], Split[1]);
2764}
2765
2766std::optional<DestSourcePair>
2767 SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2768 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2769 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2770
2771 return std::nullopt;
2772}
2773
2774 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0,
2775 AMDGPU::OpName Src0OpName,
2776 MachineOperand &Src1,
2777 AMDGPU::OpName Src1OpName) const {
2778 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2779 if (!Src0Mods)
2780 return false;
2781
2782 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2783 assert(Src1Mods &&
2784 "All commutable instructions have both src0 and src1 modifiers");
2785
2786 int Src0ModsVal = Src0Mods->getImm();
2787 int Src1ModsVal = Src1Mods->getImm();
2788
2789 Src1Mods->setImm(Src0ModsVal);
2790 Src0Mods->setImm(Src1ModsVal);
2791 return true;
2792}
2793
2794 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2795 MachineOperand &RegOp,
2796 MachineOperand &NonRegOp) {
2797 Register Reg = RegOp.getReg();
2798 unsigned SubReg = RegOp.getSubReg();
2799 bool IsKill = RegOp.isKill();
2800 bool IsDead = RegOp.isDead();
2801 bool IsUndef = RegOp.isUndef();
2802 bool IsDebug = RegOp.isDebug();
2803
2804 if (NonRegOp.isImm())
2805 RegOp.ChangeToImmediate(NonRegOp.getImm());
2806 else if (NonRegOp.isFI())
2807 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2808 else if (NonRegOp.isGlobal()) {
2809 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2810 NonRegOp.getTargetFlags());
2811 } else
2812 return nullptr;
2813
2814 // Make sure we don't reinterpret a subreg index in the target flags.
2815 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2816
2817 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2818 NonRegOp.setSubReg(SubReg);
2819
2820 return &MI;
2821}
2822
2823 static MachineInstr *swapImmOperands(MachineInstr &MI,
2824 MachineOperand &NonRegOp1,
2825 MachineOperand &NonRegOp2) {
2826 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2827 int64_t NonRegVal = NonRegOp1.getImm();
2828
2829 NonRegOp1.setImm(NonRegOp2.getImm());
2830 NonRegOp2.setImm(NonRegVal);
2831 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2832 NonRegOp2.setTargetFlags(TargetFlags);
2833 return &MI;
2834}
2835
2836bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2837 unsigned OpIdx1) const {
2838 const MCInstrDesc &InstDesc = MI.getDesc();
2839 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2840 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2841
2842 unsigned Opc = MI.getOpcode();
2843 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2844
2845 const MachineOperand &MO0 = MI.getOperand(OpIdx0);
2846 const MachineOperand &MO1 = MI.getOperand(OpIdx1);
2847
2848 // Swap doesn't breach constant bus or literal limits.
2849 // It may move a literal to a position other than src0, which is not allowed
2850 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2851 // FIXME: After gfx9, a literal can be in a place other than Src0.
2852 if (isVALU(MI)) {
2853 if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
2854 !isInlineConstant(MO0, OpInfo1))
2855 return false;
2856 if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
2857 !isInlineConstant(MO1, OpInfo0))
2858 return false;
2859 }
2860
2861 if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
2862 if (OpInfo1.RegClass == -1)
2863 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2864 return isLegalRegOperand(MI, OpIdx1, MO0) &&
2865 (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
2866 }
2867 if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
2868 if (OpInfo0.RegClass == -1)
2869 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2870 return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
2871 isLegalRegOperand(MI, OpIdx0, MO1);
2872 }
2873
2874 // No need to check 64-bit literals since swapping does not bring new
2875 // 64-bit literals into the current instruction to fold to 32-bit.
2876
2877 return isImmOperandLegal(MI, OpIdx1, MO0);
2878}
2879
2880 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2881 unsigned Src0Idx,
2882 unsigned Src1Idx) const {
2883 assert(!NewMI && "this should never be used");
2884
2885 unsigned Opc = MI.getOpcode();
2886 int CommutedOpcode = commuteOpcode(Opc);
2887 if (CommutedOpcode == -1)
2888 return nullptr;
2889
2890 if (Src0Idx > Src1Idx)
2891 std::swap(Src0Idx, Src1Idx);
2892
2893 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2894 static_cast<int>(Src0Idx) &&
2895 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2896 static_cast<int>(Src1Idx) &&
2897 "inconsistency with findCommutedOpIndices");
2898
2899 if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
2900 return nullptr;
2901
2902 MachineInstr *CommutedMI = nullptr;
2903 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2904 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2905 if (Src0.isReg() && Src1.isReg()) {
2906 // Be sure to copy the source modifiers to the right place.
2907 CommutedMI =
2908 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2909 } else if (Src0.isReg() && !Src1.isReg()) {
2910 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2911 } else if (!Src0.isReg() && Src1.isReg()) {
2912 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2913 } else if (Src0.isImm() && Src1.isImm()) {
2914 CommutedMI = swapImmOperands(MI, Src0, Src1);
2915 } else {
2916 // FIXME: Found two non registers to commute. This does happen.
2917 return nullptr;
2918 }
2919
2920 if (CommutedMI) {
2921 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2922 Src1, AMDGPU::OpName::src1_modifiers);
2923
2924 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2925 AMDGPU::OpName::src1_sel);
2926
2927 CommutedMI->setDesc(get(CommutedOpcode));
2928 }
2929
2930 return CommutedMI;
2931}
2932
2933// This needs to be implemented because the source modifiers may be inserted
2934// between the true commutable operands, and the base
2935// TargetInstrInfo::commuteInstruction uses it.
2936 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2937 unsigned &SrcOpIdx0,
2938 unsigned &SrcOpIdx1) const {
2939 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2940}
2941
2942 bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2943 unsigned &SrcOpIdx0,
2944 unsigned &SrcOpIdx1) const {
2945 if (!Desc.isCommutable())
2946 return false;
2947
2948 unsigned Opc = Desc.getOpcode();
2949 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2950 if (Src0Idx == -1)
2951 return false;
2952
2953 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2954 if (Src1Idx == -1)
2955 return false;
2956
2957 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2958}
2959
2960 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2961 int64_t BrOffset) const {
2962 // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64
2963 // because its dest block is unanalyzable.
2964 assert(isSOPP(BranchOp) || isSOPK(BranchOp));
2965
2966 // Convert to dwords.
2967 BrOffset /= 4;
2968
2969 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2970 // from the next instruction.
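  // Worked example (numbers are illustrative only): a branch whose target lies
  // 12 bytes past the branch instruction encodes SIMM16 = 12/4 - 1 = 2,
  // because the hardware first advances PC by 4 to the next instruction and
  // then adds the scaled immediate.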
2971 BrOffset -= 1;
2972
2973 return isIntN(BranchOffsetBits, BrOffset);
2974}
2975
2976 MachineBasicBlock *
2977 SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2978 return MI.getOperand(0).getMBB();
2979}
2980
2982 for (const MachineInstr &MI : MBB->terminators()) {
2983 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2984 MI.getOpcode() == AMDGPU::SI_LOOP)
2985 return true;
2986 }
2987 return false;
2988}
2989
2990 void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2991 MachineBasicBlock &DestBB,
2992 MachineBasicBlock &RestoreBB,
2993 const DebugLoc &DL, int64_t BrOffset,
2994 RegScavenger *RS) const {
2995 assert(MBB.empty() &&
2996 "new block should be inserted for expanding unconditional branch");
2997 assert(MBB.pred_size() == 1);
2998 assert(RestoreBB.empty() &&
2999 "restore block should be inserted for restoring clobbered registers");
3000
3001 MachineFunction *MF = MBB.getParent();
3002 MachineRegisterInfo &MRI = MF->getRegInfo();
3003 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3004 auto I = MBB.end();
3005 auto &MCCtx = MF->getContext();
3006
3007 if (ST.useAddPC64Inst()) {
3008 MCSymbol *Offset =
3009 MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true);
3010 auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64))
3011 .addSym(Offset, MO_FAR_BRANCH_OFFSET);
3012 MCSymbol *PostAddPCLabel =
3013 MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true);
3014 AddPC->setPostInstrSymbol(*MF, PostAddPCLabel);
3015 auto *OffsetExpr = MCBinaryExpr::createSub(
3016 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
3017 MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx);
3018 Offset->setVariableValue(OffsetExpr);
3019 return;
3020 }
3021
3022 assert(RS && "RegScavenger required for long branching");
3023
3024 // FIXME: Virtual register workaround for RegScavenger not working with empty
3025 // blocks.
3026 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3027
3028 // Note: as this is used after hazard recognizer we need to apply some hazard
3029 // workarounds directly.
3030 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
3031 ST.hasVALUReadSGPRHazard();
3032 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
3033 if (FlushSGPRWrites)
3034 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
3036 };
3037
3038 // We need to compute the offset relative to the instruction immediately after
3039 // s_getpc_b64. Insert pc arithmetic code before last terminator.
3040 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
3041 ApplyHazardWorkarounds();
3042
3043 MCSymbol *PostGetPCLabel =
3044 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
3045 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
3046
3047 MCSymbol *OffsetLo =
3048 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
3049 MCSymbol *OffsetHi =
3050 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
3051 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
3052 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
3053 .addReg(PCReg, {}, AMDGPU::sub0)
3054 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
3055 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
3056 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
3057 .addReg(PCReg, {}, AMDGPU::sub1)
3058 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
3059 ApplyHazardWorkarounds();
3060
3061 // Insert the indirect branch after the other terminator.
3062 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
3063 .addReg(PCReg);
3064
3065 // If a spill is needed for the pc register pair, we need to insert a spill
3066 // restore block right before the destination block, and insert a short branch
3067 // into the old destination block's fallthrough predecessor.
3068 // e.g.:
3069 //
3070 // s_cbranch_scc0 skip_long_branch:
3071 //
3072 // long_branch_bb:
3073 // spill s[8:9]
3074 // s_getpc_b64 s[8:9]
3075 // s_add_u32 s8, s8, restore_bb
3076 // s_addc_u32 s9, s9, 0
3077 // s_setpc_b64 s[8:9]
3078 //
3079 // skip_long_branch:
3080 // foo;
3081 //
3082 // .....
3083 //
3084 // dest_bb_fallthrough_predecessor:
3085 // bar;
3086 // s_branch dest_bb
3087 //
3088 // restore_bb:
3089 // restore s[8:9]
3090 // fallthrough dest_bb
3091 ///
3092 // dest_bb:
3093 // buzz;
3094
3095 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3096 Register Scav;
3097
3098 // If we've previously reserved a register for long branches,
3099 // avoid running the scavenger and just use that register.
3100 if (LongBranchReservedReg) {
3101 RS->enterBasicBlock(MBB);
3102 Scav = LongBranchReservedReg;
3103 } else {
3104 RS->enterBasicBlockEnd(MBB);
3105 Scav = RS->scavengeRegisterBackwards(
3106 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3107 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3108 }
3109 if (Scav) {
3110 RS->setRegUsed(Scav);
3111 MRI.replaceRegWith(PCReg, Scav);
3112 MRI.clearVirtRegs();
3113 } else {
3114 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3115 // SGPR spill.
3116 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3117 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3118 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3119 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3120 MRI.clearVirtRegs();
3121 }
3122
3123 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3124 // Now the distance can be defined.
3125 auto *Offset = MCBinaryExpr::createSub(
3126 MCSymbolRefExpr::create(DestLabel, MCCtx),
3127 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3128 // Add offset assignments.
3129 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3130 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3131 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3132 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3133}
3134
3135unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3136 switch (Cond) {
3137 case SIInstrInfo::SCC_TRUE:
3138 return AMDGPU::S_CBRANCH_SCC1;
3139 case SIInstrInfo::SCC_FALSE:
3140 return AMDGPU::S_CBRANCH_SCC0;
3141 case SIInstrInfo::VCCNZ:
3142 return AMDGPU::S_CBRANCH_VCCNZ;
3143 case SIInstrInfo::VCCZ:
3144 return AMDGPU::S_CBRANCH_VCCZ;
3145 case SIInstrInfo::EXECNZ:
3146 return AMDGPU::S_CBRANCH_EXECNZ;
3147 case SIInstrInfo::EXECZ:
3148 return AMDGPU::S_CBRANCH_EXECZ;
3149 default:
3150 llvm_unreachable("invalid branch predicate");
3151 }
3152}
3153
3154SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3155 switch (Opcode) {
3156 case AMDGPU::S_CBRANCH_SCC0:
3157 return SCC_FALSE;
3158 case AMDGPU::S_CBRANCH_SCC1:
3159 return SCC_TRUE;
3160 case AMDGPU::S_CBRANCH_VCCNZ:
3161 return VCCNZ;
3162 case AMDGPU::S_CBRANCH_VCCZ:
3163 return VCCZ;
3164 case AMDGPU::S_CBRANCH_EXECNZ:
3165 return EXECNZ;
3166 case AMDGPU::S_CBRANCH_EXECZ:
3167 return EXECZ;
3168 default:
3169 return INVALID_BR;
3170 }
3171}
3172
3173 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3174 MachineBasicBlock::iterator I,
3175 MachineBasicBlock *&TBB,
3176 MachineBasicBlock *&FBB,
3177 SmallVectorImpl<MachineOperand> &Cond,
3178 bool AllowModify) const {
3179 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3180 // Unconditional Branch
3181 TBB = I->getOperand(0).getMBB();
3182 return false;
3183 }
3184
3185 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3186 if (Pred == INVALID_BR)
3187 return true;
3188
3189 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3190 Cond.push_back(MachineOperand::CreateImm(Pred));
3191 Cond.push_back(I->getOperand(1)); // Save the branch register.
3192
3193 ++I;
3194
3195 if (I == MBB.end()) {
3196 // Conditional branch followed by fall-through.
3197 TBB = CondBB;
3198 return false;
3199 }
3200
3201 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3202 TBB = CondBB;
3203 FBB = I->getOperand(0).getMBB();
3204 return false;
3205 }
3206
3207 return true;
3208}
3209
3211 MachineBasicBlock *&FBB,
3213 bool AllowModify) const {
3214 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3215 auto E = MBB.end();
3216 if (I == E)
3217 return false;
3218
3219 // Skip over the instructions that are artificial terminators for special
3220 // exec management.
3221 while (I != E && !I->isBranch() && !I->isReturn()) {
3222 switch (I->getOpcode()) {
3223 case AMDGPU::S_MOV_B64_term:
3224 case AMDGPU::S_XOR_B64_term:
3225 case AMDGPU::S_OR_B64_term:
3226 case AMDGPU::S_ANDN2_B64_term:
3227 case AMDGPU::S_AND_B64_term:
3228 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3229 case AMDGPU::S_MOV_B32_term:
3230 case AMDGPU::S_XOR_B32_term:
3231 case AMDGPU::S_OR_B32_term:
3232 case AMDGPU::S_ANDN2_B32_term:
3233 case AMDGPU::S_AND_B32_term:
3234 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3235 break;
3236 case AMDGPU::SI_IF:
3237 case AMDGPU::SI_ELSE:
3238 case AMDGPU::SI_KILL_I1_TERMINATOR:
3239 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3240 // FIXME: It's messy that these need to be considered here at all.
3241 return true;
3242 default:
3243 llvm_unreachable("unexpected non-branch terminator inst");
3244 }
3245
3246 ++I;
3247 }
3248
3249 if (I == E)
3250 return false;
3251
3252 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3253}
3254
3255 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3256 int *BytesRemoved) const {
3257 unsigned Count = 0;
3258 unsigned RemovedSize = 0;
3259 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3260 // Skip over artificial terminators when removing instructions.
3261 if (MI.isBranch() || MI.isReturn()) {
3262 RemovedSize += getInstSizeInBytes(MI);
3263 MI.eraseFromParent();
3264 ++Count;
3265 }
3266 }
3267
3268 if (BytesRemoved)
3269 *BytesRemoved = RemovedSize;
3270
3271 return Count;
3272}
3273
3274// Copy the flags onto the implicit condition register operand.
3275 static void preserveCondRegFlags(MachineOperand &CondReg,
3276 const MachineOperand &OrigCond) {
3277 CondReg.setIsUndef(OrigCond.isUndef());
3278 CondReg.setIsKill(OrigCond.isKill());
3279}
3280
3281 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3282 MachineBasicBlock *TBB,
3283 MachineBasicBlock *FBB,
3284 ArrayRef<MachineOperand> Cond,
3285 const DebugLoc &DL,
3286 int *BytesAdded) const {
3287 if (!FBB && Cond.empty()) {
3288 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3289 .addMBB(TBB);
3290 if (BytesAdded)
3291 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3292 return 1;
3293 }
3294
3295 assert(TBB && Cond[0].isImm());
3296
3297 unsigned Opcode
3298 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3299
3300 if (!FBB) {
3301 MachineInstr *CondBr =
3302 BuildMI(&MBB, DL, get(Opcode))
3303 .addMBB(TBB);
3304
3305 // Copy the flags onto the implicit condition register operand.
3306 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3307 fixImplicitOperands(*CondBr);
3308
3309 if (BytesAdded)
3310 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3311 return 1;
3312 }
3313
3314 assert(TBB && FBB);
3315
3316 MachineInstr *CondBr =
3317 BuildMI(&MBB, DL, get(Opcode))
3318 .addMBB(TBB);
3319 fixImplicitOperands(*CondBr);
3320 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3321 .addMBB(FBB);
3322
3323 MachineOperand &CondReg = CondBr->getOperand(1);
3324 CondReg.setIsUndef(Cond[1].isUndef());
3325 CondReg.setIsKill(Cond[1].isKill());
3326
3327 if (BytesAdded)
3328 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3329
3330 return 2;
3331}
3332
3333 bool SIInstrInfo::reverseBranchCondition(
3334 SmallVectorImpl<MachineOperand> &Cond) const {
3335 if (Cond.size() != 2) {
3336 return true;
3337 }
3338
3339 if (Cond[0].isImm()) {
3340 Cond[0].setImm(-Cond[0].getImm());
3341 return false;
3342 }
3343
3344 return true;
3345}
3346
3347 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3348 ArrayRef<MachineOperand> Cond,
3349 Register DstReg, Register TrueReg,
3350 Register FalseReg, int &CondCycles,
3351 int &TrueCycles, int &FalseCycles) const {
3352 switch (Cond[0].getImm()) {
3353 case VCCNZ:
3354 case VCCZ: {
3355 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3356 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3357 if (MRI.getRegClass(FalseReg) != RC)
3358 return false;
3359
3360 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3361 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3362
3363 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3364 return RI.hasVGPRs(RC) && NumInsts <= 6;
3365 }
3366 case SCC_TRUE:
3367 case SCC_FALSE: {
3368 // FIXME: We could insert for VGPRs if we could replace the original compare
3369 // with a vector one.
3370 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3371 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3372 if (MRI.getRegClass(FalseReg) != RC)
3373 return false;
3374
3375 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3376
3377 // Sizes that are a multiple of 64 bits can use s_cselect_b64
3378 if (NumInsts % 2 == 0)
3379 NumInsts /= 2;
3380
3381 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3382 return RI.isSGPRClass(RC);
3383 }
3384 default:
3385 return false;
3386 }
3387}
3388
3389 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3390 MachineBasicBlock::iterator I, const DebugLoc &DL,
3391 Register DstReg, ArrayRef<MachineOperand> Cond,
3392 Register TrueReg, Register FalseReg) const {
3393 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3394 if (Pred == VCCZ || Pred == SCC_FALSE) {
3395 Pred = static_cast<BranchPredicate>(-Pred);
3396 std::swap(TrueReg, FalseReg);
3397 }
3398
3399 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3400 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3401 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3402
3403 if (DstSize == 32) {
3405 if (Pred == SCC_TRUE) {
3406 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3407 .addReg(TrueReg)
3408 .addReg(FalseReg);
3409 } else {
3410 // Instruction's operands are backwards from what is expected.
3411 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3412 .addReg(FalseReg)
3413 .addReg(TrueReg);
3414 }
3415
3416 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3417 return;
3418 }
3419
3420 if (DstSize == 64 && Pred == SCC_TRUE) {
3422 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3423 .addReg(TrueReg)
3424 .addReg(FalseReg);
3425
3426 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3427 return;
3428 }
3429
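 // For wider results there is no single select instruction, so the code below
 // selects each 32-bit (or, for SALU, 64-bit) piece with its own conditional
 // move and then reassembles the pieces into DstReg with a REG_SEQUENCE.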
3430 static const int16_t Sub0_15[] = {
3431 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3432 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3433 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3434 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3435 };
3436
3437 static const int16_t Sub0_15_64[] = {
3438 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3439 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3440 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3441 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3442 };
3443
3444 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3445 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3446 const int16_t *SubIndices = Sub0_15;
3447 int NElts = DstSize / 32;
3448
3449 // 64-bit select is only available for SALU.
3450 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3451 if (Pred == SCC_TRUE) {
3452 if (NElts % 2) {
3453 SelOp = AMDGPU::S_CSELECT_B32;
3454 EltRC = &AMDGPU::SGPR_32RegClass;
3455 } else {
3456 SelOp = AMDGPU::S_CSELECT_B64;
3457 EltRC = &AMDGPU::SGPR_64RegClass;
3458 SubIndices = Sub0_15_64;
3459 NElts /= 2;
3460 }
3461 }
3462
3463 MachineInstrBuilder MIB = BuildMI(
3464 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3465
3466 I = MIB->getIterator();
3467
3468 SmallVector<Register, 8> Regs;
3469 for (int Idx = 0; Idx != NElts; ++Idx) {
3470 Register DstElt = MRI.createVirtualRegister(EltRC);
3471 Regs.push_back(DstElt);
3472
3473 unsigned SubIdx = SubIndices[Idx];
3474
3475 MachineInstr *Select;
3476 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3477 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3478 .addReg(FalseReg, {}, SubIdx)
3479 .addReg(TrueReg, {}, SubIdx);
3480 } else {
3481 Select = BuildMI(MBB, I, DL, get(SelOp), DstElt)
3482 .addReg(TrueReg, {}, SubIdx)
3483 .addReg(FalseReg, {}, SubIdx);
3484 }
3485
3486 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3487 fixImplicitOperands(*Select);
3488
3489 MIB.addReg(DstElt)
3490 .addImm(SubIdx);
3491 }
3492}
3493
3494bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3495 switch (MI.getOpcode()) {
3496 case AMDGPU::V_MOV_B16_t16_e32:
3497 case AMDGPU::V_MOV_B16_t16_e64:
3498 case AMDGPU::V_MOV_B32_e32:
3499 case AMDGPU::V_MOV_B32_e64:
3500 case AMDGPU::V_MOV_B64_PSEUDO:
3501 case AMDGPU::V_MOV_B64_e32:
3502 case AMDGPU::V_MOV_B64_e64:
3503 case AMDGPU::S_MOV_B32:
3504 case AMDGPU::S_MOV_B64:
3505 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3506 case AMDGPU::COPY:
3507 case AMDGPU::WWM_COPY:
3508 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3509 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3510 case AMDGPU::V_ACCVGPR_MOV_B32:
3511 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3512 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3513 return true;
3514 default:
3515 return false;
3516 }
3517}
3518
3520 switch (MI.getOpcode()) {
3521 case AMDGPU::V_MOV_B16_t16_e32:
3522 case AMDGPU::V_MOV_B16_t16_e64:
3523 return 2;
3524 case AMDGPU::V_MOV_B32_e32:
3525 case AMDGPU::V_MOV_B32_e64:
3526 case AMDGPU::V_MOV_B64_PSEUDO:
3527 case AMDGPU::V_MOV_B64_e32:
3528 case AMDGPU::V_MOV_B64_e64:
3529 case AMDGPU::S_MOV_B32:
3530 case AMDGPU::S_MOV_B64:
3531 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3532 case AMDGPU::COPY:
3533 case AMDGPU::WWM_COPY:
3534 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3535 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3536 case AMDGPU::V_ACCVGPR_MOV_B32:
3537 case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3538 case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3539 return 1;
3540 default:
3541 llvm_unreachable("MI is not a foldable copy");
3542 }
3543}
3544
3545static constexpr AMDGPU::OpName ModifierOpNames[] = {
3546 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3547 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3548 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3549
3550void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3551 unsigned Opc = MI.getOpcode();
3552 for (AMDGPU::OpName Name : reverse(ModifierOpNames)) {
3553 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3554 if (Idx >= 0)
3555 MI.removeOperand(Idx);
3556 }
3557}
3558
3560 const MCInstrDesc &NewDesc) const {
3561 MI.setDesc(NewDesc);
3562
3563 // Remove any leftover implicit operands from mutating the instruction. e.g.
3564 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
3565 // anymore.
3566 const MCInstrDesc &Desc = MI.getDesc();
3567 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
3568 Desc.implicit_defs().size();
3569
3570 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
3571 MI.removeOperand(I);
3572}
3573
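// Illustrative example: for Imm = 0x7FFF000080000000 the extracted values are
//   sub0 -> SignExtend64<32>(0x80000000) = 0xFFFFFFFF80000000
//   sub1 -> SignExtend64<32>(0x7FFF0000) = 0x000000007FFF0000
//   lo16 -> 0x0000, hi16 -> SignExtend64<16>(0x8000) = 0xFFFFFFFFFFFF8000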
3574std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
3575 unsigned SubRegIndex) {
3576 switch (SubRegIndex) {
3577 case AMDGPU::NoSubRegister:
3578 return Imm;
3579 case AMDGPU::sub0:
3580 return SignExtend64<32>(Imm);
3581 case AMDGPU::sub1:
3582 return SignExtend64<32>(Imm >> 32);
3583 case AMDGPU::lo16:
3584 return SignExtend64<16>(Imm);
3585 case AMDGPU::hi16:
3586 return SignExtend64<16>(Imm >> 16);
3587 case AMDGPU::sub1_lo16:
3588 return SignExtend64<16>(Imm >> 32);
3589 case AMDGPU::sub1_hi16:
3590 return SignExtend64<16>(Imm >> 48);
3591 default:
3592 return std::nullopt;
3593 }
3594
3595 llvm_unreachable("covered subregister switch");
3596}
3597
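// Note: the *AK forms below take two register sources and add a literal
// constant K (d = s0 * s1 + K), while the *MK forms multiply by the literal
// (d = s0 * K + s1). These helpers map a MAC/MAD/FMA opcode to the
// corresponding fmaak/madak or fmamk/madmk opcode.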
3598static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) {
3599 switch (Opc) {
3600 case AMDGPU::V_MAC_F16_e32:
3601 case AMDGPU::V_MAC_F16_e64:
3602 case AMDGPU::V_MAD_F16_e64:
3603 return AMDGPU::V_MADAK_F16;
3604 case AMDGPU::V_MAC_F32_e32:
3605 case AMDGPU::V_MAC_F32_e64:
3606 case AMDGPU::V_MAD_F32_e64:
3607 return AMDGPU::V_MADAK_F32;
3608 case AMDGPU::V_FMAC_F32_e32:
3609 case AMDGPU::V_FMAC_F32_e64:
3610 case AMDGPU::V_FMA_F32_e64:
3611 return AMDGPU::V_FMAAK_F32;
3612 case AMDGPU::V_FMAC_F16_e32:
3613 case AMDGPU::V_FMAC_F16_e64:
3614 case AMDGPU::V_FMAC_F16_t16_e64:
3615 case AMDGPU::V_FMAC_F16_fake16_e64:
3616 case AMDGPU::V_FMAC_F16_t16_e32:
3617 case AMDGPU::V_FMAC_F16_fake16_e32:
3618 case AMDGPU::V_FMA_F16_e64:
3619 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3620 ? AMDGPU::V_FMAAK_F16_t16
3621 : AMDGPU::V_FMAAK_F16_fake16
3622 : AMDGPU::V_FMAAK_F16;
3623 case AMDGPU::V_FMAC_F64_e32:
3624 case AMDGPU::V_FMAC_F64_e64:
3625 case AMDGPU::V_FMA_F64_e64:
3626 return AMDGPU::V_FMAAK_F64;
3627 default:
3628 llvm_unreachable("invalid instruction");
3629 }
3630}
3631
3632static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) {
3633 switch (Opc) {
3634 case AMDGPU::V_MAC_F16_e32:
3635 case AMDGPU::V_MAC_F16_e64:
3636 case AMDGPU::V_MAD_F16_e64:
3637 return AMDGPU::V_MADMK_F16;
3638 case AMDGPU::V_MAC_F32_e32:
3639 case AMDGPU::V_MAC_F32_e64:
3640 case AMDGPU::V_MAD_F32_e64:
3641 return AMDGPU::V_MADMK_F32;
3642 case AMDGPU::V_FMAC_F32_e32:
3643 case AMDGPU::V_FMAC_F32_e64:
3644 case AMDGPU::V_FMA_F32_e64:
3645 return AMDGPU::V_FMAMK_F32;
3646 case AMDGPU::V_FMAC_F16_e32:
3647 case AMDGPU::V_FMAC_F16_e64:
3648 case AMDGPU::V_FMAC_F16_t16_e64:
3649 case AMDGPU::V_FMAC_F16_fake16_e64:
3650 case AMDGPU::V_FMAC_F16_t16_e32:
3651 case AMDGPU::V_FMAC_F16_fake16_e32:
3652 case AMDGPU::V_FMA_F16_e64:
3653 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
3654 ? AMDGPU::V_FMAMK_F16_t16
3655 : AMDGPU::V_FMAMK_F16_fake16
3656 : AMDGPU::V_FMAMK_F16;
3657 case AMDGPU::V_FMAC_F64_e32:
3658 case AMDGPU::V_FMAC_F64_e64:
3659 case AMDGPU::V_FMA_F64_e64:
3660 return AMDGPU::V_FMAMK_F64;
3661 default:
3662 llvm_unreachable("invalid instruction");
3663 }
3664}
3665
3666bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3667 Register Reg, MachineRegisterInfo *MRI) const {
3668 int64_t Imm;
3669 if (!getConstValDefinedInReg(DefMI, Reg, Imm))
3670 return false;
3671
3672 const bool HasMultipleUses = !MRI->hasOneNonDBGUse(Reg);
3673
3674 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3675
3676 unsigned Opc = UseMI.getOpcode();
3677 if (Opc == AMDGPU::COPY) {
3678 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3679
3680 Register DstReg = UseMI.getOperand(0).getReg();
3681 Register UseSubReg = UseMI.getOperand(1).getSubReg();
3682
3683 const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
3684
3685 if (HasMultipleUses) {
3686 // TODO: This should fold in more cases with multiple use, but we need to
3687 // more carefully consider what those uses are.
3688 unsigned ImmDefSize = RI.getRegSizeInBits(*MRI->getRegClass(Reg));
3689
3690 // Avoid breaking up a 64-bit inline immediate into a subregister extract.
3691 if (UseSubReg != AMDGPU::NoSubRegister && ImmDefSize == 64)
3692 return false;
3693
3694 // Most of the time folding a 32-bit inline constant is free (though this
3695 // might not be true if we can't later fold it into a real user).
3696 //
3697 // FIXME: This isInlineConstant check is imprecise if
3698 // getConstValDefinedInReg handled the tricky non-mov cases.
3699 if (ImmDefSize == 32 &&
3700 isInlineConstant(Imm, AMDGPU::OPERAND_REG_IMM_INT32))
3701 return false;
3702 }
3703
3704 bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
3705 RI.getSubRegIdxSize(UseSubReg) == 16;
3706
3707 if (Is16Bit) {
3708 if (RI.hasVGPRs(DstRC))
3709 return false; // Do not clobber vgpr_hi16
3710
3711 if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
3712 return false;
3713 }
3714
3715 MachineFunction *MF = UseMI.getMF();
3716
3717 unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
3718 MCRegister MovDstPhysReg =
3719 DstReg.isPhysical() ? DstReg.asMCReg() : MCRegister();
3720
3721 std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
3722
3723 // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
3724 for (unsigned MovOp :
3725 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
3726 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
3727 const MCInstrDesc &MovDesc = get(MovOp);
3728
3729 const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0);
3730 if (Is16Bit) {
3731 // We just need to find a correctly sized register class, so the
3732 // subregister index compatibility doesn't matter since we're statically
3733 // extracting the immediate value.
3734 MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
3735 if (!MovDstRC)
3736 continue;
3737
3738 if (MovDstPhysReg) {
3739 // FIXME: We probably should not do this. If there is a live value in
3740 // the high half of the register, it will be corrupted.
3741 MovDstPhysReg =
3742 RI.getMatchingSuperReg(MovDstPhysReg, AMDGPU::lo16, MovDstRC);
3743 if (!MovDstPhysReg)
3744 continue;
3745 }
3746 }
3747
3748 // Result class isn't the right size, try the next instruction.
3749 if (MovDstPhysReg) {
3750 if (!MovDstRC->contains(MovDstPhysReg))
3751 return false;
3752 } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
3753 // TODO: This will be overly conservative in the case of 16-bit virtual
3754 // SGPRs. We could hack up the virtual register uses to use a compatible
3755 // 32-bit class.
3756 continue;
3757 }
3758
3759 const MCOperandInfo &OpInfo = MovDesc.operands()[1];
3760
3761 // Ensure the interpreted immediate value is a valid operand in the new
3762 // mov.
3763 //
3764 // FIXME: isImmOperandLegal should have form that doesn't require existing
3765 // MachineInstr or MachineOperand
3766 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
3767 !isInlineConstant(*SubRegImm, OpInfo.OperandType))
3768 break;
3769
3770 NewOpc = MovOp;
3771 break;
3772 }
3773
3774 if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
3775 return false;
3776
3777 if (Is16Bit) {
3778 UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
3779 if (MovDstPhysReg)
3780 UseMI.getOperand(0).setReg(MovDstPhysReg);
3781 assert(UseMI.getOperand(1).getReg().isVirtual());
3782 }
3783
3784 const MCInstrDesc &NewMCID = get(NewOpc);
3785 UseMI.setDesc(NewMCID);
3786 UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
3787 UseMI.addImplicitDefUseOperands(*MF);
3788 return true;
3789 }
3790
3791 if (HasMultipleUses)
3792 return false;
3793
3794 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3795 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3796 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3797 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3798 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3799 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 ||
3800 Opc == AMDGPU::V_FMAC_F64_e64) {
3801 // Don't fold if we are using source or output modifiers. The new VOP2
3802 // instructions don't have them.
3803 if (hasAnyModifiersSet(UseMI))
3804 return false;
3805
3806 // If this is a free constant, there's no reason to do this.
3807 // TODO: We could fold this here instead of letting SIFoldOperands do it
3808 // later.
3809 int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0);
3810
3811 // Any src operand can be used for the legality check.
3812 if (isInlineConstant(UseMI, Src0Idx, Imm))
3813 return false;
3814
3815 MachineOperand *Src0 = &UseMI.getOperand(Src0Idx);
3816
3817 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3818 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3819
3820 auto CopyRegOperandToNarrowerRC =
3821 [MRI, this](MachineInstr &MI, unsigned OpNo,
3822 const TargetRegisterClass *NewRC) -> void {
3823 if (!MI.getOperand(OpNo).isReg())
3824 return;
3825 Register Reg = MI.getOperand(OpNo).getReg();
3826 const TargetRegisterClass *RC = RI.getRegClassForReg(*MRI, Reg);
3827 if (RI.getCommonSubClass(RC, NewRC) != NewRC)
3828 return;
3829 Register Tmp = MRI->createVirtualRegister(NewRC);
3830 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
3831 get(AMDGPU::COPY), Tmp)
3832 .addReg(Reg);
3833 MI.getOperand(OpNo).setReg(Tmp);
3834 MI.getOperand(OpNo).setIsKill();
3835 };
3836
3837 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3838 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3839 (Src1->isReg() && Src1->getReg() == Reg)) {
3840 MachineOperand *RegSrc =
3841 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3842 if (!RegSrc->isReg())
3843 return false;
3844 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3845 ST.getConstantBusLimit(Opc) < 2)
3846 return false;
3847
3848 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3849 return false;
3850
3851 // If src2 is also a literal constant then we have to choose which one to
3852 // fold. In general it is better to choose madak so that the other literal
3853 // can be materialized in an sgpr instead of a vgpr:
3854 // s_mov_b32 s0, literal
3855 // v_madak_f32 v0, s0, v0, literal
3856 // Instead of:
3857 // v_mov_b32 v1, literal
3858 // v_madmk_f32 v0, v0, literal, v1
3859 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3860 if (Def && Def->isMoveImmediate() &&
3861 !isInlineConstant(Def->getOperand(1)))
3862 return false;
3863
3864 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
3865 if (pseudoToMCOpcode(NewOpc) == -1)
3866 return false;
3867
3868 const std::optional<int64_t> SubRegImm = extractSubregFromImm(
3869 Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
3870
3871 // FIXME: This would be a lot easier if we could return a new instruction
3872 // instead of having to modify in place.
3873
3874 Register SrcReg = RegSrc->getReg();
3875 unsigned SrcSubReg = RegSrc->getSubReg();
3876 Src0->setReg(SrcReg);
3877 Src0->setSubReg(SrcSubReg);
3878 Src0->setIsKill(RegSrc->isKill());
3879
3880 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3881 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3882 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3883 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3884 UseMI.untieRegOperand(
3885 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3886
3887 Src1->ChangeToImmediate(*SubRegImm);
3888
3889 removeModOperands(UseMI);
3890 UseMI.setDesc(get(NewOpc));
3891
3892 if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3893 NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
3894 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3895 Register Tmp = MRI->createVirtualRegister(NewRC);
3896 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3897 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3898 UseMI.getOperand(0).getReg())
3899 .addReg(Tmp, RegState::Kill);
3900 UseMI.getOperand(0).setReg(Tmp);
3901 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3902 CopyRegOperandToNarrowerRC(UseMI, 3, NewRC);
3903 }
3904
3905 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3906 if (DeleteDef)
3907 DefMI.eraseFromParent();
3908
3909 return true;
3910 }
3911
3912 // Added part is the constant: Use v_madak_{f16, f32}.
3913 if (Src2->isReg() && Src2->getReg() == Reg) {
3914 if (ST.getConstantBusLimit(Opc) < 2) {
3915 // Not allowed to use constant bus for another operand.
3916 // We can however allow an inline immediate as src0.
3917 bool Src0Inlined = false;
3918 if (Src0->isReg()) {
3919 // Try to inline constant if possible.
3920 // If the def is a move of an immediate and this is its only use,
3921 // folding the constant here saves a VGPR.
3922 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3923 if (Def && Def->isMoveImmediate() &&
3924 isInlineConstant(Def->getOperand(1)) &&
3925 MRI->hasOneNonDBGUse(Src0->getReg())) {
3926 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3927 Src0Inlined = true;
3928 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3929 RI.isSGPRReg(*MRI, Src0->getReg())) {
3930 return false;
3931 }
3932 // VGPR is okay as Src0 - fallthrough
3933 }
3934
3935 if (Src1->isReg() && !Src0Inlined) {
3936 // We still have one constant bus slot available for an inline constant - try to fill it
3937 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3938 if (Def && Def->isMoveImmediate() &&
3939 isInlineConstant(Def->getOperand(1)) &&
3940 MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI))
3941 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3942 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3943 return false;
3944 // VGPR is okay as Src1 - fallthrough
3945 }
3946 }
3947
3948 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
3949 if (pseudoToMCOpcode(NewOpc) == -1)
3950 return false;
3951
3952 // FIXME: This would be a lot easier if we could return a new instruction
3953 // instead of having to modify in place.
3954
3955 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3956 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3957 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3958 Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64)
3959 UseMI.untieRegOperand(
3960 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3961
3962 const std::optional<int64_t> SubRegImm =
3963 extractSubregFromImm(Imm, Src2->getSubReg());
3964
3965 // ChangeToImmediate adds Src2 back to the instruction.
3966 Src2->ChangeToImmediate(*SubRegImm);
3967
3968 // These come before src2.
3969 removeModOperands(UseMI);
3970 UseMI.setDesc(get(NewOpc));
3971
3972 if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3973 NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
3974 const TargetRegisterClass *NewRC = getRegClass(get(NewOpc), 0);
3975 Register Tmp = MRI->createVirtualRegister(NewRC);
3976 BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
3977 UseMI.getDebugLoc(), get(AMDGPU::COPY),
3978 UseMI.getOperand(0).getReg())
3979 .addReg(Tmp, RegState::Kill);
3980 UseMI.getOperand(0).setReg(Tmp);
3981 CopyRegOperandToNarrowerRC(UseMI, 1, NewRC);
3982 CopyRegOperandToNarrowerRC(UseMI, 2, NewRC);
3983 }
3984
3985 // UseMI may have been commuted so that an SGPR is now src1. In that case
3986 // an inline constant together with an SGPR would be illegal on the
3987 // constant bus, so re-legalize the operands.
3988 legalizeOperands(UseMI);
3989
3990 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3991 if (DeleteDef)
3992 DefMI.eraseFromParent();
3993
3994 return true;
3995 }
3996 }
3997
3998 return false;
3999}
4000
4001static bool
4002memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
4003 ArrayRef<const MachineOperand *> BaseOps2) {
4004 if (BaseOps1.size() != BaseOps2.size())
4005 return false;
4006 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
4007 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
4008 return false;
4009 }
4010 return true;
4011}
4012
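// Illustrative example: a 4-byte access at offset 0 and an 8-byte access at
// offset 4 do not overlap because 0 + 4 <= 4, whereas two 4-byte accesses at
// offsets 0 and 2 do overlap because 0 + 4 > 2.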
4013static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
4014 LocationSize WidthB, int OffsetB) {
4015 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
4016 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
4017 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
4018 return LowWidth.hasValue() &&
4019 LowOffset + (int)LowWidth.getValue() <= HighOffset;
4020}
4021
4022bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
4023 const MachineInstr &MIb) const {
4024 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
4025 int64_t Offset0, Offset1;
4026 LocationSize Dummy0 = LocationSize::precise(0);
4027 LocationSize Dummy1 = LocationSize::precise(0);
4028 bool Offset0IsScalable, Offset1IsScalable;
4029 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
4030 Dummy0, &RI) ||
4031 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
4032 Dummy1, &RI))
4033 return false;
4034
4035 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
4036 return false;
4037
4038 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
4039 // FIXME: Handle ds_read2 / ds_write2.
4040 return false;
4041 }
4042 LocationSize Width0 = MIa.memoperands().front()->getSize();
4043 LocationSize Width1 = MIb.memoperands().front()->getSize();
4044 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
4045}
4046
4047bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
4048 const MachineInstr &MIb) const {
4049 assert(MIa.mayLoadOrStore() &&
4050 "MIa must load from or modify a memory location");
4051 assert(MIb.mayLoadOrStore() &&
4052 "MIb must load from or modify a memory location");
4053
4054 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
4055 return false;
4056
4057 // XXX - Can we relax this between address spaces?
4058 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
4059 return false;
4060
4061 if (isLDSDMA(MIa) || isLDSDMA(MIb))
4062 return false;
4063
4064 if (MIa.isBundle() || MIb.isBundle())
4065 return false;
4066
4067 // TODO: Should we check the address space from the MachineMemOperand? That
4068 // would allow us to distinguish objects we know don't alias based on the
4069 // underlying address space, even if it was lowered to a different one,
4070 // e.g. private accesses lowered to use MUBUF instructions on a scratch
4071 // buffer.
4072 if (isDS(MIa)) {
4073 if (isDS(MIb))
4074 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4075
4076 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
4077 }
4078
4079 if (isMUBUF(MIa) || isMTBUF(MIa)) {
4080 if (isMUBUF(MIb) || isMTBUF(MIb))
4081 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4082
4083 if (isFLAT(MIb))
4084 return isFLATScratch(MIb);
4085
4086 return !isSMRD(MIb);
4087 }
4088
4089 if (isSMRD(MIa)) {
4090 if (isSMRD(MIb))
4091 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4092
4093 if (isFLAT(MIb))
4094 return isFLATScratch(MIb);
4095
4096 return !isMUBUF(MIb) && !isMTBUF(MIb);
4097 }
4098
4099 if (isFLAT(MIa)) {
4100 if (isFLAT(MIb)) {
4101 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
4102 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
4103 return true;
4104
4105 return checkInstOffsetsDoNotOverlap(MIa, MIb);
4106 }
4107
4108 return false;
4109 }
4110
4111 return false;
4112}
4113
4114static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
4115 int64_t &Imm, MachineInstr **DefMI = nullptr) {
4116 if (Reg.isPhysical())
4117 return false;
4118 auto *Def = MRI.getUniqueVRegDef(Reg);
4119 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
4120 Imm = Def->getOperand(1).getImm();
4121 if (DefMI)
4122 *DefMI = Def;
4123 return true;
4124 }
4125 return false;
4126}
4127
4128static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
4129 MachineInstr **DefMI = nullptr) {
4130 if (!MO->isReg())
4131 return false;
4132 const MachineFunction *MF = MO->getParent()->getMF();
4133 const MachineRegisterInfo &MRI = MF->getRegInfo();
4134 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
4135}
4136
4137static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
4138 MachineInstr &NewMI) {
4139 if (LV) {
4140 unsigned NumOps = MI.getNumOperands();
4141 for (unsigned I = 1; I < NumOps; ++I) {
4142 MachineOperand &Op = MI.getOperand(I);
4143 if (Op.isReg() && Op.isKill())
4144 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
4145 }
4146 }
4147}
4148
4149static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
4150 switch (Opc) {
4151 case AMDGPU::V_MAC_F16_e32:
4152 case AMDGPU::V_MAC_F16_e64:
4153 return AMDGPU::V_MAD_F16_e64;
4154 case AMDGPU::V_MAC_F32_e32:
4155 case AMDGPU::V_MAC_F32_e64:
4156 return AMDGPU::V_MAD_F32_e64;
4157 case AMDGPU::V_MAC_LEGACY_F32_e32:
4158 case AMDGPU::V_MAC_LEGACY_F32_e64:
4159 return AMDGPU::V_MAD_LEGACY_F32_e64;
4160 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4161 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4162 return AMDGPU::V_FMA_LEGACY_F32_e64;
4163 case AMDGPU::V_FMAC_F16_e32:
4164 case AMDGPU::V_FMAC_F16_e64:
4165 case AMDGPU::V_FMAC_F16_t16_e64:
4166 case AMDGPU::V_FMAC_F16_fake16_e64:
4167 return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
4168 ? AMDGPU::V_FMA_F16_gfx9_t16_e64
4169 : AMDGPU::V_FMA_F16_gfx9_fake16_e64
4170 : AMDGPU::V_FMA_F16_gfx9_e64;
4171 case AMDGPU::V_FMAC_F32_e32:
4172 case AMDGPU::V_FMAC_F32_e64:
4173 return AMDGPU::V_FMA_F32_e64;
4174 case AMDGPU::V_FMAC_F64_e32:
4175 case AMDGPU::V_FMAC_F64_e64:
4176 return AMDGPU::V_FMA_F64_e64;
4177 default:
4178 llvm_unreachable("invalid instruction");
4179 }
4180}
4181
4182/// Helper struct for the implementation of 3-address conversion to communicate
4183/// updates made to instruction operands.
4185 /// Other instruction whose def is no longer used by the converted
4186 /// instruction.
4187 MachineInstr *RemoveMIUse = nullptr;
4188};
4189
4191 LiveVariables *LV,
4192 LiveIntervals *LIS) const {
4193 MachineBasicBlock &MBB = *MI.getParent();
4194 MachineInstr *CandidateMI = &MI;
4195
4196 if (MI.isBundle()) {
4197 // This is a temporary placeholder for bundle handling that enables us to
4198 // exercise the relevant code paths in the two-address instruction pass.
4199 if (MI.getBundleSize() != 1)
4200 return nullptr;
4201 CandidateMI = MI.getNextNode();
4202 }
4203
4205 MachineInstr *NewMI = convertToThreeAddressImpl(*CandidateMI, U);
4206 if (!NewMI)
4207 return nullptr;
4208
4209 if (MI.isBundle()) {
4210 CandidateMI->eraseFromBundle();
4211
4212 for (MachineOperand &MO : MI.all_defs()) {
4213 if (MO.isTied())
4214 MI.untieRegOperand(MO.getOperandNo());
4215 }
4216 } else {
4217 updateLiveVariables(LV, MI, *NewMI);
4218 if (LIS) {
4219 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
4220 // SlotIndex of defs needs to be updated when converting to early-clobber
4221 MachineOperand &Def = NewMI->getOperand(0);
4222 if (Def.isEarlyClobber() && Def.isReg() &&
4223 LIS->hasInterval(Def.getReg())) {
4224 SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
4225 SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
4226 auto &LI = LIS->getInterval(Def.getReg());
4227 auto UpdateDefIndex = [&](LiveRange &LR) {
4228 auto *S = LR.find(OldIndex);
4229 if (S != LR.end() && S->start == OldIndex) {
4230 assert(S->valno && S->valno->def == OldIndex);
4231 S->start = NewIndex;
4232 S->valno->def = NewIndex;
4233 }
4234 };
4235 UpdateDefIndex(LI);
4236 for (auto &SR : LI.subranges())
4237 UpdateDefIndex(SR);
4238 }
4239 }
4240 }
4241
4242 if (U.RemoveMIUse) {
4243 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4244 // The only user is the instruction which will be killed.
4245 Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
4246
4247 if (MRI.hasOneNonDBGUse(DefReg)) {
4248 // We cannot just remove DefMI here; the calling pass would crash.
4249 U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
4250 U.RemoveMIUse->getOperand(0).setIsDead(true);
4251 for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
4252 U.RemoveMIUse->removeOperand(I);
4253 if (LV)
4254 LV->getVarInfo(DefReg).AliveBlocks.clear();
4255 }
4256
4257 if (MI.isBundle()) {
4258 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4259 if (!VRI.Reads && !VRI.Writes) {
4260 for (MachineOperand &MO : MI.all_uses()) {
4261 if (MO.isReg() && MO.getReg() == DefReg) {
4262 assert(MO.getSubReg() == 0 &&
4263 "tied sub-registers in bundles currently not supported");
4264 MI.removeOperand(MO.getOperandNo());
4265 break;
4266 }
4267 }
4268
4269 if (LIS)
4270 LIS->shrinkToUses(&LIS->getInterval(DefReg));
4271 }
4272 } else if (LIS) {
4273 LiveInterval &DefLI = LIS->getInterval(DefReg);
4274
4275 // We cannot delete the original instruction here, so hack out the use
4276 // in the original instruction with a dummy register so we can use
4277 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4278 // not have the complexity of deleting a use to consider here.
4279 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4280 for (MachineOperand &MIOp : MI.uses()) {
4281 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4282 MIOp.setIsUndef(true);
4283 MIOp.setReg(DummyReg);
4284 }
4285 }
4286
4287 if (MI.isBundle()) {
4288 VirtRegInfo VRI = AnalyzeVirtRegInBundle(MI, DefReg);
4289 if (!VRI.Reads && !VRI.Writes) {
4290 for (MachineOperand &MIOp : MI.uses()) {
4291 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4292 MIOp.setIsUndef(true);
4293 MIOp.setReg(DummyReg);
4294 }
4295 }
4296 }
4297
4298 MI.addOperand(MachineOperand::CreateReg(DummyReg, false, false, false,
4299 false, /*isUndef=*/true));
4300 }
4301
4302 LIS->shrinkToUses(&DefLI);
4303 }
4304 }
4305
4306 return MI.isBundle() ? &MI : NewMI;
4307}
4308
4309MachineInstr *
4310SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
4311 ThreeAddressUpdates &U) const {
4312 MachineBasicBlock &MBB = *MI.getParent();
4313 unsigned Opc = MI.getOpcode();
4314
4315 // Handle MFMA.
4316 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
4317 if (NewMFMAOpc != -1) {
4318 MachineInstrBuilder MIB =
4319 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
4320 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4321 MIB.add(MI.getOperand(I));
4322 return MIB;
4323 }
4324
4325 if (SIInstrInfo::isWMMA(MI)) {
4326 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
4327 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4328 .setMIFlags(MI.getFlags());
4329 for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I)
4330 MIB->addOperand(MI.getOperand(I));
4331 return MIB;
4332 }
4333
4334 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4335 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4336 "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4337 "present pre-RA");
4338
4339 // Handle MAC/FMAC.
4340 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
4341 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
4342 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
4343 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
4344 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
4345 bool Src0Literal = false;
4346
4347 switch (Opc) {
4348 default:
4349 return nullptr;
4350 case AMDGPU::V_MAC_F16_e64:
4351 case AMDGPU::V_FMAC_F16_e64:
4352 case AMDGPU::V_FMAC_F16_t16_e64:
4353 case AMDGPU::V_FMAC_F16_fake16_e64:
4354 case AMDGPU::V_MAC_F32_e64:
4355 case AMDGPU::V_MAC_LEGACY_F32_e64:
4356 case AMDGPU::V_FMAC_F32_e64:
4357 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4358 case AMDGPU::V_FMAC_F64_e64:
4359 break;
4360 case AMDGPU::V_MAC_F16_e32:
4361 case AMDGPU::V_FMAC_F16_e32:
4362 case AMDGPU::V_MAC_F32_e32:
4363 case AMDGPU::V_MAC_LEGACY_F32_e32:
4364 case AMDGPU::V_FMAC_F32_e32:
4365 case AMDGPU::V_FMAC_LEGACY_F32_e32:
4366 case AMDGPU::V_FMAC_F64_e32: {
4367 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4368 AMDGPU::OpName::src0);
4369 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
4370 if (!Src0->isReg() && !Src0->isImm())
4371 return nullptr;
4372
4373 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
4374 Src0Literal = true;
4375
4376 break;
4377 }
4378 }
4379
4380 MachineInstrBuilder MIB;
4381 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4382 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4383 const MachineOperand *Src0Mods =
4384 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4385 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4386 const MachineOperand *Src1Mods =
4387 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4388 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4389 const MachineOperand *Src2Mods =
4390 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4391 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4392 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4393 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4394
4395 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy &&
4396 (!IsF64 || ST.hasFmaakFmamkF64Insts()) &&
4397 // If we have an SGPR input, we will violate the constant bus restriction.
4398 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4399 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4400 MachineInstr *DefMI;
4401
4402 int64_t Imm;
4403 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4404 unsigned NewOpc = getNewFMAAKInst(ST, Opc);
4405 if (pseudoToMCOpcode(NewOpc) != -1) {
4406 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4407 .add(*Dst)
4408 .add(*Src0)
4409 .add(*Src1)
4410 .addImm(Imm)
4411 .setMIFlags(MI.getFlags());
4412 U.RemoveMIUse = DefMI;
4413 return MIB;
4414 }
4415 }
4416 unsigned NewOpc = getNewFMAMKInst(ST, Opc);
4417 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4418 if (pseudoToMCOpcode(NewOpc) != -1) {
4419 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4420 .add(*Dst)
4421 .add(*Src0)
4422 .addImm(Imm)
4423 .add(*Src2)
4424 .setMIFlags(MI.getFlags());
4425 U.RemoveMIUse = DefMI;
4426 return MIB;
4427 }
4428 }
4429 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4430 if (Src0Literal) {
4431 Imm = Src0->getImm();
4432 DefMI = nullptr;
4433 }
4434 if (pseudoToMCOpcode(NewOpc) != -1 &&
4435 isOperandLegal(
4436 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4437 Src1)) {
4438 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4439 .add(*Dst)
4440 .add(*Src1)
4441 .addImm(Imm)
4442 .add(*Src2)
4443 .setMIFlags(MI.getFlags());
4444 U.RemoveMIUse = DefMI;
4445 return MIB;
4446 }
4447 }
4448 }
4449
4450 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4451 // if VOP3 does not allow a literal operand.
4452 if (Src0Literal && !ST.hasVOP3Literal())
4453 return nullptr;
4454
4455 unsigned NewOpc = getNewFMAInst(ST, Opc);
4456
4457 if (pseudoToMCOpcode(NewOpc) == -1)
4458 return nullptr;
4459
4460 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4461 .add(*Dst)
4462 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4463 .add(*Src0)
4464 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4465 .add(*Src1)
4466 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4467 .add(*Src2)
4468 .addImm(Clamp ? Clamp->getImm() : 0)
4469 .addImm(Omod ? Omod->getImm() : 0)
4470 .setMIFlags(MI.getFlags());
4471 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4472 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4473 return MIB;
4474}
4475
4476// It's not generally safe to move VALU instructions across these since it will
4477// start using the register as a base index rather than directly.
4478// XXX - Why isn't hasSideEffects sufficient for these?
4479static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4480 switch (MI.getOpcode()) {
4481 case AMDGPU::S_SET_GPR_IDX_ON:
4482 case AMDGPU::S_SET_GPR_IDX_MODE:
4483 case AMDGPU::S_SET_GPR_IDX_OFF:
4484 return true;
4485 default:
4486 return false;
4487 }
4488}
4489
4490bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4491 const MachineBasicBlock *MBB,
4492 const MachineFunction &MF) const {
4493 // Skipping the check for SP writes in the base implementation. The reason it
4494 // was added was apparently due to compile time concerns.
4495 //
4496 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4497 // but is probably avoidable.
4498
4499 // Copied from base implementation.
4500 // Terminators and labels can't be scheduled around.
4501 if (MI.isTerminator() || MI.isPosition())
4502 return true;
4503
4504 // INLINEASM_BR can jump to another block
4505 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4506 return true;
4507
4508 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4509 return true;
4510
4511 // Target-independent instructions do not have an implicit-use of EXEC, even
4512 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4513 // boundaries prevents incorrect movements of such instructions.
4514 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4515 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4516 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4517 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4518 MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG ||
4519 changesVGPRIndexingMode(MI);
4520}
4521bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4523 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
4524 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
4525 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
4526}
4527
4529 // Instructions that access scratch use FLAT encoding or BUF encodings.
4530 if ((!isFLAT(MI) || isFLATGlobal(MI)) && !isBUF(MI))
4531 return false;
4532
4533 // SCRATCH instructions always access scratch.
4534 if (isFLATScratch(MI))
4535 return true;
4536
4537 // If FLAT_SCRATCH registers are not initialized, we can never access scratch
4538 // via the aperture.
4539 if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4540 return false;
4541
4542 // If there are no memory operands then conservatively assume the flat
4543 // operation may access scratch.
4544 if (MI.memoperands_empty())
4545 return true;
4546
4547 // See if any memory operand specifies an address space that involves scratch.
4548 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4549 unsigned AS = Memop->getAddrSpace();
4550 if (AS == AMDGPUAS::FLAT_ADDRESS) {
4551 const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace;
4552 return !MD || !AMDGPU::hasValueInRangeLikeMetadata(
4553 *MD, AMDGPUAS::PRIVATE_ADDRESS);
4554 }
4555 return AS == AMDGPUAS::PRIVATE_ADDRESS;
4556 });
4557}
4558
4559bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
4560 assert(isFLAT(MI));
4561
4562 // All flat instructions use the VMEM counter except prefetch.
4563 if (!usesVM_CNT(MI))
4564 return false;
4565
4566 // If there are no memory operands then conservatively assume the flat
4567 // operation may access VMEM.
4568 if (MI.memoperands_empty())
4569 return true;
4570
4571 // See if any memory operand specifies an address space that involves VMEM.
4572 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
4573 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
4574 // (GDS) address space is not supported by flat operations. Therefore, simply
4575 // return true unless only the LDS address space is found.
4576 for (const MachineMemOperand *Memop : MI.memoperands()) {
4577 unsigned AS = Memop->getAddrSpace();
4579 if (AS != AMDGPUAS::LOCAL_ADDRESS)
4580 return true;
4581 }
4582
4583 return false;
4584}
4585
4586bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
4587 assert(isFLAT(MI));
4588
4589 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
4590 if (!usesLGKM_CNT(MI))
4591 return false;
4592
4593 // If in tgsplit mode then there can be no use of LDS.
4594 if (ST.isTgSplitEnabled())
4595 return false;
4596
4597 // If there are no memory operands then conservatively assume the flat
4598 // operation may access LDS.
4599 if (MI.memoperands_empty())
4600 return true;
4601
4602 // See if any memory operand specifies an address space that involves LDS.
4603 for (const MachineMemOperand *Memop : MI.memoperands()) {
4604 unsigned AS = Memop->getAddrSpace();
4605 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
4606 return true;
4607 }
4608
4609 return false;
4610}
4611
4612bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4613 // Skip the full operand and register alias search modifiesRegister
4614 // does. There's only a handful of instructions that touch this, it's only an
4615 // implicit def, and doesn't alias any other registers.
4616 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4617}
4618
4619bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4620 unsigned Opcode = MI.getOpcode();
4621
4622 if (MI.mayStore() && isSMRD(MI))
4623 return true; // scalar store or atomic
4624
4625 // This will terminate the function when other lanes may need to continue.
4626 if (MI.isReturn())
4627 return true;
4628
4629 // These instructions cause shader I/O that may cause hardware lockups
4630 // when executed with an empty EXEC mask.
4631 //
4632 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4633 // EXEC = 0, but checking for that case here seems not worth it
4634 // given the typical code patterns.
4635 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4636 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4637 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
4638 Opcode == AMDGPU::S_SETHALT)
4639 return true;
4640
4641 if (MI.isCall() || MI.isInlineAsm())
4642 return true; // conservative assumption
4643
4644 // Assume that barrier interactions are only intended with active lanes.
4645 if (isBarrier(Opcode))
4646 return true;
4647
4648 // A mode change is a scalar operation that influences vector instructions.
4649 if (modifiesModeRegister(MI))
4650 return true;
4651
4652 // These are like SALU instructions in terms of effects, so it's questionable
4653 // whether we should return true for those.
4654 //
4655 // However, executing them with EXEC = 0 causes them to operate on undefined
4656 // data, which we avoid by returning true here.
4657 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4658 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4659 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4660 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4661 return true;
4662
4663 return false;
4664}
4665
4666bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4667 const MachineInstr &MI) const {
4668 if (MI.isMetaInstruction())
4669 return false;
4670
4671 // This won't read exec if this is an SGPR->SGPR copy.
4672 if (MI.isCopyLike()) {
4673 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4674 return true;
4675
4676 // Make sure this isn't copying exec as a normal operand
4677 return MI.readsRegister(AMDGPU::EXEC, &RI);
4678 }
4679
4680 // Make a conservative assumption about the callee.
4681 if (MI.isCall())
4682 return true;
4683
4684 // Be conservative with any unhandled generic opcodes.
4685 if (!isTargetSpecificOpcode(MI.getOpcode()))
4686 return true;
4687
4688 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4689}
4690
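// Background note: for 32-bit operands the AMDGPU inline constants are the
// integers -16..64 plus a small set of FP values (0.0, +-0.5, +-1.0, +-2.0,
// +-4.0 and, where supported, 1/(2*pi)); any other immediate must be encoded
// as a literal. The overloads below classify immediates against that set for
// the various operand widths.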
4691bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4692 switch (Imm.getBitWidth()) {
4693 case 1: // This likely will be a condition code mask.
4694 return true;
4695
4696 case 32:
4697 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4698 ST.hasInv2PiInlineImm());
4699 case 64:
4700 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4701 ST.hasInv2PiInlineImm());
4702 case 16:
4703 return ST.has16BitInsts() &&
4704 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4705 ST.hasInv2PiInlineImm());
4706 default:
4707 llvm_unreachable("invalid bitwidth");
4708 }
4709}
4710
4711bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4712 APInt IntImm = Imm.bitcastToAPInt();
4713 int64_t IntImmVal = IntImm.getSExtValue();
4714 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4715 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4716 default:
4717 llvm_unreachable("invalid fltSemantics");
4720 return isInlineConstant(IntImm);
4722 return ST.has16BitInsts() &&
4723 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4725 return ST.has16BitInsts() &&
4726 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4727 }
4728}
4729
4730bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const {
4731 // MachineOperand provides no way to tell the true operand size, since it only
4732 // records a 64-bit value. We need to know the size to determine if a 32-bit
4733 // floating point immediate bit pattern is legal for an integer immediate. It
4734 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4735 switch (OperandType) {
4745 int32_t Trunc = static_cast<int32_t>(Imm);
4746 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4747 }
4753 return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
4756 // We would expect inline immediates to not be concerned with an integer/fp
4757 // distinction. However, in the case of 16-bit integer operations, the
4758 // "floating point" values appear to not work. It seems to read the low 16 bits
4759 // of 32-bit immediates, which happens to always work for the integer
4760 // values.
4761 //
4762 // See llvm bugzilla 46302.
4763 //
4764 // TODO: Theoretically we could use op-sel to use the high bits of the
4765 // 32-bit FP values.
4774 return AMDGPU::isPKFMACF16InlineConstant(Imm, ST.isGFX11Plus());
4779 return false;
4782 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4783 // A few special case instructions have 16-bit operands on subtargets
4784 // where 16-bit instructions are not legal.
4785 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4786 // constants in these cases
4787 int16_t Trunc = static_cast<int16_t>(Imm);
4788 return ST.has16BitInsts() &&
4789 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4790 }
4791
4792 return false;
4793 }
4796 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4797 int16_t Trunc = static_cast<int16_t>(Imm);
4798 return ST.has16BitInsts() &&
4799 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4800 }
4801 return false;
4802 }
4806 return false;
4808 return isLegalAV64PseudoImm(Imm);
4811 // Always embedded in the instruction for free.
4812 return true;
4822 // Just ignore anything else.
4823 return true;
4824 default:
4825 llvm_unreachable("invalid operand type");
4826 }
4827}
4828
4829static bool compareMachineOp(const MachineOperand &Op0,
4830 const MachineOperand &Op1) {
4831 if (Op0.getType() != Op1.getType())
4832 return false;
4833
4834 switch (Op0.getType()) {
4836 return Op0.getReg() == Op1.getReg();
4838 return Op0.getImm() == Op1.getImm();
4839 default:
4840 llvm_unreachable("Didn't expect to be comparing these operand types");
4841 }
4842}
4843
4844bool SIInstrInfo::isLiteralOperandLegal(const MCInstrDesc &InstDesc,
4845 const MCOperandInfo &OpInfo) const {
4846 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4847 return true;
4848
4849 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4850 return false;
4851
4852 if (!isVOP3(InstDesc) || !AMDGPU::isSISrcOperand(OpInfo))
4853 return true;
4854
4855 return ST.hasVOP3Literal();
4856}
4857
4858bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4859 int64_t ImmVal) const {
4860 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4861 if (isInlineConstant(ImmVal, OpInfo.OperandType)) {
4862 if (isMAI(InstDesc) && ST.hasMFMAInlineLiteralBug() &&
4863 OpNo == (unsigned)AMDGPU::getNamedOperandIdx(InstDesc.getOpcode(),
4864 AMDGPU::OpName::src2))
4865 return false;
4866 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4867 }
4868
4869 return isLiteralOperandLegal(InstDesc, OpInfo);
4870}
4871
4872bool SIInstrInfo::isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
4873 const MachineOperand &MO) const {
4874 if (MO.isImm())
4875 return isImmOperandLegal(InstDesc, OpNo, MO.getImm());
4876
4877 assert((MO.isTargetIndex() || MO.isFI() || MO.isGlobal()) &&
4878 "unexpected imm-like operand kind");
4879 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4880 return isLiteralOperandLegal(InstDesc, OpInfo);
4881}
4882
4883bool SIInstrInfo::isLegalAV64PseudoImm(uint64_t Imm) const {
4884 // 2 32-bit inline constants packed into one.
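 // e.g. 0x3F80000040000000 packs 1.0f in the high half and 2.0f in the low
 // half; both halves are inline constants, so the packed value is accepted.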
4885 return AMDGPU::isInlinableLiteral32(Lo_32(Imm), ST.hasInv2PiInlineImm()) &&
4886 AMDGPU::isInlinableLiteral32(Hi_32(Imm), ST.hasInv2PiInlineImm());
4887}
4888
4889bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4890 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4891 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4892 return false;
4893
4894 int Op32 = AMDGPU::getVOPe32(Opcode);
4895 if (Op32 == -1)
4896 return false;
4897
4898 return pseudoToMCOpcode(Op32) != -1;
4899}
4900
4901bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4902 // The src0_modifier operand is present on all instructions
4903 // that have modifiers.
4904
4905 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4906}
4907
4908bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4909 AMDGPU::OpName OpName) const {
4910 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4911 return Mods && Mods->getImm();
4912}
4913
4914bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4915 return any_of(ModifierOpNames,
4916 [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); });
4917}
4918
4919bool SIInstrInfo::canShrink(const MachineInstr &MI,
4920 const MachineRegisterInfo &MRI) const {
4921 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4922 // Can't shrink instruction with three operands.
4923 if (Src2) {
4924 switch (MI.getOpcode()) {
4925 default: return false;
4926
4927 case AMDGPU::V_ADDC_U32_e64:
4928 case AMDGPU::V_SUBB_U32_e64:
4929 case AMDGPU::V_SUBBREV_U32_e64: {
4930 const MachineOperand *Src1
4931 = getNamedOperand(MI, AMDGPU::OpName::src1);
4932 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4933 return false;
4934 // Additional verification is needed for sdst/src2.
4935 return true;
4936 }
4937 case AMDGPU::V_MAC_F16_e64:
4938 case AMDGPU::V_MAC_F32_e64:
4939 case AMDGPU::V_MAC_LEGACY_F32_e64:
4940 case AMDGPU::V_FMAC_F16_e64:
4941 case AMDGPU::V_FMAC_F16_t16_e64:
4942 case AMDGPU::V_FMAC_F16_fake16_e64:
4943 case AMDGPU::V_FMAC_F32_e64:
4944 case AMDGPU::V_FMAC_F64_e64:
4945 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4946 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4947 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4948 return false;
4949 break;
4950
4951 case AMDGPU::V_CNDMASK_B32_e64:
4952 break;
4953 }
4954 }
4955
4956 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4957 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4958 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4959 return false;
4960
4961 // We don't need to check src0, all input types are legal, so just make sure
4962 // src0 isn't using any modifiers.
4963 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4964 return false;
4965
4966 // Can it be shrunk to a valid 32 bit opcode?
4967 if (!hasVALU32BitEncoding(MI.getOpcode()))
4968 return false;
4969
4970 // Check output modifiers
4971 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4972 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4973 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4974 // TODO: Can we avoid checking bound_ctrl/fi here?
4975 // They are only used by permlane*_swap special case.
4976 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4977 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4978}
4979
4980// Set VCC operand with all flags from \p Orig, except for setting it as
4981// implicit.
4982static void copyFlagsToImplicitVCC(MachineInstr &MI,
4983 const MachineOperand &Orig) {
4984
4985 for (MachineOperand &Use : MI.implicit_operands()) {
4986 if (Use.isUse() &&
4987 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4988 Use.setIsUndef(Orig.isUndef());
4989 Use.setIsKill(Orig.isKill());
4990 return;
4991 }
4992 }
4993}
4994
4995MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4996 unsigned Op32) const {
4997 MachineBasicBlock *MBB = MI.getParent();
4998
4999 const MCInstrDesc &Op32Desc = get(Op32);
5000 MachineInstrBuilder Inst32 =
5001 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
5002 .setMIFlags(MI.getFlags());
5003
5004 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
5005 // For VOPC instructions, this is replaced by an implicit def of vcc.
5006
5007 // We assume the defs of the shrunk opcode are in the same order, and the
5008 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
5009 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
5010 Inst32.add(MI.getOperand(I));
5011
5012 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
5013
5014 int Idx = MI.getNumExplicitDefs();
5015 for (const MachineOperand &Use : MI.explicit_uses()) {
5016 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
5017 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
5018 continue;
5019
5020 if (&Use == Src2) {
5021 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
5022 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
5023 // replaced with an implicit read of vcc or vcc_lo. The implicit read
5024 // of vcc was already added during the initial BuildMI, but we
5025 // 1) may need to change vcc to vcc_lo to preserve the original register
5026 // 2) have to preserve the original flags.
5027 copyFlagsToImplicitVCC(*Inst32, *Src2);
5028 continue;
5029 }
5030 }
5031
5032 Inst32.add(Use);
5033 }
5034
5035 // FIXME: Losing implicit operands
5036 fixImplicitOperands(*Inst32);
5037 return Inst32;
5038}
5039
5040bool SIInstrInfo::physRegUsesConstantBus(const MachineOperand &RegOp) const {
5041 // Null is free
5042 Register Reg = RegOp.getReg();
5043 if (Reg == AMDGPU::SGPR_NULL || Reg == AMDGPU::SGPR_NULL64)
5044 return false;
5045
5046 // SGPRs use the constant bus
5047
5048 // FIXME: implicit registers that are not part of the MCInstrDesc's implicit
5049 // physical register operands should also count, except for exec.
5050 if (RegOp.isImplicit())
5051 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::M0;
5052
5053 // SGPRs use the constant bus
5054 return AMDGPU::SReg_32RegClass.contains(Reg) ||
5055 AMDGPU::SReg_64RegClass.contains(Reg);
5056}
5057
5058bool SIInstrInfo::regUsesConstantBus(const MachineOperand &RegOp,
5059 const MachineRegisterInfo &MRI) const {
5060 Register Reg = RegOp.getReg();
5061 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5062 : physRegUsesConstantBus(RegOp);
5063}
5064
5065bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
5066 const MachineOperand &MO,
5067 const MCOperandInfo &OpInfo) const {
5068 // Literal constants use the constant bus.
5069 if (!MO.isReg())
5070 return !isInlineConstant(MO, OpInfo);
5071
5072 Register Reg = MO.getReg();
5073 return Reg.isVirtual() ? RI.isSGPRClass(MRI.getRegClass(Reg))
5074 : physRegUsesConstantBus(MO);
5075}
5076
5077Register SIInstrInfo::findImplicitSGPRRead(const MachineInstr &MI) const {
5078 for (const MachineOperand &MO : MI.implicit_operands()) {
5079 // We only care about reads.
5080 if (MO.isDef())
5081 continue;
5082
5083 switch (MO.getReg()) {
5084 case AMDGPU::VCC:
5085 case AMDGPU::VCC_LO:
5086 case AMDGPU::VCC_HI:
5087 case AMDGPU::M0:
5088 case AMDGPU::FLAT_SCR:
5089 return MO.getReg();
5090
5091 default:
5092 break;
5093 }
5094 }
5095
5096 return Register();
5097}
5098
5099static bool shouldReadExec(const MachineInstr &MI) {
5100 if (SIInstrInfo::isVALU(MI)) {
5101 switch (MI.getOpcode()) {
5102 case AMDGPU::V_READLANE_B32:
5103 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
5104 case AMDGPU::V_WRITELANE_B32:
5105 case AMDGPU::SI_SPILL_S32_TO_VGPR:
5106 return false;
5107 }
5108
5109 return true;
5110 }
5111
5112 if (MI.isPreISelOpcode() ||
5113 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
5114 SIInstrInfo::isSALU(MI) ||
5115 SIInstrInfo::isSMRD(MI))
5116 return false;
5117
5118 return true;
5119}
5120
5121static bool isRegOrFI(const MachineOperand &MO) {
5122 return MO.isReg() || MO.isFI();
5123}
5124
5125static bool isSubRegOf(const SIRegisterInfo &TRI,
5126 const MachineOperand &SuperVec,
5127 const MachineOperand &SubReg) {
5128 if (SubReg.getReg().isPhysical())
5129 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
5130
5131 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
5132 SubReg.getReg() == SuperVec.getReg();
5133}
5134
5135// Verify the illegal copy from vector register to SGPR for generic opcode COPY
5136bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
5137 const MachineRegisterInfo &MRI,
5138 StringRef &ErrInfo) const {
5139 Register DstReg = MI.getOperand(0).getReg();
5140 Register SrcReg = MI.getOperand(1).getReg();
5141 // This is a check for copy from vector register to SGPR
5142 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
5143 ErrInfo = "illegal copy from vector register to SGPR";
5144 return false;
5145 }
5146 return true;
5147}
5148
5149bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
5150 StringRef &ErrInfo) const {
5151 uint32_t Opcode = MI.getOpcode();
5152 const MachineFunction *MF = MI.getMF();
5153 const MachineRegisterInfo &MRI = MF->getRegInfo();
5154
5155 // FIXME: At this point the COPY verify is done only for non-ssa forms.
5156 // Find a better property to recognize the point where instruction selection
5157 // is just done.
5158 // We can only enforce this check after SIFixSGPRCopies pass so that the
5159 // illegal copies are legalized and thereafter we don't expect a pass
5160 // inserting similar copies.
5161 if (!MRI.isSSA() && MI.isCopy())
5162 return verifyCopy(MI, MRI, ErrInfo);
5163
5164 if (SIInstrInfo::isGenericOpcode(Opcode))
5165 return true;
5166
5167 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
5168 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
5169 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
5170 int Src3Idx = -1;
5171 if (Src0Idx == -1) {
5172 // VOPD V_DUAL_* instructions use different operand names.
5173 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
5174 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
5175 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
5176 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
5177 }
5178
5179 // Make sure the number of operands is correct.
5180 const MCInstrDesc &Desc = get(Opcode);
5181 if (!Desc.isVariadic() &&
5182 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
5183 ErrInfo = "Instruction has wrong number of operands.";
5184 return false;
5185 }
5186
5187 if (MI.isInlineAsm()) {
5188 // Verify register classes for inlineasm constraints.
5189 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
5190 I != E; ++I) {
5191 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
5192 if (!RC)
5193 continue;
5194
5195 const MachineOperand &Op = MI.getOperand(I);
5196 if (!Op.isReg())
5197 continue;
5198
5199 Register Reg = Op.getReg();
5200 if (!Reg.isVirtual() && !RC->contains(Reg)) {
5201 ErrInfo = "inlineasm operand has incorrect register class.";
5202 return false;
5203 }
5204 }
5205
5206 return true;
5207 }
5208
5209 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
5210 ErrInfo = "missing memory operand from image instruction.";
5211 return false;
5212 }
5213
5214 // Make sure the register classes are correct.
5215 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
5216 const MachineOperand &MO = MI.getOperand(i);
5217 if (MO.isFPImm()) {
5218 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
5219 "all fp values to integers.";
5220 return false;
5221 }
5222
5223 const MCOperandInfo &OpInfo = Desc.operands()[i];
5224 int16_t RegClass = getOpRegClassID(OpInfo);
5225
5226 switch (OpInfo.OperandType) {
5228 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
5229 ErrInfo = "Illegal immediate value for operand.";
5230 return false;
5231 }
5232 break;
5246 break;
5248 break;
5249 break;
5263 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
5264 ErrInfo = "Illegal immediate value for operand.";
5265 return false;
5266 }
5267 break;
5268 }
5271 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
5272 ErrInfo = "Expected inline constant for operand.";
5273 return false;
5274 }
5275 break;
5278 break;
5283 // Check if this operand is an immediate.
5284 // FrameIndex operands will be replaced by immediates, so they are
5285 // allowed.
5286 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
5287 ErrInfo = "Expected immediate, but got non-immediate";
5288 return false;
5289 }
5290 break;
5294 break;
5295 default:
5296 if (OpInfo.isGenericType())
5297 continue;
5298 break;
5299 }
5300
5301 if (!MO.isReg())
5302 continue;
5303 Register Reg = MO.getReg();
5304 if (!Reg)
5305 continue;
5306
5307 // FIXME: Ideally we would have separate instruction definitions with the
5308 // aligned register constraint.
5309 // FIXME: We do not verify inline asm operands, but custom inline asm
5310 // verification is broken anyway
5311 if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
5312 Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
5313 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
5314 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
5315 if (const TargetRegisterClass *SubRC =
5316 RI.getSubRegisterClass(RC, MO.getSubReg())) {
5317 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
5318 if (RC)
5319 RC = SubRC;
5320 }
5321 }
5322
5323 // Check that this is the aligned version of the class.
5324 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
5325 ErrInfo = "Subtarget requires even aligned vector registers";
5326 return false;
5327 }
5328 }
5329
5330 if (RegClass != -1) {
5331 if (Reg.isVirtual())
5332 continue;
5333
5334 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
5335 if (!RC->contains(Reg)) {
5336 ErrInfo = "Operand has incorrect register class.";
5337 return false;
5338 }
5339 }
5340 }
5341
5342 // Verify SDWA
5343 if (isSDWA(MI)) {
5344 if (!ST.hasSDWA()) {
5345 ErrInfo = "SDWA is not supported on this target";
5346 return false;
5347 }
5348
5349 for (auto Op : {AMDGPU::OpName::src0_sel, AMDGPU::OpName::src1_sel,
5350 AMDGPU::OpName::dst_sel}) {
5351 const MachineOperand *MO = getNamedOperand(MI, Op);
5352 if (!MO)
5353 continue;
5354 int64_t Imm = MO->getImm();
5355 if (Imm < 0 || Imm > AMDGPU::SDWA::SdwaSel::DWORD) {
5356 ErrInfo = "Invalid SDWA selection";
5357 return false;
5358 }
5359 }
5360
5361 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
5362
5363 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
5364 if (OpIdx == -1)
5365 continue;
5366 const MachineOperand &MO = MI.getOperand(OpIdx);
5367
5368 if (!ST.hasSDWAScalar()) {
5369 // Only VGPRs on VI
5370 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
5371 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
5372 return false;
5373 }
5374 } else {
5375 // No immediates on GFX9
5376 if (!MO.isReg()) {
5377 ErrInfo =
5378 "Only reg allowed as operands in SDWA instructions on GFX9+";
5379 return false;
5380 }
5381 }
5382 }
5383
5384 if (!ST.hasSDWAOmod()) {
5385 // No omod allowed on VI
5386 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5387 if (OMod != nullptr &&
5388 (!OMod->isImm() || OMod->getImm() != 0)) {
5389 ErrInfo = "OMod not allowed in SDWA instructions on VI";
5390 return false;
5391 }
5392 }
5393
5394 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
5395 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
5396 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
5397 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
5398 const MachineOperand *Src0ModsMO =
5399 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
5400 unsigned Mods = Src0ModsMO->getImm();
5401 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
5402 Mods & SISrcMods::SEXT) {
5403 ErrInfo = "sext, abs and neg are not allowed on this instruction";
5404 return false;
5405 }
5406 }
5407
5408 uint32_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
5409 if (isVOPC(BasicOpcode)) {
5410 if (!ST.hasSDWASdst() && DstIdx != -1) {
5411 // Only vcc allowed as dst on VI for VOPC
5412 const MachineOperand &Dst = MI.getOperand(DstIdx);
5413 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
5414 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
5415 return false;
5416 }
5417 } else if (!ST.hasSDWAOutModsVOPC()) {
5418 // No clamp allowed on GFX9 for VOPC
5419 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
5420 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
5421 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
5422 return false;
5423 }
5424
5425 // No omod allowed on GFX9 for VOPC
5426 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
5427 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
5428 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
5429 return false;
5430 }
5431 }
5432 }
5433
5434 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
5435 if (DstUnused && DstUnused->isImm() &&
5436 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
5437 const MachineOperand &Dst = MI.getOperand(DstIdx);
5438 if (!Dst.isReg() || !Dst.isTied()) {
5439 ErrInfo = "Dst register should have tied register";
5440 return false;
5441 }
5442
5443 const MachineOperand &TiedMO =
5444 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
5445 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
5446 ErrInfo =
5447 "Dst register should be tied to implicit use of preserved register";
5448 return false;
5449 }
5450 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
5451 ErrInfo = "Dst register should use same physical register as preserved";
5452 return false;
5453 }
5454 }
5455 }
5456
5457 // Verify MIMG / VIMAGE / VSAMPLE
5458 if (isImage(Opcode) && !MI.mayStore()) {
5459 // Ensure that the return type used is large enough for all the options
5460 // being used. TFE/LWE require an extra result register.
5461 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
5462 if (DMask) {
5463 uint64_t DMaskImm = DMask->getImm();
5464 uint32_t RegCount = isGather4(Opcode) ? 4 : llvm::popcount(DMaskImm);
5465 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5466 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5467 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5468
5469 // Adjust for packed 16 bit values
5470 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5471 RegCount = divideCeil(RegCount, 2);
5472
5473 // Adjust if using LWE or TFE
5474 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5475 RegCount += 1;
5476
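      // Worked example (illustrative): a non-gather4 load with dmask = 0b1011
      // produces popcount(0b1011) = 3 result dwords; with packed D16 that
      // becomes divideCeil(3, 2) = 2, and enabling TFE or LWE adds one more,
      // so the vdata register class must cover at least that many dwords.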
5477 const uint32_t DstIdx =
5478 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
5479 const MachineOperand &Dst = MI.getOperand(DstIdx);
5480 if (Dst.isReg()) {
5481 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5482 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5483 if (RegCount > DstSize) {
5484 ErrInfo = "Image instruction returns too many registers for dst "
5485 "register class";
5486 return false;
5487 }
5488 }
5489 }
5490 }
5491
5492 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5493 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5494 unsigned ConstantBusCount = 0;
5495 bool UsesLiteral = false;
5496 const MachineOperand *LiteralVal = nullptr;
5497
5498 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5499 if (ImmIdx != -1) {
5500 ++ConstantBusCount;
5501 UsesLiteral = true;
5502 LiteralVal = &MI.getOperand(ImmIdx);
5503 }
5504
5505 SmallVector<Register, 2> SGPRsUsed;
5506 Register SGPRUsed;
5507
5508 // Only look at the true operands. Only a real operand can use the constant
5509 // bus, and we don't want to check pseudo-operands like the source modifier
5510 // flags.
5511 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5512 if (OpIdx == -1)
5513 continue;
5514 const MachineOperand &MO = MI.getOperand(OpIdx);
5515 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5516 if (MO.isReg()) {
5517 SGPRUsed = MO.getReg();
5518 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5519 ++ConstantBusCount;
5520 SGPRsUsed.push_back(SGPRUsed);
5521 }
5522 } else if (!MO.isFI()) { // Treat FI like a register.
5523 if (!UsesLiteral) {
5524 ++ConstantBusCount;
5525 UsesLiteral = true;
5526 LiteralVal = &MO;
5527 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5528 assert(isVOP2(MI) || isVOP3(MI));
5529 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5530 return false;
5531 }
5532 }
5533 }
5534 }
5535
5536 SGPRUsed = findImplicitSGPRRead(MI);
5537 if (SGPRUsed) {
5538 // Implicit uses may safely overlap true operands
5539 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5540 return !RI.regsOverlap(SGPRUsed, SGPR);
5541 })) {
5542 ++ConstantBusCount;
5543 SGPRsUsed.push_back(SGPRUsed);
5544 }
5545 }
5546
5547 // v_writelane_b32 is an exception to the constant bus restriction:
5548 // vsrc0 can be an SGPR, a constant or m0, and the lane select an SGPR, m0 or an inline constant.
5549 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5550 Opcode != AMDGPU::V_WRITELANE_B32) {
5551 ErrInfo = "VOP* instruction violates constant bus restriction";
5552 return false;
5553 }
5554
5555 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5556 ErrInfo = "VOP3 instruction uses literal";
5557 return false;
5558 }
5559 }
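    // Rough example of what the count above rejects (assuming a constant bus
    // limit of 1, i.e. targets before GFX10): "v_add_f32 v0, s0, s1" needs the
    // constant bus for two different SGPRs, and an instruction mixing an SGPR
    // with a non-inline literal is rejected for the same reason.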
5560
5561 // Special case for writelane - this can break the multiple constant bus rule,
5562 // but still can't use more than one SGPR register
5563 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5564 unsigned SGPRCount = 0;
5565 Register SGPRUsed;
5566
5567 for (int OpIdx : {Src0Idx, Src1Idx}) {
5568 if (OpIdx == -1)
5569 break;
5570
5571 const MachineOperand &MO = MI.getOperand(OpIdx);
5572
5573 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5574 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5575 if (MO.getReg() != SGPRUsed)
5576 ++SGPRCount;
5577 SGPRUsed = MO.getReg();
5578 }
5579 }
5580 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5581 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5582 return false;
5583 }
5584 }
5585 }
5586
5587 // Verify misc. restrictions on specific instructions.
5588 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5589 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5590 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5591 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5592 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5593 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5594 if (!compareMachineOp(Src0, Src1) &&
5595 !compareMachineOp(Src0, Src2)) {
5596 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5597 return false;
5598 }
5599 }
5600 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5601 SISrcMods::ABS) ||
5602 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5603 SISrcMods::ABS) ||
5604 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5605 SISrcMods::ABS)) {
5606 ErrInfo = "ABS not allowed in VOP3B instructions";
5607 return false;
5608 }
5609 }
5610
5611 if (isSOP2(MI) || isSOPC(MI)) {
5612 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5613 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5614
5615 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5616 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5617 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5618 !Src0.isIdenticalTo(Src1)) {
5619 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5620 return false;
5621 }
5622 }
5623
5624 if (isSOPK(MI)) {
5625 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5626 if (Desc.isBranch()) {
5627 if (!Op->isMBB()) {
5628 ErrInfo = "invalid branch target for SOPK instruction";
5629 return false;
5630 }
5631 } else {
5632 uint64_t Imm = Op->getImm();
5633 if (sopkIsZext(Opcode)) {
5634 if (!isUInt<16>(Imm)) {
5635 ErrInfo = "invalid immediate for SOPK instruction";
5636 return false;
5637 }
5638 } else {
5639 if (!isInt<16>(Imm)) {
5640 ErrInfo = "invalid immediate for SOPK instruction";
5641 return false;
5642 }
5643 }
5644 }
5645 }
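  // For example (illustrative): s_movk_i32 sign-extends its simm16, so an
  // immediate of -4 is accepted through the isInt<16> path, while a
  // zero-extending SOPK opcode must instead pass isUInt<16>, e.g. 0xFFFF.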
5646
5647 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5648 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5649 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5650 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5651 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5652 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5653
5654 const unsigned StaticNumOps =
5655 Desc.getNumOperands() + Desc.implicit_uses().size();
5656 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5657
5658 // Require additional implicit operands. This allows a fixup done by the
5659 // post RA scheduler where the main implicit operand is killed and
5660 // implicit-defs are added for sub-registers that remain live after this
5661 // instruction.
5662 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5663 ErrInfo = "missing implicit register operands";
5664 return false;
5665 }
5666
5667 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5668 if (IsDst) {
5669 if (!Dst->isUse()) {
5670 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5671 return false;
5672 }
5673
5674 unsigned UseOpIdx;
5675 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5676 UseOpIdx != StaticNumOps + 1) {
5677 ErrInfo = "movrel implicit operands should be tied";
5678 return false;
5679 }
5680 }
5681
5682 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5683 const MachineOperand &ImpUse
5684 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5685 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5686 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5687 ErrInfo = "src0 should be subreg of implicit vector use";
5688 return false;
5689 }
5690 }
5691
5692 // Make sure we aren't losing exec uses in the td files. This mostly requires
5693 // being careful when using let Uses to try to add other use registers.
5694 if (shouldReadExec(MI)) {
5695 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5696 ErrInfo = "VALU instruction does not implicitly read exec mask";
5697 return false;
5698 }
5699 }
5700
5701 if (isSMRD(MI)) {
5702 if (MI.mayStore() &&
5703 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5704 // The register offset form of scalar stores may only use m0 as the
5705 // soffset register.
5706 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5707 if (Soff && Soff->getReg() != AMDGPU::M0) {
5708 ErrInfo = "scalar stores must use m0 as offset register";
5709 return false;
5710 }
5711 }
5712 }
5713
5714 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5715 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5716 if (Offset->getImm() != 0) {
5717 ErrInfo = "subtarget does not support offsets in flat instructions";
5718 return false;
5719 }
5720 }
5721
5722 if (isDS(MI) && !ST.hasGDS()) {
5723 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5724 if (GDSOp && GDSOp->getImm() != 0) {
5725 ErrInfo = "GDS is not supported on this subtarget";
5726 return false;
5727 }
5728 }
5729
5730 if (isImage(MI)) {
5731 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5732 if (DimOp) {
5733 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5734 AMDGPU::OpName::vaddr0);
5735 AMDGPU::OpName RSrcOpName =
5736 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5737 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5738 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5739 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5740 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5741 const AMDGPU::MIMGDimInfo *Dim =
5742 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5743
5744 if (!Dim) {
5745 ErrInfo = "dim is out of range";
5746 return false;
5747 }
5748
5749 bool IsA16 = false;
5750 if (ST.hasR128A16()) {
5751 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5752 IsA16 = R128A16->getImm() != 0;
5753 } else if (ST.hasA16()) {
5754 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5755 IsA16 = A16->getImm() != 0;
5756 }
5757
5758 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5759
5760 unsigned AddrWords =
5761 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5762
5763 unsigned VAddrWords;
5764 if (IsNSA) {
5765 VAddrWords = RsrcIdx - VAddr0Idx;
5766 if (ST.hasPartialNSAEncoding() &&
5767 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5768 unsigned LastVAddrIdx = RsrcIdx - 1;
5769 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5770 }
5771 } else {
5772 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5773 if (AddrWords > 12)
5774 AddrWords = 16;
5775 }
5776
5777 if (VAddrWords != AddrWords) {
5778 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5779 << " but got " << VAddrWords << "\n");
5780 ErrInfo = "bad vaddr size";
5781 return false;
5782 }
5783 }
5784 }
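  // Rough example: a basic 2D sample needs two address words (x, y). With the
  // NSA encoding each word is its own vaddr operand, so VAddrWords is just the
  // operand count between vaddr0 and the resource descriptor; with the packed
  // encoding it is the size in dwords of the single vaddr0 register.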
5785
5786 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5787 if (DppCt) {
5788 using namespace AMDGPU::DPP;
5789
5790 unsigned DC = DppCt->getImm();
5791 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5792 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5793 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5794 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5795 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5796 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5797 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5798 ErrInfo = "Invalid dpp_ctrl value";
5799 return false;
5800 }
5801 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5802 !ST.hasDPPWavefrontShifts()) {
5803 ErrInfo = "Invalid dpp_ctrl value: "
5804 "wavefront shifts are not supported on GFX10+";
5805 return false;
5806 }
5807 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5808 !ST.hasDPPBroadcasts()) {
5809 ErrInfo = "Invalid dpp_ctrl value: "
5810 "broadcasts are not supported on GFX10+";
5811 return false;
5812 }
5813 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5814 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5815 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5816 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5817 !ST.hasGFX90AInsts()) {
5818 ErrInfo = "Invalid dpp_ctrl value: "
5819 "row_newbroadcast/row_share is not supported before "
5820 "GFX90A/GFX10";
5821 return false;
5822 }
5823 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5824 ErrInfo = "Invalid dpp_ctrl value: "
5825 "row_share and row_xmask are not supported before GFX10";
5826 return false;
5827 }
5828 }
5829
5830 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5832 AMDGPU::isDPALU_DPP(Desc, *this, ST)) {
5833 ErrInfo = "Invalid dpp_ctrl value: "
5834 "DP ALU dpp only support row_newbcast";
5835 return false;
5836 }
5837 }
5838
5839 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5840 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5841 AMDGPU::OpName DataName =
5842 isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata;
5843 const MachineOperand *Data = getNamedOperand(MI, DataName);
5844 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5845 if (Data && !Data->isReg())
5846 Data = nullptr;
5847
5848 if (ST.hasGFX90AInsts()) {
5849 if (Dst && Data && !Dst->isTied() && !Data->isTied() &&
5850 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5851 ErrInfo = "Invalid register class: "
5852 "vdata and vdst should be both VGPR or AGPR";
5853 return false;
5854 }
5855 if (Data && Data2 &&
5856 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5857 ErrInfo = "Invalid register class: "
5858 "both data operands should be VGPR or AGPR";
5859 return false;
5860 }
5861 } else {
5862 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5863 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5864 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5865 ErrInfo = "Invalid register class: "
5866 "agpr loads and stores not supported on this GPU";
5867 return false;
5868 }
5869 }
5870 }
5871
5872 if (ST.needsAlignedVGPRs()) {
5873 const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool {
5874 const MachineOperand *Op = getNamedOperand(MI, OpName);
5875 if (!Op)
5876 return true;
5877 Register Reg = Op->getReg();
5878 if (Reg.isPhysical())
5879 return !(RI.getHWRegIndex(Reg) & 1);
5880 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5881 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5882 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5883 };
5884
5885 if (Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
5886 Opcode == AMDGPU::DS_GWS_BARRIER) {
5887
5888 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5889 ErrInfo = "Subtarget requires even aligned vector registers "
5890 "for DS_GWS instructions";
5891 return false;
5892 }
5893 }
5894
5895 if (isMIMG(MI)) {
5896 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5897 ErrInfo = "Subtarget requires even aligned vector registers "
5898 "for vaddr operand of image instructions";
5899 return false;
5900 }
5901 }
5902 }
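  // For example (illustrative): on subtargets with this requirement a 64-bit
  // data0 operand of DS_GWS_INIT held in the odd-aligned pair v[1:2] fails the
  // check above, while the same value in v[2:3] is accepted.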
5903
5904 if (Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts()) {
5905 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5906 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5907 ErrInfo = "Invalid register class: "
5908 "v_accvgpr_write with an SGPR is not supported on this GPU";
5909 return false;
5910 }
5911 }
5912
5913 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5914 const MachineOperand &SrcOp = MI.getOperand(1);
5915 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5916 ErrInfo = "pseudo expects only physical SGPRs";
5917 return false;
5918 }
5919 }
5920
5921 if (const MachineOperand *CPol = getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5922 if (CPol->getImm() & AMDGPU::CPol::SCAL) {
5923 if (!ST.hasScaleOffset()) {
5924 ErrInfo = "Subtarget does not support offset scaling";
5925 return false;
5926 }
5927 if (!AMDGPU::supportsScaleOffset(*this, MI.getOpcode())) {
5928 ErrInfo = "Instruction does not support offset scaling";
5929 return false;
5930 }
5931 }
5932 }
5933
5934 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
5935 // information.
5936 if (AMDGPU::isPackedFP32Inst(Opcode) && AMDGPU::isGFX12Plus(ST)) {
5937 for (unsigned I = 0; I < 3; ++I) {
5938 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
5939 return false;
5940 }
5941 }
5942
5943 if (ST.hasFlatScratchHiInB64InstHazard() && isSALU(MI) &&
5944 MI.readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, nullptr)) {
5945 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst);
5946 if ((Dst && RI.getRegClassForReg(MRI, Dst->getReg()) ==
5947 &AMDGPU::SReg_64RegClass) ||
5948 Opcode == AMDGPU::S_BITCMP0_B64 || Opcode == AMDGPU::S_BITCMP1_B64) {
5949 ErrInfo = "Instruction cannot read flat_scratch_base_hi";
5950 return false;
5951 }
5952 }
5953
5954 return true;
5955}
5956
5957unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5958 if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
5959 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
5960 return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
5961 ? AMDGPU::COPY
5962 : AMDGPU::V_MOV_B32_e32;
5963 }
5964 return getVALUOp(MI.getOpcode());
5965}
5966
5967// It is more readable to list mapped opcodes on the same line.
5968// clang-format off
5969
5970unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
5971 switch (Opc) {
5972 default: return AMDGPU::INSTRUCTION_LIST_END;
5973 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5974 case AMDGPU::COPY: return AMDGPU::COPY;
5975 case AMDGPU::PHI: return AMDGPU::PHI;
5976 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5977 case AMDGPU::WQM: return AMDGPU::WQM;
5978 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5979 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5980 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5981 case AMDGPU::S_ADD_I32:
5982 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5983 case AMDGPU::S_ADDC_U32:
5984 return AMDGPU::V_ADDC_U32_e32;
5985 case AMDGPU::S_SUB_I32:
5986 return ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5987 // FIXME: These are not consistently handled, and selected when the carry is
5988 // used.
5989 case AMDGPU::S_ADD_U32:
5990 return AMDGPU::V_ADD_CO_U32_e32;
5991 case AMDGPU::S_SUB_U32:
5992 return AMDGPU::V_SUB_CO_U32_e32;
5993 case AMDGPU::S_ADD_U64_PSEUDO:
5994 return AMDGPU::V_ADD_U64_PSEUDO;
5995 case AMDGPU::S_SUB_U64_PSEUDO:
5996 return AMDGPU::V_SUB_U64_PSEUDO;
5997 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5998 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5999 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
6000 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
6001 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
6002 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
6003 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
6004 case AMDGPU::S_XNOR_B32:
6005 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
6006 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
6007 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
6008 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
6009 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
6010 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
6011 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
6012 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
6013 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
6014 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
6015 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
6016 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
6017 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
6018 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
6019 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
6020 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
6021 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
6022 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
6023 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
6024 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
6025 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
6026 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
6027 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
6028 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
6029 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
6030 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
6031 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
6032 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
6033 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
6034 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
6035 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
6036 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
6037 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
6038 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
6039 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
6040 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
6041 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
6042 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
6043 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
6044 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
6045 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
6046 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
6047 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
6048 case AMDGPU::S_CVT_F32_F16:
6049 case AMDGPU::S_CVT_HI_F32_F16:
6050 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
6051 : AMDGPU::V_CVT_F32_F16_fake16_e64;
6052 case AMDGPU::S_CVT_F16_F32:
6053 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
6054 : AMDGPU::V_CVT_F16_F32_fake16_e64;
6055 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
6056 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
6057 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
6058 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
6059 case AMDGPU::S_CEIL_F16:
6060 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
6061 : AMDGPU::V_CEIL_F16_fake16_e64;
6062 case AMDGPU::S_FLOOR_F16:
6063 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
6064 : AMDGPU::V_FLOOR_F16_fake16_e64;
6065 case AMDGPU::S_TRUNC_F16:
6066 return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
6067 : AMDGPU::V_TRUNC_F16_fake16_e64;
6068 case AMDGPU::S_RNDNE_F16:
6069 return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
6070 : AMDGPU::V_RNDNE_F16_fake16_e64;
6071 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
6072 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
6073 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
6074 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
6075 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
6076 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
6077 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
6078 case AMDGPU::S_ADD_F16:
6079 return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
6080 : AMDGPU::V_ADD_F16_fake16_e64;
6081 case AMDGPU::S_SUB_F16:
6082 return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
6083 : AMDGPU::V_SUB_F16_fake16_e64;
6084 case AMDGPU::S_MIN_F16:
6085 return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
6086 : AMDGPU::V_MIN_F16_fake16_e64;
6087 case AMDGPU::S_MAX_F16:
6088 return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
6089 : AMDGPU::V_MAX_F16_fake16_e64;
6090 case AMDGPU::S_MINIMUM_F16:
6091 return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
6092 : AMDGPU::V_MINIMUM_F16_fake16_e64;
6093 case AMDGPU::S_MAXIMUM_F16:
6094 return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
6095 : AMDGPU::V_MAXIMUM_F16_fake16_e64;
6096 case AMDGPU::S_MUL_F16:
6097 return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
6098 : AMDGPU::V_MUL_F16_fake16_e64;
6099 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
6100 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
6101 case AMDGPU::S_FMAC_F16:
6102 return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
6103 : AMDGPU::V_FMAC_F16_fake16_e64;
6104 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
6105 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
6106 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
6107 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
6108 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
6109 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
6110 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
6111 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
6112 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
6113 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
6114 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
6115 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
6116 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
6117 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
6118 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
6119 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
6120 case AMDGPU::S_CMP_LT_F16:
6121 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
6122 : AMDGPU::V_CMP_LT_F16_fake16_e64;
6123 case AMDGPU::S_CMP_EQ_F16:
6124 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
6125 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
6126 case AMDGPU::S_CMP_LE_F16:
6127 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
6128 : AMDGPU::V_CMP_LE_F16_fake16_e64;
6129 case AMDGPU::S_CMP_GT_F16:
6130 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
6131 : AMDGPU::V_CMP_GT_F16_fake16_e64;
6132 case AMDGPU::S_CMP_LG_F16:
6133 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
6134 : AMDGPU::V_CMP_LG_F16_fake16_e64;
6135 case AMDGPU::S_CMP_GE_F16:
6136 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
6137 : AMDGPU::V_CMP_GE_F16_fake16_e64;
6138 case AMDGPU::S_CMP_O_F16:
6139 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
6140 : AMDGPU::V_CMP_O_F16_fake16_e64;
6141 case AMDGPU::S_CMP_U_F16:
6142 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
6143 : AMDGPU::V_CMP_U_F16_fake16_e64;
6144 case AMDGPU::S_CMP_NGE_F16:
6145 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
6146 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
6147 case AMDGPU::S_CMP_NLG_F16:
6148 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
6149 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
6150 case AMDGPU::S_CMP_NGT_F16:
6151 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
6152 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
6153 case AMDGPU::S_CMP_NLE_F16:
6154 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
6155 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
6156 case AMDGPU::S_CMP_NEQ_F16:
6157 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
6158 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
6159 case AMDGPU::S_CMP_NLT_F16:
6160 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
6161 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
6162 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
6163 case AMDGPU::V_S_EXP_F16_e64:
6164 return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
6165 : AMDGPU::V_EXP_F16_fake16_e64;
6166 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
6167 case AMDGPU::V_S_LOG_F16_e64:
6168 return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
6169 : AMDGPU::V_LOG_F16_fake16_e64;
6170 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
6171 case AMDGPU::V_S_RCP_F16_e64:
6172 return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
6173 : AMDGPU::V_RCP_F16_fake16_e64;
6174 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
6175 case AMDGPU::V_S_RSQ_F16_e64:
6176 return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
6177 : AMDGPU::V_RSQ_F16_fake16_e64;
6178 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
6179 case AMDGPU::V_S_SQRT_F16_e64:
6180 return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
6181 : AMDGPU::V_SQRT_F16_fake16_e64;
6182 }
6183 llvm_unreachable(
6184 "Unexpected scalar opcode without corresponding vector one!");
6185}
6186
6187// clang-format on
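// Illustrative use of the table above (assumption about the caller): when an
// SALU result is rewritten to execute per-lane (e.g. by the move-to-VALU
// logic), S_ADD_I32 is replaced with V_ADD_U32_e64 on subtargets that have
// no-carry adds, or with V_ADD_CO_U32_e32 otherwise, exactly as mapped above.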
6188
6189void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
6190 MachineBasicBlock &MBB,
6191 MachineBasicBlock::iterator MBBI,
6192 const DebugLoc &DL, Register Reg,
6193 bool IsSCCLive,
6194 SlotIndexes *Indexes) const {
6195 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6196 const SIInstrInfo *TII = ST.getInstrInfo();
6198 if (IsSCCLive) {
6199 // Insert two move instructions, one to save the original value of EXEC and
6200 // the other to turn on all bits in EXEC. This is required as we can't use
6201 // the single instruction S_OR_SAVEEXEC that clobbers SCC.
6202 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), Reg)
6203 .addReg(LMC.ExecReg);
6204 auto FlipExecMI =
6205 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
6206 if (Indexes) {
6207 Indexes->insertMachineInstrInMaps(*StoreExecMI);
6208 Indexes->insertMachineInstrInMaps(*FlipExecMI);
6209 }
6210 } else {
6211 auto SaveExec =
6212 BuildMI(MBB, MBBI, DL, TII->get(LMC.OrSaveExecOpc), Reg).addImm(-1);
6213 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
6214 if (Indexes)
6215 Indexes->insertMachineInstrInMaps(*SaveExec);
6216 }
6217}
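// Roughly, on a wave64 target this expands to either
//   s_mov_b64 <reg>, exec
//   s_mov_b64 exec, -1
// when SCC must stay live, or to the single (SCC-clobbering)
//   s_or_saveexec_b64 <reg>, -1
// otherwise; wave32 targets use the corresponding _b32 opcodes.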
6218
6219void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
6220 MachineBasicBlock::iterator MBBI,
6221 const DebugLoc &DL, Register Reg,
6222 SlotIndexes *Indexes) const {
6224 auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, get(LMC.MovOpc), LMC.ExecReg)
6225 .addReg(Reg, RegState::Kill);
6226 if (Indexes)
6227 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
6228}
6229
6230MachineInstr *
6231SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const {
6232 assert(MF.getInfo<SIMachineFunctionInfo>()->isWholeWaveFunction() &&
6233 "Not a whole wave func");
6234 MachineBasicBlock &MBB = *MF.begin();
6235 for (MachineInstr &MI : MBB)
6236 if (MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_SETUP ||
6237 MI.getOpcode() == AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
6238 return &MI;
6239
6240 llvm_unreachable("Couldn't find SI_SETUP_WHOLE_WAVE_FUNC instruction");
6241}
6242
6243const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
6244 unsigned OpNo) const {
6245 const MCInstrDesc &Desc = get(MI.getOpcode());
6246 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
6247 Desc.operands()[OpNo].RegClass == -1) {
6248 Register Reg = MI.getOperand(OpNo).getReg();
6249
6250 if (Reg.isVirtual()) {
6251 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6252 return MRI.getRegClass(Reg);
6253 }
6254 return RI.getPhysRegBaseClass(Reg);
6255 }
6256
6257 int16_t RegClass = getOpRegClassID(Desc.operands()[OpNo]);
6258 return RegClass < 0 ? nullptr : RI.getRegClass(RegClass);
6259}
6260
6261void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
6262 MachineBasicBlock::iterator I = MI;
6263 MachineBasicBlock *MBB = MI.getParent();
6264 MachineOperand &MO = MI.getOperand(OpIdx);
6265 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6266 unsigned RCID = getOpRegClassID(get(MI.getOpcode()).operands()[OpIdx]);
6267 const TargetRegisterClass *RC = RI.getRegClass(RCID);
6268 unsigned Size = RI.getRegSizeInBits(*RC);
6269 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
6270 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
6271 : AMDGPU::V_MOV_B32_e32;
6272 if (MO.isReg())
6273 Opcode = AMDGPU::COPY;
6274 else if (RI.isSGPRClass(RC))
6275 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
6276
6277 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
6278 Register Reg = MRI.createVirtualRegister(VRC);
6279 DebugLoc DL = MBB->findDebugLoc(I);
6280 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
6281 MO.ChangeToRegister(Reg, false);
6282}
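// For example (illustrative): if a VOP3 source holds a literal the encoding
// cannot accept, the operand is rewritten roughly as
//   %tmp:vgpr_32 = V_MOV_B32_e32 <literal>
// and the instruction then reads %tmp in place of the immediate.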
6283
6284Register SIInstrInfo::buildExtractSubReg(
6285 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
6286 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
6287 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6288 if (!SuperReg.getReg().isVirtual())
6289 return RI.getSubReg(SuperReg.getReg(), SubIdx);
6290
6291 MachineBasicBlock *MBB = MI->getParent();
6292 const DebugLoc &DL = MI->getDebugLoc();
6293 Register SubReg = MRI.createVirtualRegister(SubRC);
6294
6295 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
6296 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
6297 .addReg(SuperReg.getReg(), {}, NewSubIdx);
6298 return SubReg;
6299}
6300
6301MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
6302 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
6303 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
6304 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
6305 if (Op.isImm()) {
6306 if (SubIdx == AMDGPU::sub0)
6307 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
6308 if (SubIdx == AMDGPU::sub1)
6309 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
6310
6311 llvm_unreachable("Unhandled register index for immediate");
6312 }
6313
6314 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
6315 SubIdx, SubRC);
6316 return MachineOperand::CreateReg(SubReg, false);
6317}
6318
6319// Change the order of operands from (0, 1, 2) to (0, 2, 1)
6320void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
6321 assert(Inst.getNumExplicitOperands() == 3);
6322 MachineOperand Op1 = Inst.getOperand(1);
6323 Inst.removeOperand(1);
6324 Inst.addOperand(Op1);
6325}
6326
6327bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
6328 const MCOperandInfo &OpInfo,
6329 const MachineOperand &MO) const {
6330 if (!MO.isReg())
6331 return false;
6332
6333 Register Reg = MO.getReg();
6334
6335 const TargetRegisterClass *DRC = RI.getRegClass(getOpRegClassID(OpInfo));
6336 if (Reg.isPhysical())
6337 return DRC->contains(Reg);
6338
6339 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
6340
6341 if (MO.getSubReg()) {
6342 const MachineFunction *MF = MO.getParent()->getMF();
6343 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
6344 if (!SuperRC)
6345 return false;
6346 return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
6347 }
6348
6349 return RI.getCommonSubClass(DRC, RC) != nullptr;
6350}
6351
6352bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
6353 const MachineOperand &MO) const {
6354 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
6355 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
6356 unsigned Opc = MI.getOpcode();
6357
6358 // See SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more
6359 // information.
6360 if (AMDGPU::isPackedFP32Inst(MI.getOpcode()) && AMDGPU::isGFX12Plus(ST) &&
6361 MO.isReg() && RI.isSGPRReg(MRI, MO.getReg())) {
6362 constexpr AMDGPU::OpName OpNames[] = {
6363 AMDGPU::OpName::src0, AMDGPU::OpName::src1, AMDGPU::OpName::src2};
6364
6365 for (auto [I, OpName] : enumerate(OpNames)) {
6366 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[I]);
6367 if (static_cast<unsigned>(SrcIdx) == OpIdx &&
6368 !isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, I, &MO))
6369 return false;
6370 }
6371 }
6372
6373 if (!isLegalRegOperand(MRI, OpInfo, MO))
6374 return false;
6375
6376 // Check the accumulator GPR (AGPR) operand.
6377 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
6378 if (IsAGPR && !ST.hasMAIInsts())
6379 return false;
6380 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
6381 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
6382 return false;
6383 // Atomics should have both vdst and vdata either vgpr or agpr.
6384 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
6385 const int DataIdx = AMDGPU::getNamedOperandIdx(
6386 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
6387 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
6388 MI.getOperand(DataIdx).isReg() &&
6389 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
6390 return false;
6391 if ((int)OpIdx == DataIdx) {
6392 if (VDstIdx != -1 &&
6393 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
6394 return false;
6395 // DS instructions with 2 src operands also must have tied RC.
6396 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
6397 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
6398 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
6399 return false;
6400 }
6401
6402 // Check V_ACCVGPR_WRITE_B32_e64
6403 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
6404 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
6405 RI.isSGPRReg(MRI, MO.getReg()))
6406 return false;
6407
6408 if (ST.hasFlatScratchHiInB64InstHazard() &&
6409 MO.getReg() == AMDGPU::SRC_FLAT_SCRATCH_BASE_HI && isSALU(MI)) {
6410 if (const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::sdst)) {
6411 if (AMDGPU::getRegBitWidth(*RI.getRegClassForReg(MRI, Dst->getReg())) ==
6412 64)
6413 return false;
6414 }
6415 if (Opc == AMDGPU::S_BITCMP0_B64 || Opc == AMDGPU::S_BITCMP1_B64)
6416 return false;
6417 }
6418
6419 return true;
6420}
6421
6422bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
6423 const MCOperandInfo &OpInfo,
6424 const MachineOperand &MO) const {
6425 if (MO.isReg())
6426 return isLegalRegOperand(MRI, OpInfo, MO);
6427
6428 // Handle non-register types that are treated like immediates.
6429 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
6430 return true;
6431}
6432
6433bool SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand(
6434 const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN,
6435 const MachineOperand *MO) const {
6436 constexpr unsigned NumOps = 3;
6437 constexpr AMDGPU::OpName OpNames[NumOps * 2] = {
6438 AMDGPU::OpName::src0, AMDGPU::OpName::src1,
6439 AMDGPU::OpName::src2, AMDGPU::OpName::src0_modifiers,
6440 AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers};
6441
6442 assert(SrcN < NumOps);
6443
6444 if (!MO) {
6445 int SrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[SrcN]);
6446 if (SrcIdx == -1)
6447 return true;
6448 MO = &MI.getOperand(SrcIdx);
6449 }
6450
6451 if (!MO->isReg() || !RI.isSGPRReg(MRI, MO->getReg()))
6452 return true;
6453
6454 int ModsIdx =
6455 AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpNames[NumOps + SrcN]);
6456 if (ModsIdx == -1)
6457 return true;
6458
6459 unsigned Mods = MI.getOperand(ModsIdx).getImm();
6460 bool OpSel = Mods & SISrcMods::OP_SEL_0;
6461 bool OpSelHi = Mods & SISrcMods::OP_SEL_1;
6462
6463 return !OpSel && !OpSelHi;
6464}
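// For example (illustrative): on gfx12 a v_pk_add_f32 whose src1 is an SGPR is
// only legal while that operand's op_sel and op_sel_hi bits are both clear;
// once either bit is set, the SGPR has to be copied into a VGPR first.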
6465
6466bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
6467 const MachineOperand *MO) const {
6468 const MachineFunction &MF = *MI.getMF();
6469 const MachineRegisterInfo &MRI = MF.getRegInfo();
6470 const MCInstrDesc &InstDesc = MI.getDesc();
6471 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
6472 int64_t RegClass = getOpRegClassID(OpInfo);
6473 const TargetRegisterClass *DefinedRC =
6474 RegClass != -1 ? RI.getRegClass(RegClass) : nullptr;
6475 if (!MO)
6476 MO = &MI.getOperand(OpIdx);
6477
6478 const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo);
6479
6480 if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) {
6481 const MachineOperand *UsedLiteral = nullptr;
6482
6483 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
6484 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
6485
6486 // TODO: Be more permissive with frame indexes.
6487 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) {
6488 if (!LiteralLimit--)
6489 return false;
6490
6491 UsedLiteral = MO;
6492 }
6493
6494 SmallDenseSet<RegSubRegPair> SGPRsUsed;
6495 if (MO->isReg())
6496 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
6497
6498 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6499 if (i == OpIdx)
6500 continue;
6501 const MachineOperand &Op = MI.getOperand(i);
6502 if (Op.isReg()) {
6503 if (Op.isUse()) {
6504 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
6505 if (regUsesConstantBus(Op, MRI) && SGPRsUsed.insert(SGPR).second) {
6506 if (--ConstantBusLimit <= 0)
6507 return false;
6508 }
6509 }
6510 } else if (AMDGPU::isSISrcOperand(InstDesc.operands()[i]) &&
6511 !isInlineConstant(Op, InstDesc.operands()[i])) {
6512 // The same literal may be used multiple times.
6513 if (!UsedLiteral)
6514 UsedLiteral = &Op;
6515 else if (UsedLiteral->isIdenticalTo(Op))
6516 continue;
6517
6518 if (!LiteralLimit--)
6519 return false;
6520 if (--ConstantBusLimit <= 0)
6521 return false;
6522 }
6523 }
6524 } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) {
6525 // There can be at most one literal operand, but it can be repeated.
6526 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
6527 if (i == OpIdx)
6528 continue;
6529 const MachineOperand &Op = MI.getOperand(i);
6530 if (!Op.isReg() && !Op.isFI() && !Op.isRegMask() &&
6531 !isInlineConstant(Op, InstDesc.operands()[i]) &&
6532 !Op.isIdenticalTo(*MO))
6533 return false;
6534
6535 // Do not fold a non-inlineable, non-register operand into an
6536 // instruction that already has a frame index. The frame index handling
6537 // code cannot cope with a frame index co-existing with another
6538 // non-register operand, unless that operand is an inlineable immediate.
6539 if (Op.isFI())
6540 return false;
6541 }
6542 } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
6543 isF16PseudoScalarTrans(MI.getOpcode())) {
6544 return false;
6545 }
6546
6547 if (MO->isReg()) {
6548 if (!DefinedRC)
6549 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
6550 return isLegalRegOperand(MI, OpIdx, *MO);
6551 }
6552
6553 if (MO->isImm()) {
6554 uint64_t Imm = MO->getImm();
6555 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
6556 bool Is64BitOp = Is64BitFPOp ||
6557 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
6558 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
6559 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
6560 if (Is64BitOp &&
6561 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
6562 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) &&
6563 (!ST.has64BitLiterals() || InstDesc.getSize() != 4))
6564 return false;
6565
6566 // FIXME: We can use sign extended 64-bit literals, but only for signed
6567 // operands. At the moment we do not know if an operand is signed.
6568 // Such an operand will be encoded as its low 32 bits and then either
6569 // correctly sign-extended or incorrectly zero-extended by HW.
6570 // If 64-bit literals are supported and the literal will be encoded
6571 // as a full 64 bits, we can still use it.
6572 if (!Is64BitFPOp && (int32_t)Imm < 0 &&
6573 (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false)))
6574 return false;
6575 }
6576 }
6577
6578 // Handle non-register types that are treated like immediates.
6579 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6580
6581 if (!DefinedRC) {
6582 // This operand expects an immediate.
6583 return true;
6584 }
6585
6586 return isImmOperandLegal(MI, OpIdx, *MO);
6587}
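// For example (illustrative): folding the 64-bit constant 0x3FF0000000000000
// (the double 1.0) into an FP64 operand is always fine because it is an inline
// constant, whereas an arbitrary 64-bit value is only accepted when it fits the
// 32-bit literal rules or the subtarget has true 64-bit literals.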
6588
6590 bool IsGFX950Only = ST.hasGFX950Insts();
6591 bool IsGFX940Only = ST.hasGFX940Insts();
6592
6593 if (!IsGFX950Only && !IsGFX940Only)
6594 return false;
6595
6596 if (!isVALU(MI))
6597 return false;
6598
6599 // V_COS, V_EXP, V_RCP, etc.
6600 if (isTRANS(MI))
6601 return true;
6602
6603 // DOT2, DOT2C, DOT4, etc.
6604 if (isDOT(MI))
6605 return true;
6606
6607 // MFMA, SMFMA
6608 if (isMFMA(MI))
6609 return true;
6610
6611 unsigned Opcode = MI.getOpcode();
6612 switch (Opcode) {
6613 case AMDGPU::V_CVT_PK_BF8_F32_e64:
6614 case AMDGPU::V_CVT_PK_FP8_F32_e64:
6615 case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6616 case AMDGPU::V_MQSAD_U32_U8_e64:
6617 case AMDGPU::V_PK_ADD_F16:
6618 case AMDGPU::V_PK_ADD_F32:
6619 case AMDGPU::V_PK_ADD_I16:
6620 case AMDGPU::V_PK_ADD_U16:
6621 case AMDGPU::V_PK_ASHRREV_I16:
6622 case AMDGPU::V_PK_FMA_F16:
6623 case AMDGPU::V_PK_FMA_F32:
6624 case AMDGPU::V_PK_FMAC_F16_e32:
6625 case AMDGPU::V_PK_FMAC_F16_e64:
6626 case AMDGPU::V_PK_LSHLREV_B16:
6627 case AMDGPU::V_PK_LSHRREV_B16:
6628 case AMDGPU::V_PK_MAD_I16:
6629 case AMDGPU::V_PK_MAD_U16:
6630 case AMDGPU::V_PK_MAX_F16:
6631 case AMDGPU::V_PK_MAX_I16:
6632 case AMDGPU::V_PK_MAX_U16:
6633 case AMDGPU::V_PK_MIN_F16:
6634 case AMDGPU::V_PK_MIN_I16:
6635 case AMDGPU::V_PK_MIN_U16:
6636 case AMDGPU::V_PK_MOV_B32:
6637 case AMDGPU::V_PK_MUL_F16:
6638 case AMDGPU::V_PK_MUL_F32:
6639 case AMDGPU::V_PK_MUL_LO_U16:
6640 case AMDGPU::V_PK_SUB_I16:
6641 case AMDGPU::V_PK_SUB_U16:
6642 case AMDGPU::V_QSAD_PK_U16_U8_e64:
6643 return true;
6644 default:
6645 return false;
6646 }
6647}
6648
6649void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
6650 MachineInstr &MI) const {
6651 unsigned Opc = MI.getOpcode();
6652 const MCInstrDesc &InstrDesc = get(Opc);
6653
6654 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6655 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6656
6657 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6658 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6659
6660 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
6661 // we need to only have one constant bus use before GFX10.
6662 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6663 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6664 RI.isSGPRReg(MRI, Src0.getReg()))
6665 legalizeOpWithMove(MI, Src0Idx);
6666
6667 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6668 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6669 // src0/src1 with V_READFIRSTLANE.
6670 if (Opc == AMDGPU::V_WRITELANE_B32) {
6671 const DebugLoc &DL = MI.getDebugLoc();
6672 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6673 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6674 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6675 .add(Src0);
6676 Src0.ChangeToRegister(Reg, false);
6677 }
6678 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6679 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6680 const DebugLoc &DL = MI.getDebugLoc();
6681 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6682 .add(Src1);
6683 Src1.ChangeToRegister(Reg, false);
6684 }
6685 return;
6686 }
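  // For example (illustrative): "V_WRITELANE_B32 $vdst, %val:vgpr_32, $lane"
  // becomes "%s = V_READFIRSTLANE_B32 %val" followed by the writelane reading
  // %s, so both the written value and the lane select end up in SGPRs.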
6687
6688 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6689 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6690 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6691 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6692 legalizeOpWithMove(MI, Src2Idx);
6693 }
6694
6695 // VOP2 src0 instructions support all operand types, so we don't need to check
6696 // their legality. If src1 is already legal, we don't need to do anything.
6697 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6698 return;
6699
6700 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6701 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6702 // select is uniform.
6703 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6704 RI.isVGPR(MRI, Src1.getReg())) {
6705 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6706 const DebugLoc &DL = MI.getDebugLoc();
6707 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6708 .add(Src1);
6709 Src1.ChangeToRegister(Reg, false);
6710 return;
6711 }
6712
6713 // We do not use commuteInstruction here because it is too aggressive and will
6714 // commute if it is possible. We only want to commute here if it improves
6715 // legality. This can be called a fairly large number of times so don't waste
6716 // compile time pointlessly swapping and checking legality again.
6717 if (HasImplicitSGPR || !MI.isCommutable()) {
6718 legalizeOpWithMove(MI, Src1Idx);
6719 return;
6720 }
6721
6722 // If src0 can be used as src1, commuting will make the operands legal.
6723 // Otherwise we have to give up and insert a move.
6724 //
6725 // TODO: Other immediate-like operand kinds could be commuted if there was a
6726 // MachineOperand::ChangeTo* for them.
6727 if ((!Src1.isImm() && !Src1.isReg()) ||
6728 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6729 legalizeOpWithMove(MI, Src1Idx);
6730 return;
6731 }
6732
6733 int CommutedOpc = commuteOpcode(MI);
6734 if (CommutedOpc == -1) {
6735 legalizeOpWithMove(MI, Src1Idx);
6736 return;
6737 }
6738
6739 MI.setDesc(get(CommutedOpc));
6740
6741 Register Src0Reg = Src0.getReg();
6742 unsigned Src0SubReg = Src0.getSubReg();
6743 bool Src0Kill = Src0.isKill();
6744
6745 if (Src1.isImm())
6746 Src0.ChangeToImmediate(Src1.getImm());
6747 else if (Src1.isReg()) {
6748 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6749 Src0.setSubReg(Src1.getSubReg());
6750 } else
6751 llvm_unreachable("Should only have register or immediate operands");
6752
6753 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6754 Src1.setSubReg(Src0SubReg);
6755 fixImplicitOperands(MI);
6756}
6757
6758// Legalize VOP3 operands. All operand types are supported for any operand
6759// but only one literal constant and only starting from GFX10.
6760void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
6761 MachineInstr &MI) const {
6762 unsigned Opc = MI.getOpcode();
6763
6764 int VOP3Idx[3] = {
6765 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6766 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6767 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6768 };
6769
6770 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6771 Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
6772 Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
6773 Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
6774 Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
6775 Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
6776 Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
6777 // src1 and src2 must be scalar
6778 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6779 const DebugLoc &DL = MI.getDebugLoc();
6780 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6781 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6782 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6783 .add(Src1);
6784 Src1.ChangeToRegister(Reg, false);
6785 }
6786 if (VOP3Idx[2] != -1) {
6787 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6788 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6789 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6790 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6791 .add(Src2);
6792 Src2.ChangeToRegister(Reg, false);
6793 }
6794 }
6795 }
6796
6797 // Find the one SGPR operand we are allowed to use.
6798 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6799 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6800 SmallDenseSet<unsigned> SGPRsUsed;
6801 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6802 if (SGPRReg) {
6803 SGPRsUsed.insert(SGPRReg);
6804 --ConstantBusLimit;
6805 }
6806
6807 for (int Idx : VOP3Idx) {
6808 if (Idx == -1)
6809 break;
6810 MachineOperand &MO = MI.getOperand(Idx);
6811
6812 if (!MO.isReg()) {
6813 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6814 continue;
6815
6816 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6817 --LiteralLimit;
6818 --ConstantBusLimit;
6819 continue;
6820 }
6821
6822 --LiteralLimit;
6823 --ConstantBusLimit;
6824 legalizeOpWithMove(MI, Idx);
6825 continue;
6826 }
6827
6828 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6829 continue; // VGPRs are legal
6830
6831 // We can use one SGPR in each VOP3 instruction prior to GFX10
6832 // and two starting from GFX10.
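 // For example (illustrative): prior to GFX10, v_fma_f32 v0, s1, s2, v3 reads
 // two different SGPRs, so one of them has to be copied to a VGPR via
 // legalizeOpWithMove; on GFX10+ the same instruction is already legal.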
6833 if (SGPRsUsed.count(MO.getReg()))
6834 continue;
6835 if (ConstantBusLimit > 0) {
6836 SGPRsUsed.insert(MO.getReg());
6837 --ConstantBusLimit;
6838 continue;
6839 }
6840
6841 // If we make it this far, then the operand is not legal and we must
6842 // legalize it.
6843 legalizeOpWithMove(MI, Idx);
6844 }
6845
6846 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6847 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6848 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6849 legalizeOpWithMove(MI, VOP3Idx[2]);
6850
6851 // Fix the register class of packed FP32 instructions on gfx12+. See
6852 // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
6854 for (unsigned I = 0; I < 3; ++I) {
6855 if (!isLegalGFX12PlusPackedMathFP32Operand(MRI, MI, /*SrcN=*/I))
6856 legalizeOpWithMove(MI, VOP3Idx[I]);
6857 }
6858 }
6859}
6860
6861Register SIInstrInfo::readlaneVGPRToSGPR(
6862 Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI,
6863 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6864 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6865 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6866 if (DstRC)
6867 SRC = RI.getCommonSubClass(SRC, DstRC);
6868
6869 Register DstReg = MRI.createVirtualRegister(SRC);
6870 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6871
6872 if (RI.hasAGPRs(VRC)) {
6873 VRC = RI.getEquivalentVGPRClass(VRC);
6874 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6875 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6876 get(TargetOpcode::COPY), NewSrcReg)
6877 .addReg(SrcReg);
6878 SrcReg = NewSrcReg;
6879 }
6880
6881 if (SubRegs == 1) {
6882 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6883 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6884 .addReg(SrcReg);
6885 return DstReg;
6886 }
6887
6888 SmallVector<Register, 8> SRegs;
6889 for (unsigned i = 0; i < SubRegs; ++i) {
6890 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6891 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6892 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6893 .addReg(SrcReg, {}, RI.getSubRegFromChannel(i));
6894 SRegs.push_back(SGPR);
6895 }
6896
6897 MachineInstrBuilder MIB =
6898 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6899 get(AMDGPU::REG_SEQUENCE), DstReg);
6900 for (unsigned i = 0; i < SubRegs; ++i) {
6901 MIB.addReg(SRegs[i]);
6902 MIB.addImm(RI.getSubRegFromChannel(i));
6903 }
6904 return DstReg;
6905}
6906
6907void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6908 MachineInstr &MI) const {
6909
6910 // If the pointer is stored in VGPRs, then we need to move it to
6911 // SGPRs using v_readfirstlane. This is safe because we only select
6912 // loads with uniform pointers to SMRD instructions, so we know the
6913 // pointer value is uniform.
6914 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6915 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6916 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6917 SBase->setReg(SGPR);
6918 }
6919 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6920 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6921 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6922 SOff->setReg(SGPR);
6923 }
6924}
6925
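// If the saddr operand of a GLOBAL/SCRATCH instruction turns out to live in a
// VGPR, try to rewrite the instruction to its vaddr-only form (the _SADDR
// opcode is swapped for the corresponding plain-vaddr opcode) so the address
// can stay in a VGPR. Returns false if the rewrite is not possible or not
// needed.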
6926bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6927 unsigned Opc = Inst.getOpcode();
6928 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6929 if (OldSAddrIdx < 0)
6930 return false;
6931
6932 assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
6933
6934 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6935 if (NewOpc < 0)
6936 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6937 if (NewOpc < 0)
6938 return false;
6939
6940 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6941 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6942 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6943 return false;
6944
6945 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6946 if (NewVAddrIdx < 0)
6947 return false;
6948
6949 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6950
6951 // Check vaddr, it shall be zero or absent.
6952 MachineInstr *VAddrDef = nullptr;
6953 if (OldVAddrIdx >= 0) {
6954 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6955 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6956 if (!VAddrDef || !VAddrDef->isMoveImmediate() ||
6957 !VAddrDef->getOperand(1).isImm() ||
6958 VAddrDef->getOperand(1).getImm() != 0)
6959 return false;
6960 }
6961
6962 const MCInstrDesc &NewDesc = get(NewOpc);
6963 Inst.setDesc(NewDesc);
6964
6965 // Callers expect iterator to be valid after this call, so modify the
6966 // instruction in place.
6967 if (OldVAddrIdx == NewVAddrIdx) {
6968 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6969 // Clear use list from the old vaddr holding a zero register.
6970 MRI.removeRegOperandFromUseList(&NewVAddr);
6971 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6972 Inst.removeOperand(OldSAddrIdx);
6973 // Update the use list with the pointer we have just moved from vaddr to
6974 // saddr position. Otherwise new vaddr will be missing from the use list.
6975 MRI.removeRegOperandFromUseList(&NewVAddr);
6976 MRI.addRegOperandToUseList(&NewVAddr);
6977 } else {
6978 assert(OldSAddrIdx == NewVAddrIdx);
6979
6980 if (OldVAddrIdx >= 0) {
6981 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6982 AMDGPU::OpName::vdst_in);
6983
6984 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6985 // it asserts. Untie the operands for now and retie them afterwards.
6986 if (NewVDstIn != -1) {
6987 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6988 Inst.untieRegOperand(OldVDstIn);
6989 }
6990
6991 Inst.removeOperand(OldVAddrIdx);
6992
6993 if (NewVDstIn != -1) {
6994 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6995 Inst.tieOperands(NewVDst, NewVDstIn);
6996 }
6997 }
6998 }
6999
7000 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
7001 VAddrDef->eraseFromParent();
7002
7003 return true;
7004}
7005
7006// FIXME: Remove this when SelectionDAG is obsoleted.
7007void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
7008 MachineInstr &MI) const {
7009 if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
7010 return;
7011
7012 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
7013 // thinks they are uniform, so a readfirstlane should be valid.
7014 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
7015 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
7016 return;
7017
7018 if (moveFlatAddrToVGPR(MI))
7019 return;
7020
7021 const TargetRegisterClass *DeclaredRC =
7022 getRegClass(MI.getDesc(), SAddr->getOperandNo());
7023
7024 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
7025 SAddr->setReg(ToSGPR);
7026}
7027
7028void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
7029 MachineBasicBlock::iterator I,
7030 const TargetRegisterClass *DstRC,
7031 MachineOperand &Op,
7032 MachineRegisterInfo &MRI,
7033 const DebugLoc &DL) const {
7034 Register OpReg = Op.getReg();
7035 unsigned OpSubReg = Op.getSubReg();
7036
7037 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
7038 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
7039
7040 // Check if operand is already the correct register class.
7041 if (DstRC == OpRC)
7042 return;
7043
7044 Register DstReg = MRI.createVirtualRegister(DstRC);
7045 auto Copy =
7046 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
7047 Op.setReg(DstReg);
7048
7049 MachineInstr *Def = MRI.getVRegDef(OpReg);
7050 if (!Def)
7051 return;
7052
7053 // Try to eliminate the copy if it is copying an immediate value.
7054 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
7055 foldImmediate(*Copy, *Def, OpReg, &MRI);
7056
7057 bool ImpDef = Def->isImplicitDef();
7058 while (!ImpDef && Def && Def->isCopy()) {
7059 if (Def->getOperand(1).getReg().isPhysical())
7060 break;
7061 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
7062 ImpDef = Def && Def->isImplicitDef();
7063 }
7064 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
7065 !ImpDef)
7066 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
7067}
7068
7069// Emit the actual waterfall loop, executing the wrapped instruction for each
7070// unique value of \p ScalarOps across all lanes. In the best case we execute 1
7071// iteration, in the worst case we execute 64 (once per lane).
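// A rough sketch of one loop iteration for a single 32-bit operand on wave64
// (illustrative only, register names are placeholders):
//   loop:
//     s_val = V_READFIRSTLANE_B32 v_op          ; pick the value of one lane
//     cond  = V_CMP_EQ_U32 s_val, v_op          ; lanes holding the same value
//     saved = S_AND_SAVEEXEC_B64 cond           ; run only the matching lanes
//     <original instruction, now using s_val>
//     exec  = S_XOR_B64 exec, saved             ; drop the lanes just handled
//     SI_WATERFALL_LOOP loop                    ; repeat while any lane remains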
7072static void emitLoadScalarOpsFromVGPRLoop(
7073 const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB,
7074 MachineBasicBlock &BodyBB, const DebugLoc &DL,
7075 ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
7076 MachineFunction &MF = *LoopBB.getParent();
7077 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7078 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7080 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7081
7083 Register CondReg;
7084 for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
7085 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
7086 unsigned NumSubRegs = RegSize / 32;
7087 Register VScalarOp = ScalarOp->getReg();
7088
7089 if (NumSubRegs == 1) {
7090 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7091
7092 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
7093 .addReg(VScalarOp);
7094
7095 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7096
7097 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
7098 .addReg(CurReg)
7099 .addReg(VScalarOp);
7100
7101 // Combine the comparison results with AND.
7102 if (!CondReg) // First.
7103 CondReg = NewCondReg;
7104 else { // If not the first, we create an AND.
7105 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7106 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7107 .addReg(CondReg)
7108 .addReg(NewCondReg);
7109 CondReg = AndReg;
7110 }
7111
7112 // Update ScalarOp operand to use the SGPR ScalarOp.
7113 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7114 ScalarOp->setReg(CurReg);
7115 else {
7116 // Insert into the same block of use
7117 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7118 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7119 .addReg(CurReg);
7120 ScalarOp->setReg(PhySGPRs[Idx]);
7121 }
7122 ScalarOp->setIsKill();
7123 } else {
7124 SmallVector<Register, 8> ReadlanePieces;
7125 RegState VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
7126 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
7127 "Unhandled register size");
7128
7129 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
7130 Register CurRegLo =
7131 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7132 Register CurRegHi =
7133 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7134
7135 // Read the next variant <- also loop target.
7136 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
7137 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
7138
7139 // Read the next variant <- also loop target.
7140 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
7141 .addReg(VScalarOp, VScalarOpUndef,
7142 TRI->getSubRegFromChannel(Idx + 1));
7143
7144 ReadlanePieces.push_back(CurRegLo);
7145 ReadlanePieces.push_back(CurRegHi);
7146
7147 // Comparison is to be done as 64-bit.
7148 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
7149 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
7150 .addReg(CurRegLo)
7151 .addImm(AMDGPU::sub0)
7152 .addReg(CurRegHi)
7153 .addImm(AMDGPU::sub1);
7154
7155 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
7156 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
7157 NewCondReg)
7158 .addReg(CurReg);
7159 if (NumSubRegs <= 2)
7160 Cmp.addReg(VScalarOp);
7161 else
7162 Cmp.addReg(VScalarOp, VScalarOpUndef,
7163 TRI->getSubRegFromChannel(Idx, 2));
7164
7165 // Combine the comparison results with AND.
7166 if (!CondReg) // First.
7167 CondReg = NewCondReg;
7168 else { // If not the first, we create an AND.
7169 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
7170 BuildMI(LoopBB, I, DL, TII.get(LMC.AndOpc), AndReg)
7171 .addReg(CondReg)
7172 .addReg(NewCondReg);
7173 CondReg = AndReg;
7174 }
7175 } // End for loop.
7176
7177 const auto *SScalarOpRC =
7178 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
7179 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
7180
7181 // Build scalar ScalarOp.
7182 auto Merge =
7183 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
7184 unsigned Channel = 0;
7185 for (Register Piece : ReadlanePieces) {
7186 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
7187 }
7188
7189 // Update ScalarOp operand to use the SGPR ScalarOp.
7190 if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
7191 ScalarOp->setReg(SScalarOp);
7192 else {
7193 BuildMI(*ScalarOp->getParent()->getParent(), ScalarOp->getParent(), DL,
7194 TII.get(AMDGPU::COPY), PhySGPRs[Idx])
7195 .addReg(SScalarOp);
7196 ScalarOp->setReg(PhySGPRs[Idx]);
7197 }
7198 ScalarOp->setIsKill();
7199 }
7200 }
7201
7202 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7203 MRI.setSimpleHint(SaveExec, CondReg);
7204
7205 // Update EXEC to matching lanes, saving original to SaveExec.
7206 BuildMI(LoopBB, I, DL, TII.get(LMC.AndSaveExecOpc), SaveExec)
7207 .addReg(CondReg, RegState::Kill);
7208
7209 // The original instruction is here; we insert the terminators after it.
7210 I = BodyBB.end();
7211
7212 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
7213 BuildMI(BodyBB, I, DL, TII.get(LMC.XorTermOpc), LMC.ExecReg)
7214 .addReg(LMC.ExecReg)
7215 .addReg(SaveExec);
7216
7217 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
7218}
7219
7220// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
7221// with SGPRs by iterating over all unique values across all lanes.
7222// Returns the loop basic block that now contains \p MI.
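// The control flow created around \p MI (a rough sketch):
//
//   MBB ---> LoopBB ---> BodyBB ---> RemainderBB
//              ^            |
//              +------------+   (loop back while some lanes are left)
//
// MBB keeps everything before \p Begin, BodyBB receives [Begin, End), and
// RemainderBB receives the rest of the original block.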
7223static MachineBasicBlock *
7224generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI,
7225 ArrayRef<MachineOperand *> ScalarOps,
7226 MachineDominatorTree *MDT,
7227 MachineBasicBlock::iterator Begin = nullptr,
7228 MachineBasicBlock::iterator End = nullptr,
7229 ArrayRef<Register> PhySGPRs = {}) {
7230 assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
7231 "Physical SGPRs must be empty or match the number of scalar operands");
7232 MachineBasicBlock &MBB = *MI.getParent();
7233 MachineFunction &MF = *MBB.getParent();
7234 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7235 const SIRegisterInfo *TRI = ST.getRegisterInfo();
7236 MachineRegisterInfo &MRI = MF.getRegInfo();
7237 if (!Begin.isValid())
7238 Begin = &MI;
7239 if (!End.isValid()) {
7240 End = &MI;
7241 ++End;
7242 }
7243 const DebugLoc &DL = MI.getDebugLoc();
7245 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
7246
7247 // Save SCC. Waterfall Loop may overwrite SCC.
7248 Register SaveSCCReg;
7249
7250 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
7251 // rather than doing an unlimited scan everywhere.
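 // The save below materializes SCC into an SGPR (S_CSELECT_B32 1, 0 writes 1
 // exactly when SCC is set); SCC is re-derived after the loop with
 // S_CMP_LG_U32 against 0.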
7252 bool SCCNotDead =
7253 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
7254 std::numeric_limits<unsigned>::max()) !=
7255 MachineBasicBlock::LQR_Dead;
7256 if (SCCNotDead) {
7257 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7258 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
7259 .addImm(1)
7260 .addImm(0);
7261 }
7262
7263 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
7264
7265 // Save the EXEC mask
7266 BuildMI(MBB, Begin, DL, TII.get(LMC.MovOpc), SaveExec).addReg(LMC.ExecReg);
7267
7268 // Killed uses in the instruction we are waterfalling around will be
7269 // incorrect due to the added control-flow.
7270 MachineBasicBlock::iterator AfterMI = MI;
7271 ++AfterMI;
7272 for (auto I = Begin; I != AfterMI; I++) {
7273 for (auto &MO : I->all_uses())
7274 MRI.clearKillFlags(MO.getReg());
7275 }
7276
7277 // To insert the loop we need to split the block. Move everything after this
7278 // point to a new block, and insert a new empty block between the two.
7279 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
7280 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
7281 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
7282 MachineFunction::iterator MBBI(MBB);
7283 ++MBBI;
7284
7285 MF.insert(MBBI, LoopBB);
7286 MF.insert(MBBI, BodyBB);
7287 MF.insert(MBBI, RemainderBB);
7288
7289 LoopBB->addSuccessor(BodyBB);
7290 BodyBB->addSuccessor(LoopBB);
7291 BodyBB->addSuccessor(RemainderBB);
7292
7293 // Move the range from Begin to MI into BodyBB, and the remainder of the
7294 // block into RemainderBB.
7295 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
7296 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
7297 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
7298
7299 MBB.addSuccessor(LoopBB);
7300
7301 // Update dominators. We know that MBB immediately dominates LoopBB, that
7302 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
7303 // RemainderBB. RemainderBB immediately dominates all of the successors
7304 // transferred to it from MBB that MBB used to properly dominate.
7305 if (MDT) {
7306 MDT->addNewBlock(LoopBB, &MBB);
7307 MDT->addNewBlock(BodyBB, LoopBB);
7308 MDT->addNewBlock(RemainderBB, BodyBB);
7309 for (auto &Succ : RemainderBB->successors()) {
7310 if (MDT->properlyDominates(&MBB, Succ)) {
7311 MDT->changeImmediateDominator(Succ, RemainderBB);
7312 }
7313 }
7314 }
7315
7316 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
7317 PhySGPRs);
7318
7319 MachineBasicBlock::iterator First = RemainderBB->begin();
7320 // Restore SCC
7321 if (SCCNotDead) {
7322 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
7323 .addReg(SaveSCCReg, RegState::Kill)
7324 .addImm(0);
7325 }
7326
7327 // Restore the EXEC mask
7328 BuildMI(*RemainderBB, First, DL, TII.get(LMC.MovOpc), LMC.ExecReg)
7329 .addReg(SaveExec);
7330 return BodyBB;
7331}
7332
7333// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
7334static std::tuple<unsigned, unsigned>
7336 MachineBasicBlock &MBB = *MI.getParent();
7337 MachineFunction &MF = *MBB.getParent();
7338 MachineRegisterInfo &MRI = MF.getRegInfo();
7339
7340 // Extract the ptr from the resource descriptor.
7341 unsigned RsrcPtr =
7342 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
7343 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
7344
7345 // Create an empty resource descriptor
7346 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
7347 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7348 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
7349 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
7350 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
7351
7352 // Zero64 = 0
7353 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
7354 .addImm(0);
7355
7356 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
7357 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
7358 .addImm(Lo_32(RsrcDataFormat));
7359
7360 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
7361 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
7362 .addImm(Hi_32(RsrcDataFormat));
7363
7364 // NewSRsrc = {Zero64, SRsrcFormat}
7365 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
7366 .addReg(Zero64)
7367 .addImm(AMDGPU::sub0_sub1)
7368 .addReg(SRsrcFormatLo)
7369 .addImm(AMDGPU::sub2)
7370 .addReg(SRsrcFormatHi)
7371 .addImm(AMDGPU::sub3);
7372
7373 return std::tuple(RsrcPtr, NewSRsrc);
7374}
7375
7378 MachineDominatorTree *MDT) const {
7379 MachineFunction &MF = *MI.getMF();
7380 MachineRegisterInfo &MRI = MF.getRegInfo();
7381 MachineBasicBlock *CreatedBB = nullptr;
7382
7383 // Legalize VOP2
7384 if (isVOP2(MI) || isVOPC(MI)) {
7385 legalizeOperandsVOP2(MRI, MI);
7386 return CreatedBB;
7387 }
7388
7389 // Legalize VOP3
7390 if (isVOP3(MI)) {
7391 legalizeOperandsVOP3(MRI, MI);
7392 return CreatedBB;
7393 }
7394
7395 // Legalize SMRD
7396 if (isSMRD(MI)) {
7397 legalizeOperandsSMRD(MRI, MI);
7398 return CreatedBB;
7399 }
7400
7401 // Legalize FLAT
7402 if (isFLAT(MI)) {
7403 legalizeOperandsFLAT(MRI, MI);
7404 return CreatedBB;
7405 }
7406
7407 // Legalize PHI
7408 // The register class of the operands must be the same type as the register
7409 // class of the output.
7410 if (MI.getOpcode() == AMDGPU::PHI) {
7411 const TargetRegisterClass *VRC = getOpRegClass(MI, 0);
7412 assert(!RI.isSGPRClass(VRC));
7413
7414 // Update all the operands so they have the same type.
7415 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7416 MachineOperand &Op = MI.getOperand(I);
7417 if (!Op.isReg() || !Op.getReg().isVirtual())
7418 continue;
7419
7420 // MI is a PHI instruction.
7421 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
7422 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
7423
7424 // Avoid creating no-op copies with the same src and dst reg class. These
7425 // confuse some of the machine passes.
7426 legalizeGenericOperand(*InsertBB, Insert, VRC, Op, MRI, MI.getDebugLoc());
7427 }
7428 }
7429
7430 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
7431 // VGPR dest type and SGPR sources, insert copies so all operands are
7432 // VGPRs. This seems to help operand folding / the register coalescer.
7433 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
7434 MachineBasicBlock *MBB = MI.getParent();
7435 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
7436 if (RI.hasVGPRs(DstRC)) {
7437 // Update all the operands so they have VGPR register classes. These may
7438 // not be the same register class because REG_SEQUENCE supports mixing
7439 // subregister index types e.g. sub0_sub1 + sub2 + sub3
7440 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
7441 MachineOperand &Op = MI.getOperand(I);
7442 if (!Op.isReg() || !Op.getReg().isVirtual())
7443 continue;
7444
7445 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
7446 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
7447 if (VRC == OpRC)
7448 continue;
7449
7450 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
7451 Op.setIsKill();
7452 }
7453 }
7454
7455 return CreatedBB;
7456 }
7457
7458 // Legalize INSERT_SUBREG
7459 // src0 must have the same register class as dst
7460 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
7461 Register Dst = MI.getOperand(0).getReg();
7462 Register Src0 = MI.getOperand(1).getReg();
7463 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
7464 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
7465 if (DstRC != Src0RC) {
7466 MachineBasicBlock *MBB = MI.getParent();
7467 MachineOperand &Op = MI.getOperand(1);
7468 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
7469 }
7470 return CreatedBB;
7471 }
7472
7473 // Legalize SI_INIT_M0
7474 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
7475 MachineOperand &Src = MI.getOperand(0);
7476 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7477 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7478 return CreatedBB;
7479 }
7480
7481 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
7482 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
7483 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
7484 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
7485 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
7486 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
7487 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
7488 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
7489 MachineOperand &Src = MI.getOperand(1);
7490 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7491 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7492 return CreatedBB;
7493 }
7494
7495 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
7496 //
7497 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
7498 // scratch memory access. In both cases, the legalization never involves
7499 // conversion to the addr64 form.
7501 (isMUBUF(MI) || isMTBUF(MI)))) {
7502 AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI))
7503 ? AMDGPU::OpName::rsrc
7504 : AMDGPU::OpName::srsrc;
7505 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
7506 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
7507 CreatedBB = generateWaterFallLoop(*this, MI, {SRsrc}, MDT);
7508
7509 AMDGPU::OpName SampOpName =
7510 isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
7511 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
7512 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
7513 CreatedBB = generateWaterFallLoop(*this, MI, {SSamp}, MDT);
7514
7515 return CreatedBB;
7516 }
7517
7518 // Legalize SI_CALL
7519 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
7520 MachineOperand *Dest = &MI.getOperand(0);
7521 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
7522 createWaterFallForSiCall(&MI, MDT, {Dest});
7523 }
7524 }
7525
7526 // Legalize s_sleep_var.
7527 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
7528 const DebugLoc &DL = MI.getDebugLoc();
7529 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7530 int Src0Idx =
7531 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
7532 MachineOperand &Src0 = MI.getOperand(Src0Idx);
7533 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
7534 .add(Src0);
7535 Src0.ChangeToRegister(Reg, false);
7536 return nullptr;
7537 }
7538
7539 // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
7540 // operands are scalar.
7541 if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
7542 MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
7543 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
7544 MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
7545 for (MachineOperand &Src : MI.explicit_operands()) {
7546 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
7547 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
7548 }
7549 return CreatedBB;
7550 }
7551
7552 // Legalize MUBUF instructions.
7553 bool isSoffsetLegal = true;
7554 int SoffsetIdx =
7555 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
7556 if (SoffsetIdx != -1) {
7557 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
7558 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
7559 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
7560 isSoffsetLegal = false;
7561 }
7562 }
7563
7564 bool isRsrcLegal = true;
7565 int RsrcIdx =
7566 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
7567 if (RsrcIdx != -1) {
7568 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7569 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
7570 isRsrcLegal = false;
7571 }
7572
7573 // The operands are legal.
7574 if (isRsrcLegal && isSoffsetLegal)
7575 return CreatedBB;
7576
7577 if (!isRsrcLegal) {
7578 // Legalize a VGPR Rsrc
7579 //
7580 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
7581 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
7582 // a zero-value SRsrc.
7583 //
7584 // If the instruction is _OFFSET (both idxen and offen disabled), and we
7585 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
7586 // above.
7587 //
7588 // Otherwise we are on non-ADDR64 hardware, and/or we have
7589 // idxen/offen/bothen and we fall back to a waterfall loop.
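 // A rough sketch of the ADDR64 rewrite performed below (illustrative only):
 //   new_vaddr = vaddr + rsrc[63:0]        ; 64-bit add of the descriptor base
 //   new_srsrc = { 0, RSRC_DATA_FORMAT }   ; zero base, default format bits
 // so the buffer access goes through the 64-bit VGPR address instead of the
 // base pointer held in the resource descriptor.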
7590
7591 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
7592 MachineBasicBlock &MBB = *MI.getParent();
7593
7594 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7595 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
7596 // This is already an ADDR64 instruction so we need to add the pointer
7597 // extracted from the resource descriptor to the current value of VAddr.
7598 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7599 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7600 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7601
7602 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
7603 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
7604 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
7605
7606 unsigned RsrcPtr, NewSRsrc;
7607 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7608
7609 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7610 const DebugLoc &DL = MI.getDebugLoc();
7611 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7612 .addDef(CondReg0)
7613 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7614 .addReg(VAddr->getReg(), {}, AMDGPU::sub0)
7615 .addImm(0);
7616
7617 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7618 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7619 .addDef(CondReg1, RegState::Dead)
7620 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7621 .addReg(VAddr->getReg(), {}, AMDGPU::sub1)
7622 .addReg(CondReg0, RegState::Kill)
7623 .addImm(0);
7624
7625 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7626 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7627 .addReg(NewVAddrLo)
7628 .addImm(AMDGPU::sub0)
7629 .addReg(NewVAddrHi)
7630 .addImm(AMDGPU::sub1);
7631
7632 VAddr->setReg(NewVAddr);
7633 Rsrc->setReg(NewSRsrc);
7634 } else if (!VAddr && ST.hasAddr64()) {
7635 // This instruction is the _OFFSET variant, so we need to convert it to
7636 // ADDR64.
7637 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7638 "FIXME: Need to emit flat atomics here");
7639
7640 unsigned RsrcPtr, NewSRsrc;
7641 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7642
7643 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7644 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7645 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7646 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7647 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7648
7649 // Atomics with return have an additional tied operand and are
7650 // missing some of the special bits.
7651 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7652 MachineInstr *Addr64;
7653
7654 if (!VDataIn) {
7655 // Regular buffer load / store.
7656 MachineInstrBuilder MIB =
7657 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7658 .add(*VData)
7659 .addReg(NewVAddr)
7660 .addReg(NewSRsrc)
7661 .add(*SOffset)
7662 .add(*Offset);
7663
7664 if (const MachineOperand *CPol =
7665 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7666 MIB.addImm(CPol->getImm());
7667 }
7668
7669 if (const MachineOperand *TFE =
7670 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7671 MIB.addImm(TFE->getImm());
7672 }
7673
7674 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7675
7676 MIB.cloneMemRefs(MI);
7677 Addr64 = MIB;
7678 } else {
7679 // Atomics with return.
7680 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7681 .add(*VData)
7682 .add(*VDataIn)
7683 .addReg(NewVAddr)
7684 .addReg(NewSRsrc)
7685 .add(*SOffset)
7686 .add(*Offset)
7687 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7688 .cloneMemRefs(MI);
7689 }
7690
7691 MI.removeFromParent();
7692
7693 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7694 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7695 NewVAddr)
7696 .addReg(RsrcPtr, {}, AMDGPU::sub0)
7697 .addImm(AMDGPU::sub0)
7698 .addReg(RsrcPtr, {}, AMDGPU::sub1)
7699 .addImm(AMDGPU::sub1);
7700 } else {
7701 // Legalize a VGPR Rsrc and soffset together.
7702 if (!isSoffsetLegal) {
7703 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7704 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc, Soffset}, MDT);
7705 return CreatedBB;
7706 }
7707 CreatedBB = generateWaterFallLoop(*this, MI, {Rsrc}, MDT);
7708 return CreatedBB;
7709 }
7710 }
7711
7712 // Legalize a VGPR soffset.
7713 if (!isSoffsetLegal) {
7714 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7715 CreatedBB = generateWaterFallLoop(*this, MI, {Soffset}, MDT);
7716 return CreatedBB;
7717 }
7718 return CreatedBB;
7719}
7720
7722 InstrList.insert(MI);
7723 // Add MBUF instructions to the deferred list.
7724 int RsrcIdx =
7725 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7726 if (RsrcIdx != -1) {
7727 DeferredList.insert(MI);
7728 }
7729}
7730
7732 return DeferredList.contains(MI);
7733}
7734
7735// Legalize size mismatches between 16-bit and 32-bit registers in v2s copy
7736// lowering (changing an SGPR to a VGPR).
7737// This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
7738// different sizes. The operand sizes need to be legalized during the VGPR
7739// lowering chain. This can be removed once sgpr16 is in place.
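// For example (illustrative): a VGPR_16 value feeding an operand whose
// declared class is 32-bit gets widened with an undef high half,
//   %wide:vgpr_32 = REG_SEQUENCE %val:vgpr_16, lo16, %undef:vgpr_16, hi16
// while a 32-bit VGPR feeding a 16-bit operand simply takes a lo16
// subregister index.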
7740void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
7741 MachineRegisterInfo &MRI) const {
7742 if (!ST.useRealTrue16Insts())
7743 return;
7744
7745 unsigned Opcode = MI.getOpcode();
7746 MachineBasicBlock *MBB = MI.getParent();
7747 // Legalize operands and check for size mismatch
7748 if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7749 OpIdx >= get(Opcode).getNumOperands() ||
7750 get(Opcode).operands()[OpIdx].RegClass == -1)
7751 return;
7752
7753 MachineOperand &Op = MI.getOperand(OpIdx);
7754 if (!Op.isReg() || !Op.getReg().isVirtual())
7755 return;
7756
7757 const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7758 if (!RI.isVGPRClass(CurrRC))
7759 return;
7760
7761 int16_t RCID = getOpRegClassID(get(Opcode).operands()[OpIdx]);
7762 const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7763 if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7764 Op.setSubReg(AMDGPU::lo16);
7765 } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7766 const DebugLoc &DL = MI.getDebugLoc();
7767 Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7768 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7769 BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7770 BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7771 .addReg(Op.getReg())
7772 .addImm(AMDGPU::lo16)
7773 .addReg(Undef)
7774 .addImm(AMDGPU::hi16);
7775 Op.setReg(NewDstReg);
7776 }
7777}
7778void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7779 MachineRegisterInfo &MRI) const {
7780 for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7781 legalizeOperandsVALUt16(MI, OpIdx, MRI);
7782}
7783
7784void SIInstrInfo::createWaterFallForSiCall(
7785 MachineInstr *MI, MachineDominatorTree *MDT,
7786 ArrayRef<MachineOperand *> ScalarOps,
7787 ArrayRef<Register> PhySGPRs) const {
7788 assert(MI->getOpcode() == AMDGPU::SI_CALL_ISEL &&
7789 "This only handles waterfall for SI_CALL_ISEL");
7790 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along with
7791 // the copies that follow, into the loop block; copies from and to physical
7792 // registers need to move into the loop as well, including the copies that
7793 // set up arguments in physical registers and read back the return value.
7794 MachineBasicBlock &MBB = *MI->getParent();
7795 MachineBasicBlock::iterator Start(MI);
7796 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
7797 --Start;
7798 MachineBasicBlock::iterator End(MI);
7799 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
7800 ++End;
7801
7802 // Also include following copies of the return value
7803 ++End;
7804 while (End != MBB.end() && End->isCopy() &&
7805 MI->definesRegister(End->getOperand(1).getReg(), &RI))
7806 ++End;
7807
7808 generateWaterFallLoop(*this, *MI, ScalarOps, MDT, Start, End, PhySGPRs);
7809}
7810
7811void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7812 MachineDominatorTree *MDT) const {
7813 DenseMap<MachineInstr *, V2PhysSCopyInfo> WaterFalls;
7814 DenseMap<MachineInstr *, bool> V2SPhyCopiesToErase;
7815 while (!Worklist.empty()) {
7816 MachineInstr &Inst = *Worklist.top();
7817 Worklist.erase_top();
7818 // Skip MachineInstr in the deferred list.
7819 if (Worklist.isDeferred(&Inst))
7820 continue;
7821 moveToVALUImpl(Worklist, MDT, Inst, WaterFalls, V2SPhyCopiesToErase);
7822 }
7823
7824 // The deferred list of instructions will be processed once
7825 // all the MachineInstrs in the worklist are done.
7826 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7827 moveToVALUImpl(Worklist, MDT, *Inst, WaterFalls, V2SPhyCopiesToErase);
7828 assert(Worklist.empty() &&
7829 "Deferred MachineInstr are not supposed to re-populate worklist");
7830 }
7831
7832 for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : WaterFalls) {
7833 if (Entry.first->getOpcode() == AMDGPU::SI_CALL_ISEL)
7834 createWaterFallForSiCall(Entry.first, MDT, Entry.second.MOs,
7835 Entry.second.SGPRs);
7836 }
7837
7838 for (std::pair<MachineInstr *, bool> Entry : V2SPhyCopiesToErase)
7839 if (Entry.second)
7840 Entry.first->eraseFromParent();
7841}
7842void SIInstrInfo::createReadFirstLaneFromCopyToPhysReg(
7843 MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
7844 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7845 // hope for the best.
7846 const TargetRegisterClass *DstRC = RI.getRegClassForReg(MRI, DstReg);
7847 ArrayRef<int16_t> SubRegIndices = RI.getRegSplitParts(DstRC, 4);
7848 if (SubRegIndices.size() <= 1) {
7849 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7850 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7851 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7852 .add(Inst.getOperand(1));
7853 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
7854 DstReg)
7855 .addReg(NewDst);
7856 } else {
7857 SmallVector<Register, 8> DstRegs;
7858 for (int16_t Indice : SubRegIndices) {
7859 Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7860 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7861 get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
7862 .addReg(Inst.getOperand(1).getReg(), {}, Indice);
7863
7864 DstRegs.push_back(NewDst);
7865 }
7866 MachineInstrBuilder MIB =
7867 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7868 get(AMDGPU::REG_SEQUENCE), DstReg);
7869 for (unsigned i = 0; i < SubRegIndices.size(); ++i) {
7870 MIB.addReg(DstRegs[i]);
7871 MIB.addImm(RI.getSubRegFromChannel(i));
7872 }
7873 }
7874}
7875
7876void SIInstrInfo::handleCopyToPhysHelper(
7877 SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst,
7878 MachineRegisterInfo &MRI,
7879 DenseMap<MachineInstr *, V2PhysSCopyInfo> &WaterFalls,
7880 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7881 if (DstReg == AMDGPU::M0) {
7882 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7883 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7884 return;
7885 }
7886 Register SrcReg = Inst.getOperand(1).getReg();
7890 // Only search the current block, since a physical register's def and use
7891 // cannot cross blocks when MF.NoPhi = false.
7891 while (++I != E) {
7892 // For SI_CALL_ISEL users, replace the phys SGPR with the VGPR source
7893 // and record the operand for later waterfall loop generation.
7894 if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
7895 MachineInstr *UseMI = &*I;
7896 for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
7897 if (UseMI->getOperand(i).isReg() &&
7898 UseMI->getOperand(i).getReg() == DstReg) {
7899 MachineOperand *MO = &UseMI->getOperand(i);
7900 MO->setReg(SrcReg);
7901 V2PhysSCopyInfo &V2SCopyInfo = WaterFalls[UseMI];
7902 V2SCopyInfo.MOs.push_back(MO);
7903 V2SCopyInfo.SGPRs.push_back(DstReg);
7904 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7905 }
7906 }
7907 } else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
7908 I->getOperand(0).isReg() &&
7909 I->getOperand(0).getReg() == DstReg) {
7910 createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
7911 V2SPhyCopiesToErase.try_emplace(&Inst, true);
7912 } else if (I->readsRegister(DstReg, &RI)) {
7913 // The COPY cannot be erased if another kind of instruction uses it.
7914 V2SPhyCopiesToErase[&Inst] = false;
7915 }
7916 if (I->findRegisterDefOperand(DstReg, &RI))
7917 break;
7918 }
7919}
7920
7921void SIInstrInfo::moveToVALUImpl(
7922 SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst,
7923 DenseMap<MachineInstr *, V2PhysSCopyInfo> &WaterFalls,
7924 DenseMap<MachineInstr *, bool> &V2SPhyCopiesToErase) const {
7925
7926 MachineBasicBlock *MBB = Inst.getParent();
7927 if (!MBB)
7928 return;
7929 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7930 unsigned Opcode = Inst.getOpcode();
7931 unsigned NewOpcode = getVALUOp(Inst);
7932 const DebugLoc &DL = Inst.getDebugLoc();
7933
7934 // Handle some special cases
7935 switch (Opcode) {
7936 default:
7937 break;
7938 case AMDGPU::S_ADD_I32:
7939 case AMDGPU::S_SUB_I32: {
7940 // FIXME: The u32 versions currently selected use the carry.
7941 bool Changed;
7942 MachineBasicBlock *CreatedBBTmp = nullptr;
7943 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7944 if (Changed)
7945 return;
7946
7947 // Default handling
7948 break;
7949 }
7950
7951 case AMDGPU::S_MUL_U64:
7952 if (ST.hasVectorMulU64()) {
7953 NewOpcode = AMDGPU::V_MUL_U64_e64;
7954 break;
7955 }
7956 // Split s_mul_u64 in 32-bit vector multiplications.
7957 splitScalarSMulU64(Worklist, Inst, MDT);
7958 Inst.eraseFromParent();
7959 return;
7960
7961 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7962 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7963 // This is a special case of s_mul_u64 where all the operands are either
7964 // zero extended or sign extended.
7965 splitScalarSMulPseudo(Worklist, Inst, MDT);
7966 Inst.eraseFromParent();
7967 return;
7968
7969 case AMDGPU::S_AND_B64:
7970 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7971 Inst.eraseFromParent();
7972 return;
7973
7974 case AMDGPU::S_OR_B64:
7975 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7976 Inst.eraseFromParent();
7977 return;
7978
7979 case AMDGPU::S_XOR_B64:
7980 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7981 Inst.eraseFromParent();
7982 return;
7983
7984 case AMDGPU::S_NAND_B64:
7985 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7986 Inst.eraseFromParent();
7987 return;
7988
7989 case AMDGPU::S_NOR_B64:
7990 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7991 Inst.eraseFromParent();
7992 return;
7993
7994 case AMDGPU::S_XNOR_B64:
7995 if (ST.hasDLInsts())
7996 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7997 else
7998 splitScalar64BitXnor(Worklist, Inst, MDT);
7999 Inst.eraseFromParent();
8000 return;
8001
8002 case AMDGPU::S_ANDN2_B64:
8003 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
8004 Inst.eraseFromParent();
8005 return;
8006
8007 case AMDGPU::S_ORN2_B64:
8008 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
8009 Inst.eraseFromParent();
8010 return;
8011
8012 case AMDGPU::S_BREV_B64:
8013 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
8014 Inst.eraseFromParent();
8015 return;
8016
8017 case AMDGPU::S_NOT_B64:
8018 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
8019 Inst.eraseFromParent();
8020 return;
8021
8022 case AMDGPU::S_BCNT1_I32_B64:
8023 splitScalar64BitBCNT(Worklist, Inst);
8024 Inst.eraseFromParent();
8025 return;
8026
8027 case AMDGPU::S_BFE_I64:
8028 splitScalar64BitBFE(Worklist, Inst);
8029 Inst.eraseFromParent();
8030 return;
8031
8032 case AMDGPU::S_FLBIT_I32_B64:
8033 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
8034 Inst.eraseFromParent();
8035 return;
8036 case AMDGPU::S_FF1_I32_B64:
8037 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
8038 Inst.eraseFromParent();
8039 return;
8040
8041 case AMDGPU::S_LSHL_B32:
8042 if (ST.hasOnlyRevVALUShifts()) {
8043 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
8044 swapOperands(Inst);
8045 }
8046 break;
8047 case AMDGPU::S_ASHR_I32:
8048 if (ST.hasOnlyRevVALUShifts()) {
8049 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
8050 swapOperands(Inst);
8051 }
8052 break;
8053 case AMDGPU::S_LSHR_B32:
8054 if (ST.hasOnlyRevVALUShifts()) {
8055 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
8056 swapOperands(Inst);
8057 }
8058 break;
8059 case AMDGPU::S_LSHL_B64:
8060 if (ST.hasOnlyRevVALUShifts()) {
8061 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
8062 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
8063 : AMDGPU::V_LSHLREV_B64_e64;
8064 swapOperands(Inst);
8065 }
8066 break;
8067 case AMDGPU::S_ASHR_I64:
8068 if (ST.hasOnlyRevVALUShifts()) {
8069 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
8070 swapOperands(Inst);
8071 }
8072 break;
8073 case AMDGPU::S_LSHR_B64:
8074 if (ST.hasOnlyRevVALUShifts()) {
8075 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
8076 swapOperands(Inst);
8077 }
8078 break;
8079
8080 case AMDGPU::S_ABS_I32:
8081 lowerScalarAbs(Worklist, Inst);
8082 Inst.eraseFromParent();
8083 return;
8084
8085 case AMDGPU::S_ABSDIFF_I32:
8086 lowerScalarAbsDiff(Worklist, Inst);
8087 Inst.eraseFromParent();
8088 return;
8089
8090 case AMDGPU::S_CBRANCH_SCC0:
8091 case AMDGPU::S_CBRANCH_SCC1: {
8092 // Clear unused bits of vcc
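 // (A branch on VCC tests every bit of the mask, so bits belonging to
 // currently inactive lanes have to be masked off with EXEC first.)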
8093 Register CondReg = Inst.getOperand(1).getReg();
8094 bool IsSCC = CondReg == AMDGPU::SCC;
8096 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(LMC.AndOpc), LMC.VccReg)
8097 .addReg(LMC.ExecReg)
8098 .addReg(IsSCC ? LMC.VccReg : CondReg);
8099 Inst.removeOperand(1);
8100 } break;
8101
8102 case AMDGPU::S_BFE_U64:
8103 case AMDGPU::S_BFM_B64:
8104 llvm_unreachable("Moving this op to VALU not implemented");
8105
8106 case AMDGPU::S_PACK_LL_B32_B16:
8107 case AMDGPU::S_PACK_LH_B32_B16:
8108 case AMDGPU::S_PACK_HL_B32_B16:
8109 case AMDGPU::S_PACK_HH_B32_B16:
8110 movePackToVALU(Worklist, MRI, Inst);
8111 Inst.eraseFromParent();
8112 return;
8113
8114 case AMDGPU::S_XNOR_B32:
8115 lowerScalarXnor(Worklist, Inst);
8116 Inst.eraseFromParent();
8117 return;
8118
8119 case AMDGPU::S_NAND_B32:
8120 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
8121 Inst.eraseFromParent();
8122 return;
8123
8124 case AMDGPU::S_NOR_B32:
8125 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
8126 Inst.eraseFromParent();
8127 return;
8128
8129 case AMDGPU::S_ANDN2_B32:
8130 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
8131 Inst.eraseFromParent();
8132 return;
8133
8134 case AMDGPU::S_ORN2_B32:
8135 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
8136 Inst.eraseFromParent();
8137 return;
8138
8139 // TODO: remove as soon as everything is ready
8140 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
8141 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
8142 // can only be selected from the uniform SDNode.
8143 case AMDGPU::S_ADD_CO_PSEUDO:
8144 case AMDGPU::S_SUB_CO_PSEUDO: {
8145 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
8146 ? AMDGPU::V_ADDC_U32_e64
8147 : AMDGPU::V_SUBB_U32_e64;
8148 const auto *CarryRC = RI.getWaveMaskRegClass();
8149
8150 Register CarryInReg = Inst.getOperand(4).getReg();
8151 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
8152 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
8153 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
8154 .addReg(CarryInReg);
8155 }
8156
8157 Register CarryOutReg = Inst.getOperand(1).getReg();
8158
8159 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
8160 MRI.getRegClass(Inst.getOperand(0).getReg())));
8161 MachineInstr *CarryOp =
8162 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
8163 .addReg(CarryOutReg, RegState::Define)
8164 .add(Inst.getOperand(2))
8165 .add(Inst.getOperand(3))
8166 .addReg(CarryInReg)
8167 .addImm(0);
8168 legalizeOperands(*CarryOp);
8169 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
8170 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8171 Inst.eraseFromParent();
8172 }
8173 return;
8174 case AMDGPU::S_UADDO_PSEUDO:
8175 case AMDGPU::S_USUBO_PSEUDO: {
8176 MachineOperand &Dest0 = Inst.getOperand(0);
8177 MachineOperand &Dest1 = Inst.getOperand(1);
8178 MachineOperand &Src0 = Inst.getOperand(2);
8179 MachineOperand &Src1 = Inst.getOperand(3);
8180
8181 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
8182 ? AMDGPU::V_ADD_CO_U32_e64
8183 : AMDGPU::V_SUB_CO_U32_e64;
8184 const TargetRegisterClass *NewRC =
8185 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
8186 Register DestReg = MRI.createVirtualRegister(NewRC);
8187 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
8188 .addReg(Dest1.getReg(), RegState::Define)
8189 .add(Src0)
8190 .add(Src1)
8191 .addImm(0); // clamp bit
8192
8193 legalizeOperands(*NewInstr, MDT);
8194 MRI.replaceRegWith(Dest0.getReg(), DestReg);
8195 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8196 Inst.eraseFromParent();
8197 }
8198 return;
8199 case AMDGPU::S_LSHL1_ADD_U32:
8200 case AMDGPU::S_LSHL2_ADD_U32:
8201 case AMDGPU::S_LSHL3_ADD_U32:
8202 case AMDGPU::S_LSHL4_ADD_U32: {
8203 MachineOperand &Dest = Inst.getOperand(0);
8204 MachineOperand &Src0 = Inst.getOperand(1);
8205 MachineOperand &Src1 = Inst.getOperand(2);
8206 unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1
8207 : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2
8208 : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3
8209 : 4);
8210
8211 const TargetRegisterClass *NewRC =
8212 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()));
8213 Register DestReg = MRI.createVirtualRegister(NewRC);
8214 MachineInstr *NewInstr =
8215 BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg)
8216 .add(Src0)
8217 .addImm(ShiftAmt)
8218 .add(Src1);
8219
8220 legalizeOperands(*NewInstr, MDT);
8221 MRI.replaceRegWith(Dest.getReg(), DestReg);
8222 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
8223 Inst.eraseFromParent();
8224 }
8225 return;
8226 case AMDGPU::S_CSELECT_B32:
8227 case AMDGPU::S_CSELECT_B64:
8228 lowerSelect(Worklist, Inst, MDT);
8229 Inst.eraseFromParent();
8230 return;
8231 case AMDGPU::S_CMP_EQ_I32:
8232 case AMDGPU::S_CMP_LG_I32:
8233 case AMDGPU::S_CMP_GT_I32:
8234 case AMDGPU::S_CMP_GE_I32:
8235 case AMDGPU::S_CMP_LT_I32:
8236 case AMDGPU::S_CMP_LE_I32:
8237 case AMDGPU::S_CMP_EQ_U32:
8238 case AMDGPU::S_CMP_LG_U32:
8239 case AMDGPU::S_CMP_GT_U32:
8240 case AMDGPU::S_CMP_GE_U32:
8241 case AMDGPU::S_CMP_LT_U32:
8242 case AMDGPU::S_CMP_LE_U32:
8243 case AMDGPU::S_CMP_EQ_U64:
8244 case AMDGPU::S_CMP_LG_U64:
8245 case AMDGPU::S_CMP_LT_F32:
8246 case AMDGPU::S_CMP_EQ_F32:
8247 case AMDGPU::S_CMP_LE_F32:
8248 case AMDGPU::S_CMP_GT_F32:
8249 case AMDGPU::S_CMP_LG_F32:
8250 case AMDGPU::S_CMP_GE_F32:
8251 case AMDGPU::S_CMP_O_F32:
8252 case AMDGPU::S_CMP_U_F32:
8253 case AMDGPU::S_CMP_NGE_F32:
8254 case AMDGPU::S_CMP_NLG_F32:
8255 case AMDGPU::S_CMP_NGT_F32:
8256 case AMDGPU::S_CMP_NLE_F32:
8257 case AMDGPU::S_CMP_NEQ_F32:
8258 case AMDGPU::S_CMP_NLT_F32: {
8259 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8260 auto NewInstr =
8261 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8262 .setMIFlags(Inst.getFlags());
8263 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
8264 0) {
8265 NewInstr
8266 .addImm(0) // src0_modifiers
8267 .add(Inst.getOperand(0)) // src0
8268 .addImm(0) // src1_modifiers
8269 .add(Inst.getOperand(1)) // src1
8270 .addImm(0); // clamp
8271 } else {
8272 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
8273 }
8274 legalizeOperands(*NewInstr, MDT);
8275 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8276 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8277 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8278 Inst.eraseFromParent();
8279 return;
8280 }
8281 case AMDGPU::S_CMP_LT_F16:
8282 case AMDGPU::S_CMP_EQ_F16:
8283 case AMDGPU::S_CMP_LE_F16:
8284 case AMDGPU::S_CMP_GT_F16:
8285 case AMDGPU::S_CMP_LG_F16:
8286 case AMDGPU::S_CMP_GE_F16:
8287 case AMDGPU::S_CMP_O_F16:
8288 case AMDGPU::S_CMP_U_F16:
8289 case AMDGPU::S_CMP_NGE_F16:
8290 case AMDGPU::S_CMP_NLG_F16:
8291 case AMDGPU::S_CMP_NGT_F16:
8292 case AMDGPU::S_CMP_NLE_F16:
8293 case AMDGPU::S_CMP_NEQ_F16:
8294 case AMDGPU::S_CMP_NLT_F16: {
8295 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
8296 auto NewInstr =
8297 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
8298 .setMIFlags(Inst.getFlags());
8299 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
8300 NewInstr
8301 .addImm(0) // src0_modifiers
8302 .add(Inst.getOperand(0)) // src0
8303 .addImm(0) // src1_modifiers
8304 .add(Inst.getOperand(1)) // src1
8305 .addImm(0); // clamp
8306 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8307 NewInstr.addImm(0); // op_sel0
8308 } else {
8309 NewInstr
8310 .add(Inst.getOperand(0))
8311 .add(Inst.getOperand(1));
8312 }
8313 legalizeOperandsVALUt16(*NewInstr, MRI);
8314 legalizeOperands(*NewInstr, MDT);
8315 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
8316 const MachineOperand &SCCOp = Inst.getOperand(SCCIdx);
8317 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
8318 Inst.eraseFromParent();
8319 return;
8320 }
8321 case AMDGPU::S_CVT_HI_F32_F16: {
8322 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8323 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8324 if (ST.useRealTrue16Insts()) {
8325 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
8326 .add(Inst.getOperand(1));
8327 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8328 .addImm(0) // src0_modifiers
8329 .addReg(TmpReg, {}, AMDGPU::hi16)
8330 .addImm(0) // clamp
8331 .addImm(0) // omod
8332 .addImm(0); // op_sel0
8333 } else {
8334 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8335 .addImm(16)
8336 .add(Inst.getOperand(1));
8337 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8338 .addImm(0) // src0_modifiers
8339 .addReg(TmpReg)
8340 .addImm(0) // clamp
8341 .addImm(0); // omod
8342 }
8343
8344 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8345 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8346 Inst.eraseFromParent();
8347 return;
8348 }
8349 case AMDGPU::S_MINIMUM_F32:
8350 case AMDGPU::S_MAXIMUM_F32: {
8351 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8352 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8353 .addImm(0) // src0_modifiers
8354 .add(Inst.getOperand(1))
8355 .addImm(0) // src1_modifiers
8356 .add(Inst.getOperand(2))
8357 .addImm(0) // clamp
8358 .addImm(0); // omod
8359 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8360
8361 legalizeOperands(*NewInstr, MDT);
8362 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8363 Inst.eraseFromParent();
8364 return;
8365 }
8366 case AMDGPU::S_MINIMUM_F16:
8367 case AMDGPU::S_MAXIMUM_F16: {
8368 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8369 ? &AMDGPU::VGPR_16RegClass
8370 : &AMDGPU::VGPR_32RegClass);
8371 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8372 .addImm(0) // src0_modifiers
8373 .add(Inst.getOperand(1))
8374 .addImm(0) // src1_modifiers
8375 .add(Inst.getOperand(2))
8376 .addImm(0) // clamp
8377 .addImm(0) // omod
8378 .addImm(0); // opsel0
8379 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8380 legalizeOperandsVALUt16(*NewInstr, MRI);
8381 legalizeOperands(*NewInstr, MDT);
8382 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8383 Inst.eraseFromParent();
8384 return;
8385 }
8386 case AMDGPU::V_S_EXP_F16_e64:
8387 case AMDGPU::V_S_LOG_F16_e64:
8388 case AMDGPU::V_S_RCP_F16_e64:
8389 case AMDGPU::V_S_RSQ_F16_e64:
8390 case AMDGPU::V_S_SQRT_F16_e64: {
8391 Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
8392 ? &AMDGPU::VGPR_16RegClass
8393 : &AMDGPU::VGPR_32RegClass);
8394 auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
8395 .add(Inst.getOperand(1)) // src0_modifiers
8396 .add(Inst.getOperand(2))
8397 .add(Inst.getOperand(3)) // clamp
8398 .add(Inst.getOperand(4)) // omod
8399 .setMIFlags(Inst.getFlags());
8400 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
8401 NewInstr.addImm(0); // opsel0
8402 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
8403 legalizeOperandsVALUt16(*NewInstr, MRI);
8404 legalizeOperands(*NewInstr, MDT);
8405 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
8406 Inst.eraseFromParent();
8407 return;
8408 }
8409 }
8410
8411 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
8412 // We cannot move this instruction to the VALU, so we should try to
8413 // legalize its operands instead.
8414 legalizeOperands(Inst, MDT);
8415 return;
8416 }
8417 // Handle converting generic instructions like COPY-to-SGPR into
8418 // COPY-to-VGPR.
8419 if (NewOpcode == Opcode) {
8420 Register DstReg = Inst.getOperand(0).getReg();
8421 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
8422
8423 if (Inst.isCopy() && DstReg.isPhysical() &&
8424 Inst.getOperand(1).getReg().isVirtual()) {
8425 handleCopyToPhysHelper(Worklist, DstReg, Inst, MRI, WaterFalls,
8426 V2SPhyCopiesToErase);
8427 return;
8428 }
8429
8430 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual()) {
8431 Register NewDstReg = Inst.getOperand(1).getReg();
8432 const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
8433 if (const TargetRegisterClass *CommonRC =
8434 RI.getCommonSubClass(NewDstRC, SrcRC)) {
8435        // Instead of creating a copy where src and dst are the same register
8436        // class, we just replace all uses of dst with src. These kinds of
8437        // copies interfere with the heuristics MachineSink uses to decide
8438        // whether or not to split a critical edge, since the pass assumes
8439        // that copies will end up as machine instructions and not be
8440        // eliminated.
8441 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
8442 MRI.replaceRegWith(DstReg, NewDstReg);
8443 MRI.clearKillFlags(NewDstReg);
8444 Inst.getOperand(0).setReg(DstReg);
8445
8446 if (!MRI.constrainRegClass(NewDstReg, CommonRC))
8447 llvm_unreachable("failed to constrain register");
8448
8449 Inst.eraseFromParent();
8450
8451 for (MachineOperand &UseMO :
8452 make_early_inc_range(MRI.use_operands(NewDstReg))) {
8453 MachineInstr &UseMI = *UseMO.getParent();
8454
8455 // Legalize t16 operands since replaceReg is called after
8456 // addUsersToVALU.
8457          legalizeOperandsVALUt16(UseMI, MRI);
8458
8459 unsigned OpIdx = UseMI.getOperandNo(&UseMO);
8460 if (const TargetRegisterClass *OpRC =
8461 getRegClass(UseMI.getDesc(), OpIdx))
8462 MRI.constrainRegClass(NewDstReg, OpRC);
8463 }
8464
8465 return;
8466 }
8467 }
8468
8469    // If this is a v2s copy between a 16-bit and a 32-bit register,
8470    // replace the VGPR copy with a REG_SEQUENCE / EXTRACT_SUBREG.
8471    // This can be removed once we have sgpr16 in place.
8472 if (ST.useRealTrue16Insts() && Inst.isCopy() &&
8473 Inst.getOperand(1).getReg().isVirtual() &&
8474 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
8475 const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
8476 if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
8477 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8478 Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
8479 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8480 get(AMDGPU::IMPLICIT_DEF), Undef);
8481 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
8482 get(AMDGPU::REG_SEQUENCE), NewDstReg)
8483 .addReg(Inst.getOperand(1).getReg())
8484 .addImm(AMDGPU::lo16)
8485 .addReg(Undef)
8486 .addImm(AMDGPU::hi16);
8487 Inst.eraseFromParent();
8488 MRI.replaceRegWith(DstReg, NewDstReg);
8489 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8490 return;
8491 } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
8492 AMDGPU::lo16)) {
8493 Inst.getOperand(1).setSubReg(AMDGPU::lo16);
8494 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8495 MRI.replaceRegWith(DstReg, NewDstReg);
8496 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8497 return;
8498 }
8499 }
8500
8501 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
8502 MRI.replaceRegWith(DstReg, NewDstReg);
8503 legalizeOperands(Inst, MDT);
8504 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8505 return;
8506 }
8507
8508 // Use the new VALU Opcode.
8509 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
8510 .setMIFlags(Inst.getFlags());
8511 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
8512 // Intersperse VOP3 modifiers among the SALU operands.
8513 NewInstr->addOperand(Inst.getOperand(0));
8514 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8515 AMDGPU::OpName::src0_modifiers) >= 0)
8516 NewInstr.addImm(0);
8517 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
8518 const MachineOperand &Src = Inst.getOperand(1);
8519 NewInstr->addOperand(Src);
8520 }
8521
8522 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
8523 // We are converting these to a BFE, so we need to add the missing
8524 // operands for the size and offset.
8525 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
8526 NewInstr.addImm(0);
8527 NewInstr.addImm(Size);
8528 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
8529 // The VALU version adds the second operand to the result, so insert an
8530 // extra 0 operand.
8531 NewInstr.addImm(0);
8532 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
8533 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
8534 // If we need to move this to VGPRs, we need to unpack the second
8535 // operand back into the 2 separate ones for bit offset and width.
8536 assert(OffsetWidthOp.isImm() &&
8537 "Scalar BFE is only implemented for constant width and offset");
8538 uint32_t Imm = OffsetWidthOp.getImm();
8539
8540 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8541 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
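      // For example, Imm = 0x00100008 encodes offset 8 and width 16, i.e. the
      // BFE extracts bits [23:8] of the source.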
8542 NewInstr.addImm(Offset);
8543 NewInstr.addImm(BitWidth);
8544 } else {
8545 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8546 AMDGPU::OpName::src1_modifiers) >= 0)
8547 NewInstr.addImm(0);
8548 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
8549 NewInstr->addOperand(Inst.getOperand(2));
8550 if (AMDGPU::getNamedOperandIdx(NewOpcode,
8551 AMDGPU::OpName::src2_modifiers) >= 0)
8552 NewInstr.addImm(0);
8553 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
8554 NewInstr->addOperand(Inst.getOperand(3));
8555 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
8556 NewInstr.addImm(0);
8557 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
8558 NewInstr.addImm(0);
8559 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
8560 NewInstr.addImm(0);
8561 }
8562 } else {
8563 // Just copy the SALU operands.
8564 for (const MachineOperand &Op : Inst.explicit_operands())
8565 NewInstr->addOperand(Op);
8566 }
8567
8568  // Remove any references to SCC. Vector instructions can't read from it, and
8569  // we're just about to add the implicit use / defs of VCC, so we don't want
8570  // both.
8571 for (MachineOperand &Op : Inst.implicit_operands()) {
8572 if (Op.getReg() == AMDGPU::SCC) {
8573 // Only propagate through live-def of SCC.
8574 if (Op.isDef() && !Op.isDead())
8575 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
8576 if (Op.isUse())
8577 addSCCDefsToVALUWorklist(NewInstr, Worklist);
8578 }
8579 }
8580 Inst.eraseFromParent();
8581 Register NewDstReg;
8582 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
8583 Register DstReg = NewInstr->getOperand(0).getReg();
8584 assert(DstReg.isVirtual());
8585 // Update the destination register class.
8586 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
8587 assert(NewDstRC);
8588 NewDstReg = MRI.createVirtualRegister(NewDstRC);
8589 MRI.replaceRegWith(DstReg, NewDstReg);
8590 }
8591 fixImplicitOperands(*NewInstr);
8592
8593 legalizeOperandsVALUt16(*NewInstr, MRI);
8594
8595 // Legalize the operands
8596 legalizeOperands(*NewInstr, MDT);
8597 if (NewDstReg)
8598 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
8599}
8600
8601// Add/sub require special handling to deal with carry outs.
8602std::pair<bool, MachineBasicBlock *>
8603SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
8604 MachineDominatorTree *MDT) const {
8605 if (ST.hasAddNoCarryInsts()) {
8606 // Assume there is no user of scc since we don't select this in that case.
8607 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
8608 // is used.
8609
8610 MachineBasicBlock &MBB = *Inst.getParent();
8611 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8612
8613 Register OldDstReg = Inst.getOperand(0).getReg();
8614 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8615
8616 unsigned Opc = Inst.getOpcode();
8617 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
8618
8619 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
8620 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
8621
8622 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
8623 Inst.removeOperand(3);
8624
8625 Inst.setDesc(get(NewOpc));
8626 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
8627 Inst.addImplicitDefUseOperands(*MBB.getParent());
8628 MRI.replaceRegWith(OldDstReg, ResultReg);
8629 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
8630
8631 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8632 return std::pair(true, NewBB);
8633 }
8634
8635 return std::pair(false, nullptr);
8636}
8637
8638void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
8639 MachineDominatorTree *MDT) const {
8640
8641 MachineBasicBlock &MBB = *Inst.getParent();
8642 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8643 MachineBasicBlock::iterator MII = Inst;
8644 const DebugLoc &DL = Inst.getDebugLoc();
8645
8646 MachineOperand &Dest = Inst.getOperand(0);
8647 MachineOperand &Src0 = Inst.getOperand(1);
8648 MachineOperand &Src1 = Inst.getOperand(2);
8649 MachineOperand &Cond = Inst.getOperand(3);
8650
8651 Register CondReg = Cond.getReg();
8652 bool IsSCC = (CondReg == AMDGPU::SCC);
8653
8654 // If this is a trivial select where the condition is effectively not SCC
8655 // (CondReg is a source of copy to SCC), then the select is semantically
8656  // equivalent to copying CondReg. Hence, there is no need to create a
8657  // V_CNDMASK; we can just use CondReg and bail out.
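  // For example, dst = S_CSELECT -1, 0 simply reproduces the condition mask
  // already held in CondReg, so dst can be replaced by CondReg directly.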
8658 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
8659 (Src1.getImm() == 0)) {
8660 MRI.replaceRegWith(Dest.getReg(), CondReg);
8661 return;
8662 }
8663
8664 Register NewCondReg = CondReg;
8665 if (IsSCC) {
8666 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
8667 NewCondReg = MRI.createVirtualRegister(TC);
8668
8669    // Now look for the closest SCC def; if it is a copy, replace CondReg
8670    // with the COPY's source register.
8671 bool CopyFound = false;
8672 for (MachineInstr &CandI :
8673           make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
8674                      Inst.getParent()->rend())) {
8675 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
8676 -1) {
8677 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
8678 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
8679 .addReg(CandI.getOperand(1).getReg());
8680 CopyFound = true;
8681 }
8682 break;
8683 }
8684 }
8685 if (!CopyFound) {
8686 // SCC def is not a copy
8687 // Insert a trivial select instead of creating a copy, because a copy from
8688 // SCC would semantically mean just copying a single bit, but we may need
8689 // the result to be a vector condition mask that needs preserving.
8690 unsigned Opcode =
8691 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
8692 auto NewSelect =
8693 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
8694 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
8695 }
8696 }
8697
8698 Register NewDestReg = MRI.createVirtualRegister(
8699 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
8700 MachineInstr *NewInst;
8701 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
8702 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
8703 .addImm(0)
8704 .add(Src1) // False
8705 .addImm(0)
8706 .add(Src0) // True
8707 .addReg(NewCondReg);
8708 } else {
8709 NewInst =
8710 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
8711 .add(Src1) // False
8712 .add(Src0) // True
8713 .addReg(NewCondReg);
8714 }
8715 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
8716 legalizeOperands(*NewInst, MDT);
8717 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
8718}
8719
8720void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
8721 MachineInstr &Inst) const {
8722 MachineBasicBlock &MBB = *Inst.getParent();
8723 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8724 MachineBasicBlock::iterator MII = Inst;
8725 const DebugLoc &DL = Inst.getDebugLoc();
8726
8727 MachineOperand &Dest = Inst.getOperand(0);
8728 MachineOperand &Src = Inst.getOperand(1);
8729 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8730 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8731
8732 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8733 : AMDGPU::V_SUB_CO_U32_e32;
8734
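  // abs(x) is computed as max(x, 0 - x) using a signed max.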
8735 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
8736 .addImm(0)
8737 .addReg(Src.getReg());
8738
8739 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8740 .addReg(Src.getReg())
8741 .addReg(TmpReg);
8742
8743 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8744 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8745}
8746
8747void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
8748 MachineInstr &Inst) const {
8749 MachineBasicBlock &MBB = *Inst.getParent();
8750 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8751 MachineBasicBlock::iterator MII = Inst;
8752 const DebugLoc &DL = Inst.getDebugLoc();
8753
8754 MachineOperand &Dest = Inst.getOperand(0);
8755 MachineOperand &Src1 = Inst.getOperand(1);
8756 MachineOperand &Src2 = Inst.getOperand(2);
8757 Register SubResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8758 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8759 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8760
8761 unsigned SubOp = ST.hasAddNoCarryInsts() ? AMDGPU::V_SUB_U32_e32
8762 : AMDGPU::V_SUB_CO_U32_e32;
8763
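  // |src1 - src2| is computed as max(src1 - src2, -(src1 - src2)) using a
  // signed max.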
8764 BuildMI(MBB, MII, DL, get(SubOp), SubResultReg)
8765 .addReg(Src1.getReg())
8766 .addReg(Src2.getReg());
8767
8768 BuildMI(MBB, MII, DL, get(SubOp), TmpReg).addImm(0).addReg(SubResultReg);
8769
8770 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
8771 .addReg(SubResultReg)
8772 .addReg(TmpReg);
8773
8774 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8775 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8776}
8777
8778void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
8779 MachineInstr &Inst) const {
8780 MachineBasicBlock &MBB = *Inst.getParent();
8781 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8782 MachineBasicBlock::iterator MII = Inst;
8783 const DebugLoc &DL = Inst.getDebugLoc();
8784
8785 MachineOperand &Dest = Inst.getOperand(0);
8786 MachineOperand &Src0 = Inst.getOperand(1);
8787 MachineOperand &Src1 = Inst.getOperand(2);
8788
8789 if (ST.hasDLInsts()) {
8790 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8791 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
8792 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
8793
8794 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
8795 .add(Src0)
8796 .add(Src1);
8797
8798 MRI.replaceRegWith(Dest.getReg(), NewDest);
8799 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8800 } else {
8801 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
8802 // invert either source and then perform the XOR. If either source is a
8803 // scalar register, then we can leave the inversion on the scalar unit to
8804 // achieve a better distribution of scalar and vector instructions.
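    // For example, with an SGPR src0 this becomes s_not_b32 tmp, src0 followed
    // by s_xor_b32 dst, tmp, src1; the xor is queued on the worklist so a later
    // iteration can move it to the VALU if needed.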
8805 bool Src0IsSGPR = Src0.isReg() &&
8806 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
8807 bool Src1IsSGPR = Src1.isReg() &&
8808 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
8809 MachineInstr *Xor;
8810 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8811 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8812
8813 // Build a pair of scalar instructions and add them to the work list.
8814 // The next iteration over the work list will lower these to the vector
8815 // unit as necessary.
8816 if (Src0IsSGPR) {
8817 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
8818 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8819 .addReg(Temp)
8820 .add(Src1);
8821 } else if (Src1IsSGPR) {
8822 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
8823 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
8824 .add(Src0)
8825 .addReg(Temp);
8826 } else {
8827 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
8828 .add(Src0)
8829 .add(Src1);
8830 MachineInstr *Not =
8831 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
8832 Worklist.insert(Not);
8833 }
8834
8835 MRI.replaceRegWith(Dest.getReg(), NewDest);
8836
8837 Worklist.insert(Xor);
8838
8839 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8840 }
8841}
8842
8843void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
8844 MachineInstr &Inst,
8845 unsigned Opcode) const {
8846 MachineBasicBlock &MBB = *Inst.getParent();
8847 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8848 MachineBasicBlock::iterator MII = Inst;
8849 const DebugLoc &DL = Inst.getDebugLoc();
8850
8851 MachineOperand &Dest = Inst.getOperand(0);
8852 MachineOperand &Src0 = Inst.getOperand(1);
8853 MachineOperand &Src1 = Inst.getOperand(2);
8854
8855 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8856 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
8857
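  // Expansion: interm = (src0 <op> src1), then dst = ~interm (S_NOT_B32); both
  // new scalar instructions are queued for possible VALU conversion.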
8858 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
8859 .add(Src0)
8860 .add(Src1);
8861
8862 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
8863 .addReg(Interm);
8864
8865 Worklist.insert(&Op);
8866 Worklist.insert(&Not);
8867
8868 MRI.replaceRegWith(Dest.getReg(), NewDest);
8869 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8870}
8871
8872void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
8873 MachineInstr &Inst,
8874 unsigned Opcode) const {
8875 MachineBasicBlock &MBB = *Inst.getParent();
8876 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8877 MachineBasicBlock::iterator MII = Inst;
8878 const DebugLoc &DL = Inst.getDebugLoc();
8879
8880 MachineOperand &Dest = Inst.getOperand(0);
8881 MachineOperand &Src0 = Inst.getOperand(1);
8882 MachineOperand &Src1 = Inst.getOperand(2);
8883
8884 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8885 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
8886
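  // Expansion: interm = ~src1 (S_NOT_B32), then dst = (src0 <op> interm), which
  // implements the *n2 forms such as s_andn2/s_orn2.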
8887 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
8888 .add(Src1);
8889
8890 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
8891 .add(Src0)
8892 .addReg(Interm);
8893
8894 Worklist.insert(&Not);
8895 Worklist.insert(&Op);
8896
8897 MRI.replaceRegWith(Dest.getReg(), NewDest);
8898 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
8899}
8900
8901void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
8902 MachineInstr &Inst, unsigned Opcode,
8903 bool Swap) const {
8904 MachineBasicBlock &MBB = *Inst.getParent();
8905 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8906
8907 MachineOperand &Dest = Inst.getOperand(0);
8908 MachineOperand &Src0 = Inst.getOperand(1);
8909 const DebugLoc &DL = Inst.getDebugLoc();
8910
8911 MachineBasicBlock::iterator MII = Inst;
8912
8913 const MCInstrDesc &InstDesc = get(Opcode);
8914 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8915 MRI.getRegClass(Src0.getReg()) :
8916 &AMDGPU::SGPR_32RegClass;
8917
8918 const TargetRegisterClass *Src0SubRC =
8919 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8920
8921 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8922 AMDGPU::sub0, Src0SubRC);
8923
8924 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8925 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8926 const TargetRegisterClass *NewDestSubRC =
8927 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8928
8929 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8930 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8931
8932 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8933 AMDGPU::sub1, Src0SubRC);
8934
8935 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8936 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8937
8938 if (Swap)
8939 std::swap(DestSub0, DestSub1);
8940
8941 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8942 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8943 .addReg(DestSub0)
8944 .addImm(AMDGPU::sub0)
8945 .addReg(DestSub1)
8946 .addImm(AMDGPU::sub1);
8947
8948 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8949
8950 Worklist.insert(&LoHalf);
8951 Worklist.insert(&HiHalf);
8952
8953 // We don't need to legalizeOperands here because for a single operand, src0
8954 // will support any kind of input.
8955
8956 // Move all users of this moved value.
8957 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8958}
8959
8960// There is no vector equivalent of s_mul_u64, so we need to split the
8961// s_mul_u64 into 32-bit vector multiplications.
8962void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8963 MachineInstr &Inst,
8964 MachineDominatorTree *MDT) const {
8965 MachineBasicBlock &MBB = *Inst.getParent();
8966 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8967
8968 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8969 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8970 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8971
8972 MachineOperand &Dest = Inst.getOperand(0);
8973 MachineOperand &Src0 = Inst.getOperand(1);
8974 MachineOperand &Src1 = Inst.getOperand(2);
8975 const DebugLoc &DL = Inst.getDebugLoc();
8976 MachineBasicBlock::iterator MII = Inst;
8977
8978 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8979 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8980 const TargetRegisterClass *Src0SubRC =
8981 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8982 if (RI.isSGPRClass(Src0SubRC))
8983 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8984 const TargetRegisterClass *Src1SubRC =
8985 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8986 if (RI.isSGPRClass(Src1SubRC))
8987 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8988
8989 // First, we extract the low 32-bit and high 32-bit values from each of the
8990 // operands.
8991 MachineOperand Op0L =
8992 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8993 MachineOperand Op1L =
8994 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8995 MachineOperand Op0H =
8996 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8997 MachineOperand Op1H =
8998 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8999
9000  // The multiplication is done as follows:
9001 //
9002 // Op1H Op1L
9003 // * Op0H Op0L
9004 // --------------------
9005 // Op1H*Op0L Op1L*Op0L
9006 // + Op1H*Op0H Op1L*Op0H
9007 // -----------------------------------------
9008 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
9009 //
9010  // We drop Op1H*Op0H because it only contributes to bits above 63, which are
9011  // not part of the 64-bit result.
9012 // The low 32-bit value is Op1L*Op0L.
9013 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
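  //
  // For example, Op0 = 0x0000000200000003 and Op1 = 0x0000000400000005 give
  // Op1L*Op0L = 15 (carry 0), Op1H*Op0L = 12 and Op1L*Op0H = 10, so the result
  // is hi = 12 + 10 + 0 = 0x16 and lo = 0xf, i.e. 0x000000160000000f.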
9014
9015 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9016 MachineInstr *Op1L_Op0H =
9017 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
9018 .add(Op1L)
9019 .add(Op0H);
9020
9021 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9022 MachineInstr *Op1H_Op0L =
9023 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
9024 .add(Op1H)
9025 .add(Op0L);
9026
9027 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9028 MachineInstr *Carry =
9029 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
9030 .add(Op1L)
9031 .add(Op0L);
9032
9033 MachineInstr *LoHalf =
9034 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9035 .add(Op1L)
9036 .add(Op0L);
9037
9038 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9039 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
9040 .addReg(Op1L_Op0H_Reg)
9041 .addReg(Op1H_Op0L_Reg);
9042
9043 MachineInstr *HiHalf =
9044 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
9045 .addReg(AddReg)
9046 .addReg(CarryReg);
9047
9048 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9049 .addReg(DestSub0)
9050 .addImm(AMDGPU::sub0)
9051 .addReg(DestSub1)
9052 .addImm(AMDGPU::sub1);
9053
9054 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9055
9056 // Try to legalize the operands in case we need to swap the order to keep it
9057 // valid.
9058 legalizeOperands(*Op1L_Op0H, MDT);
9059 legalizeOperands(*Op1H_Op0L, MDT);
9060 legalizeOperands(*Carry, MDT);
9061 legalizeOperands(*LoHalf, MDT);
9062 legalizeOperands(*Add, MDT);
9063 legalizeOperands(*HiHalf, MDT);
9064
9065 // Move all users of this moved value.
9066 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9067}
9068
9069// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
9070// multiplications.
9071void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
9072 MachineInstr &Inst,
9073 MachineDominatorTree *MDT) const {
9074 MachineBasicBlock &MBB = *Inst.getParent();
9075 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9076
9077 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9078 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9079 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9080
9081 MachineOperand &Dest = Inst.getOperand(0);
9082 MachineOperand &Src0 = Inst.getOperand(1);
9083 MachineOperand &Src1 = Inst.getOperand(2);
9084 const DebugLoc &DL = Inst.getDebugLoc();
9085 MachineBasicBlock::iterator MII = Inst;
9086
9087 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
9088 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
9089 const TargetRegisterClass *Src0SubRC =
9090 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9091 if (RI.isSGPRClass(Src0SubRC))
9092 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
9093 const TargetRegisterClass *Src1SubRC =
9094 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9095 if (RI.isSGPRClass(Src1SubRC))
9096 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
9097
9098 // First, we extract the low 32-bit and high 32-bit values from each of the
9099 // operands.
9100 MachineOperand Op0L =
9101 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
9102 MachineOperand Op1L =
9103 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
9104
9105 unsigned Opc = Inst.getOpcode();
9106 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
9107 ? AMDGPU::V_MUL_HI_U32_e64
9108 : AMDGPU::V_MUL_HI_I32_e64;
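  // These pseudos carry the guarantee that both operands are zero-/sign-extended
  // 32-bit values, so the full 64-bit product is just the {mul_hi, mul_lo} pair
  // of the two low halves.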
9109 MachineInstr *HiHalf =
9110 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
9111
9112 MachineInstr *LoHalf =
9113 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
9114 .add(Op1L)
9115 .add(Op0L);
9116
9117 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9118 .addReg(DestSub0)
9119 .addImm(AMDGPU::sub0)
9120 .addReg(DestSub1)
9121 .addImm(AMDGPU::sub1);
9122
9123 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9124
9125 // Try to legalize the operands in case we need to swap the order to keep it
9126 // valid.
9127 legalizeOperands(*HiHalf, MDT);
9128 legalizeOperands(*LoHalf, MDT);
9129
9130 // Move all users of this moved value.
9131 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9132}
9133
9134void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
9135 MachineInstr &Inst, unsigned Opcode,
9136 MachineDominatorTree *MDT) const {
9137 MachineBasicBlock &MBB = *Inst.getParent();
9138 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9139
9140 MachineOperand &Dest = Inst.getOperand(0);
9141 MachineOperand &Src0 = Inst.getOperand(1);
9142 MachineOperand &Src1 = Inst.getOperand(2);
9143 const DebugLoc &DL = Inst.getDebugLoc();
9144
9145 MachineBasicBlock::iterator MII = Inst;
9146
9147 const MCInstrDesc &InstDesc = get(Opcode);
9148 const TargetRegisterClass *Src0RC = Src0.isReg() ?
9149 MRI.getRegClass(Src0.getReg()) :
9150 &AMDGPU::SGPR_32RegClass;
9151
9152 const TargetRegisterClass *Src0SubRC =
9153 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
9154 const TargetRegisterClass *Src1RC = Src1.isReg() ?
9155 MRI.getRegClass(Src1.getReg()) :
9156 &AMDGPU::SGPR_32RegClass;
9157
9158 const TargetRegisterClass *Src1SubRC =
9159 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
9160
9161 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9162 AMDGPU::sub0, Src0SubRC);
9163 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9164 AMDGPU::sub0, Src1SubRC);
9165 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
9166 AMDGPU::sub1, Src0SubRC);
9167 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
9168 AMDGPU::sub1, Src1SubRC);
9169
9170 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9171 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
9172 const TargetRegisterClass *NewDestSubRC =
9173 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
9174
9175 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
9176 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
9177 .add(SrcReg0Sub0)
9178 .add(SrcReg1Sub0);
9179
9180 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
9181 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
9182 .add(SrcReg0Sub1)
9183 .add(SrcReg1Sub1);
9184
9185 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
9186 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
9187 .addReg(DestSub0)
9188 .addImm(AMDGPU::sub0)
9189 .addReg(DestSub1)
9190 .addImm(AMDGPU::sub1);
9191
9192 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
9193
9194 Worklist.insert(&LoHalf);
9195 Worklist.insert(&HiHalf);
9196
9197 // Move all users of this moved value.
9198 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
9199}
9200
9201void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
9202 MachineInstr &Inst,
9203 MachineDominatorTree *MDT) const {
9204 MachineBasicBlock &MBB = *Inst.getParent();
9205 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9206
9207 MachineOperand &Dest = Inst.getOperand(0);
9208 MachineOperand &Src0 = Inst.getOperand(1);
9209 MachineOperand &Src1 = Inst.getOperand(2);
9210 const DebugLoc &DL = Inst.getDebugLoc();
9211
9212 MachineBasicBlock::iterator MII = Inst;
9213
9214 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
9215
9216 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
9217
9218 MachineOperand* Op0;
9219 MachineOperand* Op1;
9220
9221 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
9222 Op0 = &Src0;
9223 Op1 = &Src1;
9224 } else {
9225 Op0 = &Src1;
9226 Op1 = &Src0;
9227 }
9228
9229 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
9230 .add(*Op0);
9231
9232 Register NewDest = MRI.createVirtualRegister(DestRC);
9233
9234 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
9235 .addReg(Interm)
9236 .add(*Op1);
9237
9238 MRI.replaceRegWith(Dest.getReg(), NewDest);
9239
9240 Worklist.insert(&Xor);
9241}
9242
9243void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
9244 MachineInstr &Inst) const {
9245 MachineBasicBlock &MBB = *Inst.getParent();
9246 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9247
9248 MachineBasicBlock::iterator MII = Inst;
9249 const DebugLoc &DL = Inst.getDebugLoc();
9250
9251 MachineOperand &Dest = Inst.getOperand(0);
9252 MachineOperand &Src = Inst.getOperand(1);
9253
9254 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
9255 const TargetRegisterClass *SrcRC = Src.isReg() ?
9256 MRI.getRegClass(Src.getReg()) :
9257 &AMDGPU::SGPR_32RegClass;
9258
9259 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9260 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9261
9262 const TargetRegisterClass *SrcSubRC =
9263 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9264
9265 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9266 AMDGPU::sub0, SrcSubRC);
9267 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
9268 AMDGPU::sub1, SrcSubRC);
9269
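  // V_BCNT_U32_B32 computes countbits(src0) + src1, so the two halves
  // accumulate: MidReg = bcnt(lo) + 0 and ResultReg = bcnt(hi) + MidReg.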
9270 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
9271
9272 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
9273
9274 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9275
9276 // We don't need to legalize operands here. src0 for either instruction can be
9277 // an SGPR, and the second input is unused or determined here.
9278 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9279}
9280
9281void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
9282 MachineInstr &Inst) const {
9283 MachineBasicBlock &MBB = *Inst.getParent();
9284 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9285 MachineBasicBlock::iterator MII = Inst;
9286 const DebugLoc &DL = Inst.getDebugLoc();
9287
9288 MachineOperand &Dest = Inst.getOperand(0);
9289 uint32_t Imm = Inst.getOperand(2).getImm();
9290 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
9291 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
9292
9293 (void) Offset;
9294
9295 // Only sext_inreg cases handled.
9296 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
9297 Offset == 0 && "Not implemented");
9298
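  // For example, a width of 8 (Imm = 0x80000) sign-extends the low 8 bits: the
  // low half is produced by V_BFE_I32 and the high half replicates its sign bit
  // via an arithmetic shift right by 31.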
9299 if (BitWidth < 32) {
9300 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9301 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9302 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9303
9304 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
9305 .addReg(Inst.getOperand(1).getReg(), {}, AMDGPU::sub0)
9306 .addImm(0)
9307 .addImm(BitWidth);
9308
9309 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
9310 .addImm(31)
9311 .addReg(MidRegLo);
9312
9313 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9314 .addReg(MidRegLo)
9315 .addImm(AMDGPU::sub0)
9316 .addReg(MidRegHi)
9317 .addImm(AMDGPU::sub1);
9318
9319 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9320 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9321 return;
9322 }
9323
9324 MachineOperand &Src = Inst.getOperand(1);
9325 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9326 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
9327
9328 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
9329 .addImm(31)
9330 .addReg(Src.getReg(), {}, AMDGPU::sub0);
9331
9332 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
9333 .addReg(Src.getReg(), {}, AMDGPU::sub0)
9334 .addImm(AMDGPU::sub0)
9335 .addReg(TmpReg)
9336 .addImm(AMDGPU::sub1);
9337
9338 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9339 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9340}
9341
9342void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
9343 MachineInstr &Inst, unsigned Opcode,
9344 MachineDominatorTree *MDT) const {
9345  // (S_FLBIT_I32_B64 hi:lo) ->
9346  //    (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
9347  // (S_FF1_I32_B64 hi:lo) ->
9348  //    (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
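  //
  // For example, ctlz of 0x0000000000010000: ffbh(hi = 0) = 0xffffffff,
  // ffbh(lo) = 15, uaddsat(15, 32) = 47, and the umin yields 47, the number of
  // leading zeros of the full 64-bit value.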
9349
9350 MachineBasicBlock &MBB = *Inst.getParent();
9351 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9352 MachineBasicBlock::iterator MII = Inst;
9353 const DebugLoc &DL = Inst.getDebugLoc();
9354
9355 MachineOperand &Dest = Inst.getOperand(0);
9356 MachineOperand &Src = Inst.getOperand(1);
9357
9358 const MCInstrDesc &InstDesc = get(Opcode);
9359
9360 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
9361 unsigned OpcodeAdd = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64
9362 : AMDGPU::V_ADD_CO_U32_e32;
9363
9364 const TargetRegisterClass *SrcRC =
9365 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
9366 const TargetRegisterClass *SrcSubRC =
9367 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
9368
9369 MachineOperand SrcRegSub0 =
9370 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
9371 MachineOperand SrcRegSub1 =
9372 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
9373
9374 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9375 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9376 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9377 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9378
9379 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
9380
9381 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
9382
9383 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
9384 .addReg(IsCtlz ? MidReg1 : MidReg2)
9385 .addImm(32)
9386 .addImm(1); // enable clamp
9387
9388 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
9389 .addReg(MidReg3)
9390 .addReg(IsCtlz ? MidReg2 : MidReg1);
9391
9392 MRI.replaceRegWith(Dest.getReg(), MidReg4);
9393
9394 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
9395}
9396
9397void SIInstrInfo::addUsersToMoveToVALUWorklist(
9398 Register DstReg, MachineRegisterInfo &MRI,
9399 SIInstrWorklist &Worklist) const {
9400 for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(DstReg))) {
9401 MachineInstr &UseMI = *MO.getParent();
9402
9403 unsigned OpNo = 0;
9404
9405 switch (UseMI.getOpcode()) {
9406 case AMDGPU::COPY:
9407 case AMDGPU::WQM:
9408 case AMDGPU::SOFT_WQM:
9409 case AMDGPU::STRICT_WWM:
9410 case AMDGPU::STRICT_WQM:
9411 case AMDGPU::REG_SEQUENCE:
9412 case AMDGPU::PHI:
9413 case AMDGPU::INSERT_SUBREG:
9414 break;
9415 default:
9416 OpNo = MO.getOperandNo();
9417 break;
9418 }
9419
9420 const TargetRegisterClass *OpRC = getOpRegClass(UseMI, OpNo);
9421 MRI.constrainRegClass(DstReg, OpRC);
9422
9423 if (!RI.hasVectorRegisters(OpRC))
9424 Worklist.insert(&UseMI);
9425 else
9426 // Legalization could change user list.
9427 legalizeOperandsVALUt16(UseMI, OpNo, MRI);
9428 }
9429}
9430
9431void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
9433 MachineInstr &Inst) const {
9434 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9435 MachineBasicBlock *MBB = Inst.getParent();
9436 MachineOperand &Src0 = Inst.getOperand(1);
9437 MachineOperand &Src1 = Inst.getOperand(2);
9438 const DebugLoc &DL = Inst.getDebugLoc();
9439
9440 if (ST.useRealTrue16Insts()) {
9441 Register SrcReg0, SrcReg1;
9442 if (!Src0.isReg() || !RI.isVGPR(MRI, Src0.getReg())) {
9443 SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9444 BuildMI(*MBB, Inst, DL,
9445 get(Src0.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg0)
9446 .add(Src0);
9447 } else {
9448 SrcReg0 = Src0.getReg();
9449 }
9450
9451 if (!Src1.isReg() || !RI.isVGPR(MRI, Src1.getReg())) {
9452 SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9453 BuildMI(*MBB, Inst, DL,
9454 get(Src1.isImm() ? AMDGPU::V_MOV_B32_e32 : AMDGPU::COPY), SrcReg1)
9455 .add(Src1);
9456 } else {
9457 SrcReg1 = Src1.getReg();
9458 }
9459
9460 bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
9461 bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);
9462
9463 auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
9464 switch (Inst.getOpcode()) {
9465 case AMDGPU::S_PACK_LL_B32_B16:
9466 NewMI
9467 .addReg(SrcReg0, {},
9468 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9469 .addImm(AMDGPU::lo16)
9470 .addReg(SrcReg1, {},
9471 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9472 .addImm(AMDGPU::hi16);
9473 break;
9474 case AMDGPU::S_PACK_LH_B32_B16:
9475 NewMI
9476 .addReg(SrcReg0, {},
9477 isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9478 .addImm(AMDGPU::lo16)
9479 .addReg(SrcReg1, {}, AMDGPU::hi16)
9480 .addImm(AMDGPU::hi16);
9481 break;
9482 case AMDGPU::S_PACK_HL_B32_B16:
9483 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9484 .addImm(AMDGPU::lo16)
9485 .addReg(SrcReg1, {},
9486 isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
9487 .addImm(AMDGPU::hi16);
9488 break;
9489 case AMDGPU::S_PACK_HH_B32_B16:
9490 NewMI.addReg(SrcReg0, {}, AMDGPU::hi16)
9491 .addImm(AMDGPU::lo16)
9492 .addReg(SrcReg1, {}, AMDGPU::hi16)
9493 .addImm(AMDGPU::hi16);
9494 break;
9495 default:
9496 llvm_unreachable("unhandled s_pack_* instruction");
9497 }
9498
9499 MachineOperand &Dest = Inst.getOperand(0);
9500 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9501 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9502 return;
9503 }
9504
9505 switch (Inst.getOpcode()) {
9506 case AMDGPU::S_PACK_LL_B32_B16: {
9507 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9508 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9509
9510 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
9511 // 0.
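    // The expansion computes ResultReg = (Src1 << 16) | (Src0 & 0xffff) using
    // V_AND_B32 and V_LSHL_OR_B32.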
9512 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9513 .addImm(0xffff);
9514
9515 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
9516 .addReg(ImmReg, RegState::Kill)
9517 .add(Src0);
9518
9519 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9520 .add(Src1)
9521 .addImm(16)
9522 .addReg(TmpReg, RegState::Kill);
9523 break;
9524 }
9525 case AMDGPU::S_PACK_LH_B32_B16: {
9526 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9527 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9528 .addImm(0xffff);
9529 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
9530 .addReg(ImmReg, RegState::Kill)
9531 .add(Src0)
9532 .add(Src1);
9533 break;
9534 }
9535 case AMDGPU::S_PACK_HL_B32_B16: {
9536 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9537 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9538 .addImm(16)
9539 .add(Src0);
9540 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
9541 .add(Src1)
9542 .addImm(16)
9543 .addReg(TmpReg, RegState::Kill);
9544 break;
9545 }
9546 case AMDGPU::S_PACK_HH_B32_B16: {
9547 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9548 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
9549 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
9550 .addImm(16)
9551 .add(Src0);
9552 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
9553 .addImm(0xffff0000);
9554 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
9555 .add(Src1)
9556 .addReg(ImmReg, RegState::Kill)
9557 .addReg(TmpReg, RegState::Kill);
9558 break;
9559 }
9560 default:
9561 llvm_unreachable("unhandled s_pack_* instruction");
9562 }
9563
9564 MachineOperand &Dest = Inst.getOperand(0);
9565 MRI.replaceRegWith(Dest.getReg(), ResultReg);
9566 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
9567}
9568
9569void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op,
9570 MachineInstr &SCCDefInst,
9571 SIInstrWorklist &Worklist,
9572 Register NewCond) const {
9573
9574 // Ensure that def inst defines SCC, which is still live.
9575 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
9576 !Op.isDead() && Op.getParent() == &SCCDefInst);
9577 SmallVector<MachineInstr *, 4> CopyToDelete;
9578 // This assumes that all the users of SCC are in the same block
9579 // as the SCC def.
9580 for (MachineInstr &MI : // Skip the def inst itself.
9581 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
9582 SCCDefInst.getParent()->end())) {
9583 // Check if SCC is used first.
9584 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
9585 if (SCCIdx != -1) {
9586 if (MI.isCopy()) {
9587 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9588 Register DestReg = MI.getOperand(0).getReg();
9589
9590 MRI.replaceRegWith(DestReg, NewCond);
9591 CopyToDelete.push_back(&MI);
9592 } else {
9593
9594 if (NewCond.isValid())
9595 MI.getOperand(SCCIdx).setReg(NewCond);
9596
9597 Worklist.insert(&MI);
9598 }
9599 }
9600 // Exit if we find another SCC def.
9601 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
9602 break;
9603 }
9604 for (auto &Copy : CopyToDelete)
9605 Copy->eraseFromParent();
9606}
9607
9608// Instructions that use SCC may be converted to VALU instructions. When that
9609// happens, the SCC register is changed to VCC_LO. The instruction that defines
9610// SCC must be changed to an instruction that defines VCC. This function makes
9611// sure that the instruction that defines SCC is added to the moveToVALU
9612// worklist.
9613void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
9614 SIInstrWorklist &Worklist) const {
9615 // Look for a preceding instruction that either defines VCC or SCC. If VCC
9616 // then there is nothing to do because the defining instruction has been
9617 // converted to a VALU already. If SCC then that instruction needs to be
9618 // converted to a VALU.
9619 for (MachineInstr &MI :
9620 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
9621 SCCUseInst->getParent()->rend())) {
9622 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
9623 break;
9624 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
9625 Worklist.insert(&MI);
9626 break;
9627 }
9628 }
9629}
9630
9631const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
9632 const MachineInstr &Inst) const {
9633 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
9634
9635 switch (Inst.getOpcode()) {
9636 // For target instructions, getOpRegClass just returns the virtual register
9637 // class associated with the operand, so we need to find an equivalent VGPR
9638 // register class in order to move the instruction to the VALU.
9639 case AMDGPU::COPY:
9640 case AMDGPU::PHI:
9641 case AMDGPU::REG_SEQUENCE:
9642 case AMDGPU::INSERT_SUBREG:
9643 case AMDGPU::WQM:
9644 case AMDGPU::SOFT_WQM:
9645 case AMDGPU::STRICT_WWM:
9646 case AMDGPU::STRICT_WQM: {
9647 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
9648 if (RI.isAGPRClass(SrcRC)) {
9649 if (RI.isAGPRClass(NewDstRC))
9650 return nullptr;
9651
9652 switch (Inst.getOpcode()) {
9653 case AMDGPU::PHI:
9654 case AMDGPU::REG_SEQUENCE:
9655 case AMDGPU::INSERT_SUBREG:
9656 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
9657 break;
9658 default:
9659 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9660 }
9661
9662 if (!NewDstRC)
9663 return nullptr;
9664 } else {
9665 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
9666 return nullptr;
9667
9668 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
9669 if (!NewDstRC)
9670 return nullptr;
9671 }
9672
9673 return NewDstRC;
9674 }
9675 default:
9676 return NewDstRC;
9677 }
9678}
9679
9680// Find the one SGPR operand we are allowed to use.
9681Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
9682 int OpIndices[3]) const {
9683 const MCInstrDesc &Desc = MI.getDesc();
9684
9685 // Find the one SGPR operand we are allowed to use.
9686 //
9687 // First we need to consider the instruction's operand requirements before
9688 // legalizing. Some operands are required to be SGPRs, such as implicit uses
9689 // of VCC, but we are still bound by the constant bus requirement to only use
9690 // one.
9691 //
9692 // If the operand's class is an SGPR, we can never move it.
9693
9694 Register SGPRReg = findImplicitSGPRRead(MI);
9695 if (SGPRReg)
9696 return SGPRReg;
9697
9698 Register UsedSGPRs[3] = {Register()};
9699 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
9700
9701 for (unsigned i = 0; i < 3; ++i) {
9702 int Idx = OpIndices[i];
9703 if (Idx == -1)
9704 break;
9705
9706 const MachineOperand &MO = MI.getOperand(Idx);
9707 if (!MO.isReg())
9708 continue;
9709
9710 // Is this operand statically required to be an SGPR based on the operand
9711 // constraints?
9712 const TargetRegisterClass *OpRC =
9713 RI.getRegClass(getOpRegClassID(Desc.operands()[Idx]));
9714 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
9715 if (IsRequiredSGPR)
9716 return MO.getReg();
9717
9718    // If this could be a VGPR or an SGPR, check the dynamic register class.
9719 Register Reg = MO.getReg();
9720 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
9721 if (RI.isSGPRClass(RegRC))
9722 UsedSGPRs[i] = Reg;
9723 }
9724
9725 // We don't have a required SGPR operand, so we have a bit more freedom in
9726 // selecting operands to move.
9727
9728 // Try to select the most used SGPR. If an SGPR is equal to one of the
9729 // others, we choose that.
9730 //
9731 // e.g.
9732 // V_FMA_F32 v0, s0, s0, s0 -> No moves
9733 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
9734
9735 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
9736 // prefer those.
9737
9738 if (UsedSGPRs[0]) {
9739 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
9740 SGPRReg = UsedSGPRs[0];
9741 }
9742
9743 if (!SGPRReg && UsedSGPRs[1]) {
9744 if (UsedSGPRs[1] == UsedSGPRs[2])
9745 SGPRReg = UsedSGPRs[1];
9746 }
9747
9748 return SGPRReg;
9749}
9750
9751MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
9752                                             AMDGPU::OpName OperandName) const {
9753 if (OperandName == AMDGPU::OpName::NUM_OPERAND_NAMES)
9754 return nullptr;
9755
9756 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
9757 if (Idx == -1)
9758 return nullptr;
9759
9760 return &MI.getOperand(Idx);
9761}
9762
9764 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
9765 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11
9768 return (Format << 44) |
9769 (1ULL << 56) | // RESOURCE_LEVEL = 1
9770 (3ULL << 60); // OOB_SELECT = 3
9771 }
9772
9773 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
9774 if (ST.isAmdHsaOS()) {
9775 // Set ATC = 1. GFX9 doesn't have this bit.
9776 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9777 RsrcDataFormat |= (1ULL << 56);
9778
9779 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
9780 // BTW, it disables TC L2 and therefore decreases performance.
9781 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
9782 RsrcDataFormat |= (2ULL << 59);
9783 }
9784
9785 return RsrcDataFormat;
9786}
9787
9791 0xffffffff; // Size;
9792
9793 // GFX9 doesn't have ELEMENT_SIZE.
9794 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
9795 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
9796 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
9797 }
9798
9799  // IndexStride encoding: 3 selects a 64-element stride, 2 a 32-element stride
9800  // (i.e. the wave size).
9800 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
9801 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
9802
9803 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
9804 // Clear them unless we want a huge stride.
9805 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
9806 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
9807 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
9808
9809 return Rsrc23;
9810}
9811
9813 unsigned Opc = MI.getOpcode();
9814
9815 return isSMRD(Opc);
9816}
9817
9819 return get(Opc).mayLoad() &&
9820 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
9821}
9822
9824 TypeSize &MemBytes) const {
9825 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
9826 if (!Addr || !Addr->isFI())
9827 return Register();
9828
9829 assert(!MI.memoperands_empty() &&
9830 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
9831
9832 FrameIndex = Addr->getIndex();
9833
9834 int VDataIdx =
9835 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
9836 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), VDataIdx));
9837 return MI.getOperand(VDataIdx).getReg();
9838}
9839
9841 TypeSize &MemBytes) const {
9842 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
9843 assert(Addr && Addr->isFI());
9844 FrameIndex = Addr->getIndex();
9845
9846 int DataIdx =
9847 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::data);
9848 MemBytes = TypeSize::getFixed(getOpSize(MI.getOpcode(), DataIdx));
9849 return MI.getOperand(DataIdx).getReg();
9850}
9851
9853 int &FrameIndex,
9854 TypeSize &MemBytes) const {
9855 if (!MI.mayLoad())
9856 return Register();
9857
9858 if (isMUBUF(MI) || isVGPRSpill(MI))
9859 return isStackAccess(MI, FrameIndex, MemBytes);
9860
9861 if (isSGPRSpill(MI))
9862 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9863
9864 return Register();
9865}
9866
9868 int &FrameIndex,
9869 TypeSize &MemBytes) const {
9870 if (!MI.mayStore())
9871 return Register();
9872
9873 if (isMUBUF(MI) || isVGPRSpill(MI))
9874 return isStackAccess(MI, FrameIndex, MemBytes);
9875
9876 if (isSGPRSpill(MI))
9877 return isSGPRStackAccess(MI, FrameIndex, MemBytes);
9878
9879 return Register();
9880}
9881
9883 unsigned Opc = MI.getOpcode();
9885 unsigned DescSize = Desc.getSize();
9886
9887 // If we have a definitive size, we can use it. Otherwise we need to inspect
9888 // the operands to know the size.
9889 if (isFixedSize(MI)) {
9890 unsigned Size = DescSize;
9891
9892    // If we hit the buggy offset, an extra nop will be inserted in MC, so
9893    // estimate the worst case.
9894 if (MI.isBranch() && ST.hasOffset3fBug())
9895 Size += 4;
9896
9897 return Size;
9898 }
9899
9900 // Instructions may have a 32-bit literal encoded after them. Check
9901 // operands that could ever be literals.
9902 if (isVALU(MI) || isSALU(MI)) {
9903 if (isDPP(MI))
9904 return DescSize;
9905 bool HasLiteral = false;
9906 unsigned LiteralSize = 4;
9907 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
9908 const MachineOperand &Op = MI.getOperand(I);
9909 const MCOperandInfo &OpInfo = Desc.operands()[I];
9910 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
9911 HasLiteral = true;
9912 if (ST.has64BitLiterals()) {
9913 switch (OpInfo.OperandType) {
9914 default:
9915 break;
9917 if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true))
9918 LiteralSize = 8;
9919 break;
9921        // A 32-bit literal is only valid when the value fits in BOTH the signed
9922        // and unsigned 32-bit ranges, i.e. [0, 2^31-1], matching the MC code
9923        // emitter's getLit64Encoding logic. Because we cannot tell the
9924        // signedness of the literal here, we have to be conservative and
9925        // assume values outside this range require a 64-bit literal
9926        // encoding (8 bytes).
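        // For example, the immediate 0xffffffff fits the unsigned range but not
        // the signed one, so it is conservatively given a 64-bit encoding.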
9927 if (!Op.isImm() || !isInt<32>(Op.getImm()) ||
9928 !isUInt<32>(Op.getImm()))
9929 LiteralSize = 8;
9930 break;
9931 }
9932 }
9933 break;
9934 }
9935 }
9936 return HasLiteral ? DescSize + LiteralSize : DescSize;
9937 }
9938
9939 // Check whether we have extra NSA words.
9940 if (isMIMG(MI)) {
9941 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
9942 if (VAddr0Idx < 0)
9943 return 8;
9944
9945 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
9946 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
9947 }
9948
9949 switch (Opc) {
9950 case TargetOpcode::BUNDLE:
9951 return getInstBundleSize(MI);
9952 case TargetOpcode::INLINEASM:
9953 case TargetOpcode::INLINEASM_BR: {
9954 const MachineFunction *MF = MI.getMF();
9955 const char *AsmStr = MI.getOperand(0).getSymbolName();
9956 return getInlineAsmLength(AsmStr, MF->getTarget().getMCAsmInfo(), &ST);
9957 }
9958 default:
9959 if (MI.isMetaInstruction())
9960 return 0;
9961
9962    // If this is a D16 pseudo instruction, get the correct MC code size.
9963 const auto *D16Info = AMDGPU::getT16D16Helper(Opc);
9964 if (D16Info) {
9965      // Assume the d16_lo/hi instructions are always the same size.
9966 unsigned LoInstOpcode = D16Info->LoOp;
9967 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(LoInstOpcode);
9968 DescSize = Desc.getSize();
9969 }
9970
9971    // If this is an FMA pseudo instruction, get the correct MC code size.
9972 if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
9973 // All potential lowerings are the same size; arbitrarily pick one.
9974 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
9975 DescSize = Desc.getSize();
9976 }
9977
9978 return DescSize;
9979 }
9980}
9981
9984 if (MI.isBranch() && ST.hasOffset3fBug())
9985 return InstSizeVerifyMode::NoVerify;
9986 return InstSizeVerifyMode::ExactSize;
9987}
9988
9990 if (!isFLAT(MI))
9991 return false;
9992
9993 if (MI.memoperands_empty())
9994 return true;
9995
9996 for (const MachineMemOperand *MMO : MI.memoperands()) {
9997 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
9998 return true;
9999 }
10000 return false;
10001}
10002
10005 static const std::pair<int, const char *> TargetIndices[] = {
10006 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
10007 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
10008 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
10009 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
10010 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
10011 return ArrayRef(TargetIndices);
10012}
10013
10014/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
10015/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
10018 const ScheduleDAG *DAG) const {
10019 return new GCNHazardRecognizer(DAG->MF);
10020}
10021
10022/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
10023/// pass.
10026 MachineLoopInfo *MLI) const {
10027 return new GCNHazardRecognizer(MF, MLI);
10028}
10029
10030// Called during:
10031// - pre-RA scheduling and post-RA scheduling
10034 const ScheduleDAGMI *DAG) const {
10035 // Borrowed from Arm Target
10036 // We would like to restrict this hazard recognizer to only
10037 // post-RA scheduling; we can tell that we're post-RA because we don't
10038 // track VRegLiveness.
10039 if (!DAG->hasVRegLiveness())
10040 return new GCNHazardRecognizer(DAG->MF);
10042}
10043
10044std::pair<unsigned, unsigned>
10046 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
10047}
10048
10051 static const std::pair<unsigned, const char *> TargetFlags[] = {
10052 {MO_GOTPCREL, "amdgpu-gotprel"},
10053 {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"},
10054 {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"},
10055 {MO_GOTPCREL64, "amdgpu-gotprel64"},
10056 {MO_REL32_LO, "amdgpu-rel32-lo"},
10057 {MO_REL32_HI, "amdgpu-rel32-hi"},
10058 {MO_REL64, "amdgpu-rel64"},
10059 {MO_ABS32_LO, "amdgpu-abs32-lo"},
10060 {MO_ABS32_HI, "amdgpu-abs32-hi"},
10061 {MO_ABS64, "amdgpu-abs64"},
10062 };
10063
10064 return ArrayRef(TargetFlags);
10065}
10066
10069 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10070 {
10071 {MONoClobber, "amdgpu-noclobber"},
10072 {MOLastUse, "amdgpu-last-use"},
10073 {MOCooperative, "amdgpu-cooperative"},
10074 {MOThreadPrivate, "amdgpu-thread-private"},
10075 };
10076
10077 return ArrayRef(TargetFlags);
10078}
10079
10081 const MachineFunction &MF) const {
10083 assert(SrcReg.isVirtual());
10084 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
10085 return AMDGPU::WWM_COPY;
10086
10087 return AMDGPU::COPY;
10088}
10089
10091 uint32_t Opcode = MI.getOpcode();
10092 // Check if it is SGPR spill or wwm-register spill Opcode.
10093 if (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode))
10094 return true;
10095
10096 const MachineFunction *MF = MI.getMF();
10097 const MachineRegisterInfo &MRI = MF->getRegInfo();
10099
10100 // See if this is a live-range split instruction inserted for an SGPR or
10101 // wwm-register. The implicit defs inserted for wwm-registers should also
10102 // be included as they can appear at the beginning of the BB.
10103 bool IsLRSplitInst = MI.getFlag(MachineInstr::LRSplit);
10104 if (!IsLRSplitInst && Opcode != AMDGPU::IMPLICIT_DEF)
10105 return false;
10106
10107 Register Reg = MI.getOperand(0).getReg();
10108 if (RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)))
10109 return IsLRSplitInst;
10110
10111 return MFI->isWWMReg(Reg);
10112}
10113
10115 Register Reg) const {
10116 // We need to handle instructions which may be inserted during register
10117 // allocation to handle the prolog. The initial prolog instruction may have
10118 // been separated from the start of the block by spills and copies
10119 // inserted for the prolog. However, the insertions for scalar registers can
10120 // always be placed at the BB top as they are independent of the exec mask
10121 // value.
10122 bool IsNullOrVectorRegister = true;
10123 if (Reg) {
10124 const MachineFunction *MF = MI.getMF();
10125 const MachineRegisterInfo &MRI = MF->getRegInfo();
10126 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
10127 }
10128
10129 return IsNullOrVectorRegister &&
10130 (canAddToBBProlog(MI) ||
10131 (!MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
10132 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
10133}
10134
10138 const DebugLoc &DL,
10139 Register DestReg) const {
10140 if (ST.hasAddNoCarryInsts())
10141 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
10142
10143 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10144 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
10145 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
10146
10147 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10148 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10149}
10150
10153 const DebugLoc &DL,
10154 Register DestReg,
10155 RegScavenger &RS) const {
10156 if (ST.hasAddNoCarryInsts())
10157 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
10158
10159 // If available, prefer to use vcc.
10160 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
10161 ? Register(RI.getVCC())
10162 : RS.scavengeRegisterBackwards(
10163 *RI.getBoolRC(), I, /* RestoreAfter */ false,
10164 0, /* AllowSpill */ false);
10165
10166 // TODO: Users need to deal with this.
10167 if (!UnusedCarry.isValid())
10168 return MachineInstrBuilder();
10169
10170 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
10171 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
10172}
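// Usage sketch (illustrative, not taken from a caller): the remaining source
// operands are appended to the returned builder, e.g.
//   TII->getAddNoCarry(MBB, I, DL, DestReg, RS).addImm(Offset).addReg(Base);
// plus any trailing operands the selected add opcode requires. Callers of
// this overload must also handle the empty MachineInstrBuilder returned when
// no carry register could be scavenged.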
10173
10174bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
10175 switch (Opcode) {
10176 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
10177 case AMDGPU::SI_KILL_I1_TERMINATOR:
10178 return true;
10179 default:
10180 return false;
10181 }
10182}
10183
10185 switch (Opcode) {
10186 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
10187 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
10188 case AMDGPU::SI_KILL_I1_PSEUDO:
10189 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
10190 default:
10191 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
10192 }
10193}
10194
10195bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
10196 return Imm <= getMaxMUBUFImmOffset(ST);
10197}
10198
10200 // The GFX12 field is a 24-bit signed byte offset; only non-negative values are used.
10201 const unsigned OffsetBits =
10202 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
10203 return (1 << OffsetBits) - 1;
10204}
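// For illustration: with the 12-bit field this returns 4095; with the GFX12
// 24-bit field it returns (1 << 23) - 1 = 8388607.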
10205
10207 if (!ST.isWave32())
10208 return;
10209
10210 if (MI.isInlineAsm())
10211 return;
10212
10213 if (MI.getNumOperands() < MI.getNumExplicitOperands())
10214 return;
10215
10216 for (auto &Op : MI.implicit_operands()) {
10217 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
10218 Op.setReg(AMDGPU::VCC_LO);
10219 }
10220}
10221
10223 if (!isSMRD(MI))
10224 return false;
10225
10226 // Check that it is using a buffer resource.
10227 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
10228 if (Idx == -1) // e.g. s_memtime
10229 return false;
10230
10231 const int16_t RCID = getOpRegClassID(MI.getDesc().operands()[Idx]);
10232 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
10233}
10234
10235// Given Imm, split it into the values to put into the SOffset and ImmOffset
10236// fields in an MUBUF instruction. Return false if it is not possible (due to a
10237// hardware bug needing a workaround).
10238//
10239// The required alignment ensures that individual address components remain
10240// aligned if they are aligned to begin with. It also ensures that additional
10241// offsets within the given alignment can be added to the resulting ImmOffset.
10243 uint32_t &ImmOffset, Align Alignment) const {
10244 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
10245 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
10246 uint32_t Overflow = 0;
10247
10248 if (Imm > MaxImm) {
10249 if (Imm <= MaxImm + 64) {
10250 // Use an SOffset inline constant for 4..64
10251 Overflow = Imm - MaxImm;
10252 Imm = MaxImm;
10253 } else {
10254 // Try to keep the same value in SOffset for adjacent loads, so that
10255 // the corresponding register contents can be re-used.
10256 //
10257 // Load values with all low-bits (except for alignment bits) set into
10258 // SOffset, so that a larger range of values can be covered using
10259 // s_movk_i32.
10260 //
10261 // Atomic operations fail to work correctly when individual address
10262 // components are unaligned, even if their sum is aligned.
10263 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
10264 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
10265 Imm = Low;
10266 Overflow = High - Alignment.value();
10267 }
10268 }
10269
10270 if (Overflow > 0) {
10271 // There is a hardware bug in SI and CI which prevents address clamping in
10272 // MUBUF instructions from working correctly with SOffsets. The immediate
10273 // offset is unaffected.
10274 if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
10275 return false;
10276
10277 // It is not possible to set immediate in SOffset field on some targets.
10278 if (ST.hasRestrictedSOffset())
10279 return false;
10280 }
10281
10282 ImmOffset = Imm;
10283 SOffset = Overflow;
10284 return true;
10285}
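// Worked example (assuming a pre-GFX12 target where getMaxMUBUFImmOffset()
// is 4095 and Alignment is 4, so MaxImm = 4092):
//   Imm = 4100 -> fits within MaxImm + 64, so ImmOffset = 4092, SOffset = 8
//                 (an inline constant).
//   Imm = 5000 -> High = (5000 + 4) & ~4095 = 4096, Low = 5004 & 4095 = 908,
//                 so ImmOffset = 908 and SOffset = 4096 - 4 = 4092;
//                 908 + 4092 == 5000 as required.
// In both cases Overflow is non-zero, so SI/CI and targets with a restricted
// SOffset would return false instead.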
10286
10287// Depending on the used address space and instructions, some immediate offsets
10288// are allowed and some are not.
10289// Pre-GFX12, flat instruction offsets can only be non-negative, global and
10290// scratch instruction offsets can also be negative. On GFX12, offsets can be
10291// negative for all variants.
10292//
10293// There are several bugs related to these offsets:
10294// On gfx10.1, flat instructions that go into the global address space cannot
10295// use an offset.
10296//
10297// For scratch instructions, the address can be either an SGPR or a VGPR.
10298// The following offsets can be used, depending on the architecture (x means
10299// cannot be used):
10300// +----------------------------+------+------+
10301// | Address-Mode | SGPR | VGPR |
10302// +----------------------------+------+------+
10303// | gfx9 | | |
10304// | negative, 4-aligned offset | x | ok |
10305// | negative, unaligned offset | x | ok |
10306// +----------------------------+------+------+
10307// | gfx10 | | |
10308// | negative, 4-aligned offset | ok | ok |
10309// | negative, unaligned offset | ok | x |
10310// +----------------------------+------+------+
10311// | gfx10.3 | | |
10312// | negative, 4-aligned offset | ok | ok |
10313// | negative, unaligned offset | ok | ok |
10314// +----------------------------+------+------+
10315//
10316// This function ignores the addressing mode, so if an offset cannot be used in
10317// one addressing mode, it is considered illegal.
10318bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
10319 uint64_t FlatVariant) const {
10320 // TODO: Should 0 be special cased?
10321 if (!ST.hasFlatInstOffsets())
10322 return false;
10323
10324 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
10325 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
10326 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
10327 return false;
10328
10329 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10330 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
10331 (Offset % 4) != 0) {
10332 return false;
10333 }
10334
10335 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10336 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
10337 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
10338}
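// In other words (illustrative): with N = AMDGPU::getNumFlatOffsetBits(ST),
// the accepted range is [-2^(N-1), 2^(N-1) - 1] when the variant allows
// negative offsets and [0, 2^(N-1) - 1] otherwise.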
10339
10340// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
10341std::pair<int64_t, int64_t>
10342SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
10343 uint64_t FlatVariant) const {
10344 int64_t RemainderOffset = COffsetVal;
10345 int64_t ImmField = 0;
10346
10347 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
10348 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
10349
10350 if (AllowNegative) {
10351 // Use signed division by a power of two to truncate towards 0.
10352 int64_t D = 1LL << NumBits;
10353 RemainderOffset = (COffsetVal / D) * D;
10354 ImmField = COffsetVal - RemainderOffset;
10355
10356 if (ST.hasNegativeUnalignedScratchOffsetBug() &&
10357 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
10358 (ImmField % 4) != 0) {
10359 // Make ImmField a multiple of 4
10360 RemainderOffset += ImmField % 4;
10361 ImmField -= ImmField % 4;
10362 }
10363 } else if (COffsetVal >= 0) {
10364 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
10365 RemainderOffset = COffsetVal - ImmField;
10366 }
10367
10368 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
10369 assert(RemainderOffset + ImmField == COffsetVal);
10370 return {ImmField, RemainderOffset};
10371}
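// Worked example (assuming a variant with 13 offset bits that allows
// negative offsets, so NumBits = 12 and the truncating divisor is 4096):
//   COffsetVal = 9000  -> {ImmField = 808,  RemainderOffset = 8192}
//   COffsetVal = -9000 -> {ImmField = -808, RemainderOffset = -8192}
// RemainderOffset still has to be folded into the address computation; only
// ImmField goes into the instruction's offset field.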
10372
10374 if (ST.hasNegativeScratchOffsetBug() &&
10375 FlatVariant == SIInstrFlags::FlatScratch)
10376 return false;
10377
10378 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
10379}
10380
10381static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
10382 switch (ST.getGeneration()) {
10383 default:
10384 break;
10387 return SIEncodingFamily::SI;
10390 return SIEncodingFamily::VI;
10394 return ST.hasGFX11_7Insts() ? SIEncodingFamily::GFX1170
10397 return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
10401 }
10402 llvm_unreachable("Unknown subtarget generation!");
10403}
10404
10405bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
10406 switch(MCOp) {
10407 // These opcodes use indirect register addressing so
10408 // they need special handling by codegen (currently missing).
10409 // Therefore it is too risky to allow these opcodes
10410 // to be selected by dpp combiner or sdwa peepholer.
10411 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
10412 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
10413 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
10414 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
10415 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
10416 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
10417 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
10418 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
10419 return true;
10420 default:
10421 return false;
10422 }
10423}
10424
10425#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
10426 case OPCODE##_dpp: \
10427 case OPCODE##_e32: \
10428 case OPCODE##_e64: \
10429 case OPCODE##_e64_dpp: \
10430 case OPCODE##_sdwa:
10431
10432static bool isRenamedInGFX9(int Opcode) {
10433 switch (Opcode) {
10434 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
10435 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
10436 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
10437 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
10438 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
10439 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
10440 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
10441 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
10442 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
10443 //
10444 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
10445 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
10446 case AMDGPU::V_FMA_F16_gfx9_e64:
10447 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
10448 case AMDGPU::V_INTERP_P2_F16:
10449 case AMDGPU::V_MAD_F16_e64:
10450 case AMDGPU::V_MAD_U16_e64:
10451 case AMDGPU::V_MAD_I16_e64:
10452 return true;
10453 default:
10454 return false;
10455 }
10456}
10457
10458int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
10459 assert(Opcode == (int)SIInstrInfo::getNonSoftWaitcntOpcode(Opcode) &&
10460 "SIInsertWaitcnts should have promoted soft waitcnt instructions!");
10461
10462 unsigned Gen = subtargetEncodingFamily(ST);
10463
10464 if (ST.getGeneration() == AMDGPUSubtarget::GFX9 && isRenamedInGFX9(Opcode))
10466
10467 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
10468 // subtarget has UnpackedD16VMem feature.
10469 // TODO: remove this when we discard GFX80 encoding.
10470 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
10472
10473 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
10474 switch (ST.getGeneration()) {
10475 default:
10477 break;
10480 break;
10483 break;
10484 }
10485 }
10486
10487 if (isMAI(Opcode)) {
10488 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
10489 if (MFMAOp != -1)
10490 Opcode = MFMAOp;
10491 }
10492
10493 int32_t MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
10494
10495 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX11_7Insts())
10497
10498 if (MCOp == AMDGPU::INSTRUCTION_LIST_END && ST.hasGFX1250Insts())
10500
10501 // -1 means that Opcode is already a native instruction.
10502 if (MCOp == -1)
10503 return Opcode;
10504
10505 if (ST.hasGFX90AInsts()) {
10506 uint32_t NMCOp = AMDGPU::INSTRUCTION_LIST_END;
10507 if (ST.hasGFX940Insts())
10509 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10511 if (NMCOp == AMDGPU::INSTRUCTION_LIST_END)
10513 if (NMCOp != AMDGPU::INSTRUCTION_LIST_END)
10514 MCOp = NMCOp;
10515 }
10516
10517 // INSTRUCTION_LIST_END means that Opcode is a pseudo instruction that has no
10518 // encoding in the given subtarget generation.
10519 if (MCOp == AMDGPU::INSTRUCTION_LIST_END)
10520 return -1;
10521
10522 if (isAsmOnlyOpcode(MCOp))
10523 return -1;
10524
10525 return MCOp;
10526}
10527
10528static
10530 assert(RegOpnd.isReg());
10531 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
10532 getRegSubRegPair(RegOpnd);
10533}
10534
10537 assert(MI.isRegSequence());
10538 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
10539 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
10540 auto &RegOp = MI.getOperand(1 + 2 * I);
10541 return getRegOrUndef(RegOp);
10542 }
10544}
10545
10546// Try to find the definition of reg:subreg in subreg-manipulation pseudos
10547// Following a subreg of reg:subreg isn't supported
10550 if (!RSR.SubReg)
10551 return false;
10552 switch (MI.getOpcode()) {
10553 default: break;
10554 case AMDGPU::REG_SEQUENCE:
10555 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
10556 return true;
10557 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
10558 case AMDGPU::INSERT_SUBREG:
10559 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
10560 // inserted the subreg we're looking for
10561 RSR = getRegOrUndef(MI.getOperand(2));
10562 else { // the subreg in the rest of the reg
10563 auto R1 = getRegOrUndef(MI.getOperand(1));
10564 if (R1.SubReg) // subreg of subreg isn't supported
10565 return false;
10566 RSR.Reg = R1.Reg;
10567 }
10568 return true;
10569 }
10570 return false;
10571}
10572
10574 const MachineRegisterInfo &MRI) {
10575 assert(MRI.isSSA());
10576 if (!P.Reg.isVirtual())
10577 return nullptr;
10578
10579 auto RSR = P;
10580 auto *DefInst = MRI.getVRegDef(RSR.Reg);
10581 while (auto *MI = DefInst) {
10582 DefInst = nullptr;
10583 switch (MI->getOpcode()) {
10584 case AMDGPU::COPY:
10585 case AMDGPU::V_MOV_B32_e32: {
10586 auto &Op1 = MI->getOperand(1);
10587 if (Op1.isReg() && Op1.getReg().isVirtual()) {
10588 if (Op1.isUndef())
10589 return nullptr;
10590 RSR = getRegSubRegPair(Op1);
10591 DefInst = MRI.getVRegDef(RSR.Reg);
10592 }
10593 break;
10594 }
10595 default:
10596 if (followSubRegDef(*MI, RSR)) {
10597 if (!RSR.Reg)
10598 return nullptr;
10599 DefInst = MRI.getVRegDef(RSR.Reg);
10600 }
10601 }
10602 if (!DefInst)
10603 return MI;
10604 }
10605 return nullptr;
10606}
10607
10609 Register VReg,
10610 const MachineInstr &DefMI,
10611 const MachineInstr &UseMI) {
10612 assert(MRI.isSSA() && "Must be run on SSA");
10613
10614 auto *TRI = MRI.getTargetRegisterInfo();
10615 auto *DefBB = DefMI.getParent();
10616
10617 // Don't bother searching between blocks, although it is possible this block
10618 // doesn't modify exec.
10619 if (UseMI.getParent() != DefBB)
10620 return true;
10621
10622 const int MaxInstScan = 20;
10623 int NumInst = 0;
10624
10625 // Stop scan at the use.
10626 auto E = UseMI.getIterator();
10627 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
10628 if (I->isDebugInstr())
10629 continue;
10630
10631 if (++NumInst > MaxInstScan)
10632 return true;
10633
10634 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
10635 return true;
10636 }
10637
10638 return false;
10639}
10640
10642 Register VReg,
10643 const MachineInstr &DefMI) {
10644 assert(MRI.isSSA() && "Must be run on SSA");
10645
10646 auto *TRI = MRI.getTargetRegisterInfo();
10647 auto *DefBB = DefMI.getParent();
10648
10649 const int MaxUseScan = 10;
10650 int NumUse = 0;
10651
10652 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
10653 auto &UseInst = *Use.getParent();
10654 // Don't bother searching between blocks, although it is possible this block
10655 // doesn't modify exec.
10656 if (UseInst.getParent() != DefBB || UseInst.isPHI())
10657 return true;
10658
10659 if (++NumUse > MaxUseScan)
10660 return true;
10661 }
10662
10663 if (NumUse == 0)
10664 return false;
10665
10666 const int MaxInstScan = 20;
10667 int NumInst = 0;
10668
10669 // Stop scan when we have seen all the uses.
10670 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
10671 assert(I != DefBB->end());
10672
10673 if (I->isDebugInstr())
10674 continue;
10675
10676 if (++NumInst > MaxInstScan)
10677 return true;
10678
10679 for (const MachineOperand &Op : I->operands()) {
10680 // We don't check reg masks here as they're used only on calls:
10681 // 1. EXEC is only considered const within one BB
10682 // 2. Call should be a terminator instruction if present in a BB
10683
10684 if (!Op.isReg())
10685 continue;
10686
10687 Register Reg = Op.getReg();
10688 if (Op.isUse()) {
10689 if (Reg == VReg && --NumUse == 0)
10690 return false;
10691 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
10692 return true;
10693 }
10694 }
10695}
10696
10699 const DebugLoc &DL, Register Src, Register Dst) const {
10700 auto Cur = MBB.begin();
10701 if (Cur != MBB.end())
10702 do {
10703 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
10704 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
10705 ++Cur;
10706 } while (Cur != MBB.end() && Cur != LastPHIIt);
10707
10708 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
10709 Dst);
10710}
10711
10714 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
10715 if (InsPt != MBB.end() &&
10716 (InsPt->getOpcode() == AMDGPU::SI_IF ||
10717 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
10718 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
10719 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
10720 InsPt++;
10721 return BuildMI(MBB, InsPt, DL,
10722 get(AMDGPU::LaneMaskConstants::get(ST).MovTermOpc), Dst)
10723 .addReg(Src, {}, SrcSubReg)
10724 .addReg(AMDGPU::EXEC, RegState::Implicit);
10725 }
10726 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
10727 Dst);
10728}
10729
10730bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
10731
10733 const MachineInstr &SecondMI) const {
10734 for (const auto &Use : SecondMI.all_uses()) {
10735 if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), &RI))
10736 return true;
10737 }
10738 return false;
10739}
10740
10741/// If OpX is multicycle, anti-dependencies are not allowed.
10742/// isDPMACCInstruction was not designed for VOPD, but it is fit for the
10743/// purpose.
10745 const MachineInstr &OpX) const {
10747}
10748
10751 ArrayRef<unsigned> Ops, int FrameIndex,
10752 MachineInstr *&CopyMI, LiveIntervals *LIS,
10753 VirtRegMap *VRM) const {
10754 // This is a bit of a hack (copied from AArch64). Consider this instruction:
10755 //
10756 // %0:sreg_32 = COPY $m0
10757 //
10758 // We explicitly chose SReg_32 for the virtual register so such a copy might
10759 // be eliminated by RegisterCoalescer. However, that may not be possible, and
10760 // %0 may even spill. We can't spill $m0 normally (it would require copying to
10761 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
10762 // TargetInstrInfo::foldMemoryOperand() is going to try.
10763 // A similar issue also exists with spilling and reloading $exec registers.
10764 //
10765 // To prevent that, constrain the %0 register class here.
10766 if (isFullCopyInstr(MI)) {
10767 Register DstReg = MI.getOperand(0).getReg();
10768 Register SrcReg = MI.getOperand(1).getReg();
10769 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
10770 (DstReg.isVirtual() != SrcReg.isVirtual())) {
10771 MachineRegisterInfo &MRI = MF.getRegInfo();
10772 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
10773 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
10774 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
10775 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
10776 return nullptr;
10777 }
10778 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
10779 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
10780 return nullptr;
10781 }
10782 }
10783 }
10784
10785 return nullptr;
10786}
10787
10789 const MachineInstr &MI,
10790 unsigned *PredCost) const {
10791 if (MI.isBundle()) {
10793 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
10794 unsigned Lat = 0, Count = 0;
10795 for (++I; I != E && I->isBundledWithPred(); ++I) {
10796 ++Count;
10797 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
10798 }
10799 return Lat + Count - 1;
10800 }
10801
10802 return SchedModel.computeInstrLatency(&MI);
10803}
10804
10805const MachineOperand &
10807 if (const MachineOperand *CallAddrOp =
10808 getNamedOperand(MI, AMDGPU::OpName::src0))
10809 return *CallAddrOp;
10811}
10812
10815 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10816 unsigned Opcode = MI.getOpcode();
10817
10818 auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
10819 Register Dst = MI.getOperand(0).getReg();
10820 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
10821 : MI.getOperand(1).getReg();
10822 LLT DstTy = MRI.getType(Dst);
10823 LLT SrcTy = MRI.getType(Src);
10824 unsigned DstAS = DstTy.getAddressSpace();
10825 unsigned SrcAS = SrcTy.getAddressSpace();
10826 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
10827 DstAS == AMDGPUAS::FLAT_ADDRESS &&
10828 ST.hasGloballyAddressableScratch()
10831 };
10832
10833 // If the target supports globally addressable scratch, the mapping from
10834 // scratch memory to the flat aperture changes therefore an address space cast
10835 // is no longer uniform.
10836 if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
10837 return HandleAddrSpaceCast(MI);
10838
10839 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
10840 auto IID = GI->getIntrinsicID();
10845
10846 switch (IID) {
10847 case Intrinsic::amdgcn_addrspacecast_nonnull:
10848 return HandleAddrSpaceCast(MI);
10849 case Intrinsic::amdgcn_if:
10850 case Intrinsic::amdgcn_else:
10851 // FIXME: Uniform if second result
10852 break;
10853 }
10854
10856 }
10857
10858 // Loads from the private and flat address spaces are divergent, because
10859 // threads can execute the load instruction with the same inputs and get
10860 // different results.
10861 //
10862 // All other loads are not divergent, because if threads issue loads with the
10863 // same arguments, they will always get the same result.
10864 if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
10865 Opcode == AMDGPU::G_SEXTLOAD) {
10866 if (MI.memoperands_empty())
10867 return ValueUniformity::NeverUniform; // conservative assumption
10868
10869 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10870 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10871 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10872 })) {
10873 // At least one MMO in a non-global address space.
10875 }
10877 }
10878
10879 if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
10880 Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
10881 Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
10882 AMDGPU::isGenericAtomic(Opcode)) {
10884 }
10886}
10887
10889 if (!Formatter)
10890 Formatter = std::make_unique<AMDGPUMIRFormatter>(ST);
10891 return Formatter.get();
10892}
10893
10895
10896 if (isNeverUniform(MI))
10898
10899 unsigned opcode = MI.getOpcode();
10900 if (opcode == AMDGPU::V_READLANE_B32 ||
10901 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
10902 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
10904
10905 if (isCopyInstr(MI)) {
10906 const MachineOperand &srcOp = MI.getOperand(1);
10907 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
10908 const TargetRegisterClass *regClass =
10909 RI.getPhysRegBaseClass(srcOp.getReg());
10910 return RI.isSGPRClass(regClass) ? ValueUniformity::AlwaysUniform
10912 }
10914 }
10915
10916 // GMIR handling
10917 if (MI.isPreISelOpcode())
10919
10920 // Atomics are divergent because they are executed sequentially: when an
10921 // atomic operation refers to the same address in each thread, then each
10922 // thread after the first sees the value written by the previous thread as
10923 // its original value.
10924
10925 if (isAtomic(MI))
10927
10928 // Loads from the private and flat address spaces are divergent, because
10929 // threads can execute the load instruction with the same inputs and get
10930 // different results.
10931 if (isFLAT(MI) && MI.mayLoad()) {
10932 if (MI.memoperands_empty())
10933 return ValueUniformity::NeverUniform; // conservative assumption
10934
10935 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
10936 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
10937 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
10938 })) {
10939 // At least one MMO in a non-global address space.
10941 }
10942
10944 }
10945
10946 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
10947 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
10948
10949 // FIXME: It's conceptually broken to report this for an instruction, and not
10950 // a specific def operand. For inline asm in particular, there could be mixed
10951 // uniform and divergent results.
10952 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
10953 const MachineOperand &SrcOp = MI.getOperand(I);
10954 if (!SrcOp.isReg())
10955 continue;
10956
10957 Register Reg = SrcOp.getReg();
10958 if (!Reg || !SrcOp.readsReg())
10959 continue;
10960
10961 // If RegBank is null, this is unassigned or an unallocatable special
10962 // register, which are all scalars.
10963 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
10964 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
10966 }
10967
10968 // TODO: The uniformity check conditions above can be rearranged for more
10969 // readability.
10970
10971 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
10972 // currently turned into no-op COPYs by SelectionDAG ISel and are
10973 // therefore no longer recognizable.
10974
10976}
10977
10979 switch (MF.getFunction().getCallingConv()) {
10981 return 1;
10983 return 2;
10985 return 3;
10989 const Function &F = MF.getFunction();
10990 F.getContext().diagnose(DiagnosticInfoUnsupported(
10991 F, "ds_ordered_count unsupported for this calling conv"));
10992 [[fallthrough]];
10993 }
10996 case CallingConv::C:
10997 case CallingConv::Fast:
10998 default:
10999 // Assume other calling conventions are various compute callable functions
11000 return 0;
11001 }
11002}
11003
11005 Register &SrcReg2, int64_t &CmpMask,
11006 int64_t &CmpValue) const {
11007 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
11008 return false;
11009
11010 switch (MI.getOpcode()) {
11011 default:
11012 break;
11013 case AMDGPU::S_CMP_EQ_U32:
11014 case AMDGPU::S_CMP_EQ_I32:
11015 case AMDGPU::S_CMP_LG_U32:
11016 case AMDGPU::S_CMP_LG_I32:
11017 case AMDGPU::S_CMP_LT_U32:
11018 case AMDGPU::S_CMP_LT_I32:
11019 case AMDGPU::S_CMP_GT_U32:
11020 case AMDGPU::S_CMP_GT_I32:
11021 case AMDGPU::S_CMP_LE_U32:
11022 case AMDGPU::S_CMP_LE_I32:
11023 case AMDGPU::S_CMP_GE_U32:
11024 case AMDGPU::S_CMP_GE_I32:
11025 case AMDGPU::S_CMP_EQ_U64:
11026 case AMDGPU::S_CMP_LG_U64:
11027 SrcReg = MI.getOperand(0).getReg();
11028 if (MI.getOperand(1).isReg()) {
11029 if (MI.getOperand(1).getSubReg())
11030 return false;
11031 SrcReg2 = MI.getOperand(1).getReg();
11032 CmpValue = 0;
11033 } else if (MI.getOperand(1).isImm()) {
11034 SrcReg2 = Register();
11035 CmpValue = MI.getOperand(1).getImm();
11036 } else {
11037 return false;
11038 }
11039 CmpMask = ~0;
11040 return true;
11041 case AMDGPU::S_CMPK_EQ_U32:
11042 case AMDGPU::S_CMPK_EQ_I32:
11043 case AMDGPU::S_CMPK_LG_U32:
11044 case AMDGPU::S_CMPK_LG_I32:
11045 case AMDGPU::S_CMPK_LT_U32:
11046 case AMDGPU::S_CMPK_LT_I32:
11047 case AMDGPU::S_CMPK_GT_U32:
11048 case AMDGPU::S_CMPK_GT_I32:
11049 case AMDGPU::S_CMPK_LE_U32:
11050 case AMDGPU::S_CMPK_LE_I32:
11051 case AMDGPU::S_CMPK_GE_U32:
11052 case AMDGPU::S_CMPK_GE_I32:
11053 SrcReg = MI.getOperand(0).getReg();
11054 SrcReg2 = Register();
11055 CmpValue = MI.getOperand(1).getImm();
11056 CmpMask = ~0;
11057 return true;
11058 }
11059
11060 return false;
11061}
11062
11064 for (MachineBasicBlock *S : MBB->successors()) {
11065 if (S->isLiveIn(AMDGPU::SCC))
11066 return false;
11067 }
11068 return true;
11069}
11070
11071// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
11072// (incoming SCC) = !(SCC defined by SCCDef).
11073// Return true if all uses can be re-written, false otherwise.
11074bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
11075 MachineBasicBlock *MBB = SCCDef->getParent();
11076 SmallVector<MachineInstr *> InvertInstr;
11077 bool SCCIsDead = false;
11078
11079 // Scan instructions for SCC uses that need to be inverted until SCC is dead.
11080 constexpr unsigned ScanLimit = 12;
11081 unsigned Count = 0;
11082 for (MachineInstr &MI :
11083 make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
11084 if (++Count > ScanLimit)
11085 return false;
11086 if (MI.readsRegister(AMDGPU::SCC, &RI)) {
11087 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
11088 MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
11089 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11090 MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
11091 InvertInstr.push_back(&MI);
11092 else
11093 return false;
11094 }
11095 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
11096 SCCIsDead = true;
11097 break;
11098 }
11099 }
11100 if (!SCCIsDead && isSCCDeadOnExit(MBB))
11101 SCCIsDead = true;
11102
11103 // SCC may have more uses. Can't invert all of them.
11104 if (!SCCIsDead)
11105 return false;
11106
11107 // Invert uses
11108 for (MachineInstr *MI : InvertInstr) {
11109 if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
11110 MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
11111 swapOperands(*MI);
11112 } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
11113 MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
11114 MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
11115 ? AMDGPU::S_CBRANCH_SCC1
11116 : AMDGPU::S_CBRANCH_SCC0));
11117 } else {
11118 llvm_unreachable("SCC used but no inversion handling");
11119 }
11120 }
11121 return true;
11122}
11123
11124// SCC is already valid after SCCValid.
11125// SCCRedefine will redefine SCC to the same value already available after
11126// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
11127// update kill/dead flags if necessary.
11128bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
11129 bool NeedInversion) const {
11130 MachineInstr *KillsSCC = nullptr;
11131 if (SCCValid->getParent() != SCCRedefine->getParent())
11132 return false;
11133 for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
11134 SCCRedefine->getIterator())) {
11135 if (MI.modifiesRegister(AMDGPU::SCC, &RI))
11136 return false;
11137 if (MI.killsRegister(AMDGPU::SCC, &RI))
11138 KillsSCC = &MI;
11139 }
11140 if (NeedInversion && !invertSCCUse(SCCRedefine))
11141 return false;
11142 if (MachineOperand *SccDef =
11143 SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
11144 SccDef->setIsDead(false);
11145 if (KillsSCC)
11146 KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
11147 SCCRedefine->eraseFromParent();
11148 return true;
11149}
11150
11151static bool foldableSelect(const MachineInstr &Def) {
11152 if (Def.getOpcode() != AMDGPU::S_CSELECT_B32 &&
11153 Def.getOpcode() != AMDGPU::S_CSELECT_B64)
11154 return false;
11155 bool Op1IsNonZeroImm =
11156 Def.getOperand(1).isImm() && Def.getOperand(1).getImm() != 0;
11157 bool Op2IsZeroImm =
11158 Def.getOperand(2).isImm() && Def.getOperand(2).getImm() == 0;
11159 if (!Op1IsNonZeroImm || !Op2IsZeroImm)
11160 return false;
11161 return true;
11162}
11163
11164static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion,
11165 unsigned &NewDefOpc) {
11166 // S_ADD_U32 X, 1 sets SCC on carryout which can only happen if result==0.
11167 // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
11168 if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
11169 Def.getOpcode() != AMDGPU::S_ADD_U32)
11170 return false;
11171 const MachineOperand &AddSrc1 = Def.getOperand(1);
11172 const MachineOperand &AddSrc2 = Def.getOperand(2);
11173 int64_t addend;
11174
11175 if ((!AddSrc1.isImm() || AddSrc1.getImm() != 1) &&
11176 (!AddSrc2.isImm() || AddSrc2.getImm() != 1) &&
11177 (!getFoldableImm(&AddSrc1, addend) || addend != 1) &&
11178 (!getFoldableImm(&AddSrc2, addend) || addend != 1))
11179 return false;
11180
11181 if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
11182 const MachineOperand *SccDef =
11183 Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
11184 if (!SccDef->isDead())
11185 return false;
11186 NewDefOpc = AMDGPU::S_ADD_U32;
11187 }
11188 NeedInversion = !NeedInversion;
11189 return true;
11190}
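// Illustrative consequence of the above: after
//   %r = S_ADD_U32 %x, 1        ; carry-out, i.e. SCC, is set iff %r == 0
// a following S_CMP_EQ_U32 %r, 0 computes the same SCC value and can simply
// be removed, while S_CMP_LG_U32 %r, 0 computes the opposite value, so its
// SCC users must be inverted first (hence NeedInversion is flipped here).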
11191
11193 Register SrcReg2, int64_t CmpMask,
11194 int64_t CmpValue,
11195 const MachineRegisterInfo *MRI) const {
11196 if (!SrcReg || SrcReg.isPhysical())
11197 return false;
11198
11199 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
11200 return false;
11201
11202 const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
11203 this](bool NeedInversion) -> bool {
11204 if (CmpValue != 0)
11205 return false;
11206
11207 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11208 if (!Def)
11209 return false;
11210
11211 // For S_OP that set SCC = DST!=0, do the transformation
11212 //
11213 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11214 //
11215 // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
11216 // do the transformation:
11217 //
11218 // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
11219 //
11220 // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
11221 // for S_CSELECT* already has the same value that will be calculated by
11222 // s_cmp_lg_*
11223 //
11224 // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
11225 // (non-zero imm), 0)
11226
11227 unsigned NewDefOpc = Def->getOpcode();
11228 if (!setsSCCIfResultIsNonZero(*Def) &&
11229 !setsSCCIfResultIsZero(*Def, NeedInversion, NewDefOpc) &&
11230 !foldableSelect(*Def))
11231 return false;
11232
11233 if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
11234 return false;
11235
11236 if (NewDefOpc != Def->getOpcode())
11237 Def->setDesc(get(NewDefOpc));
11238
11239 // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
11240 // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
11241 // 64-bit foldableSelect then delete s_or_b32 in the sequence:
11242 // sX = s_cselect_b64 (non-zero imm), 0
11243 // sLo = copy sX.sub0
11244 // sHi = copy sX.sub1
11245 // sY = s_or_b32 sLo, sHi
11246 if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
11247 MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
11248 const MachineOperand &OrOpnd1 = Def->getOperand(1);
11249 const MachineOperand &OrOpnd2 = Def->getOperand(2);
11250 if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
11251 MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
11252 MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
11253 if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
11254 Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
11255 Def2->getOperand(1).isReg() &&
11256 Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
11257 Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
11258 Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
11259 MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
11260 if (Select && foldableSelect(*Select))
11261 optimizeSCC(Select, Def, /*NeedInversion=*/false);
11262 }
11263 }
11264 }
11265 return true;
11266 };
11267
11268 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
11269 this](int64_t ExpectedValue, unsigned SrcSize,
11270 bool IsReversible, bool IsSigned) -> bool {
11271 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11272 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11273 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11274 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
11275 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
11276 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11277 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11278 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11279 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
11280 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
11281 //
11282 // Signed ge/gt are not used for the sign bit.
11283 //
11284 // If result of the AND is unused except in the compare:
11285 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
11286 //
11287 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11288 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
11289 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
11290 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11291 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
11292 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
11293
11294 MachineInstr *Def = MRI->getVRegDef(SrcReg);
11295 if (!Def)
11296 return false;
11297
11298 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
11299 Def->getOpcode() != AMDGPU::S_AND_B64)
11300 return false;
11301
11302 int64_t Mask;
11303 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
11304 if (MO->isImm())
11305 Mask = MO->getImm();
11306 else if (!getFoldableImm(MO, Mask))
11307 return false;
11308 Mask &= maxUIntN(SrcSize);
11309 return isPowerOf2_64(Mask);
11310 };
11311
11312 MachineOperand *SrcOp = &Def->getOperand(1);
11313 if (isMask(SrcOp))
11314 SrcOp = &Def->getOperand(2);
11315 else if (isMask(&Def->getOperand(2)))
11316 SrcOp = &Def->getOperand(1);
11317 else
11318 return false;
11319
11320 // A valid Mask is required to have a single bit set, hence a non-zero and
11321 // power-of-two value. This guarantees the shift below is by fewer than 64 bits.
11322 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
11323 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
11324 if (IsSigned && BitNo == SrcSize - 1)
11325 return false;
11326
11327 ExpectedValue <<= BitNo;
11328
11329 bool IsReversedCC = false;
11330 if (CmpValue != ExpectedValue) {
11331 if (!IsReversible)
11332 return false;
11333 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
11334 if (!IsReversedCC)
11335 return false;
11336 }
11337
11338 Register DefReg = Def->getOperand(0).getReg();
11339 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
11340 return false;
11341
11342 if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
11343 return false;
11344
11345 if (!MRI->use_nodbg_empty(DefReg)) {
11346 assert(!IsReversedCC);
11347 return true;
11348 }
11349
11350 // Replace AND with unused result with a S_BITCMP.
11351 MachineBasicBlock *MBB = Def->getParent();
11352
11353 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
11354 : AMDGPU::S_BITCMP1_B32
11355 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
11356 : AMDGPU::S_BITCMP1_B64;
11357
11358 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
11359 .add(*SrcOp)
11360 .addImm(BitNo);
11361 Def->eraseFromParent();
11362
11363 return true;
11364 };
11365
11366 switch (CmpInstr.getOpcode()) {
11367 default:
11368 break;
11369 case AMDGPU::S_CMP_EQ_U32:
11370 case AMDGPU::S_CMP_EQ_I32:
11371 case AMDGPU::S_CMPK_EQ_U32:
11372 case AMDGPU::S_CMPK_EQ_I32:
11373 return optimizeCmpAnd(1, 32, true, false) ||
11374 optimizeCmpSelect(/*NeedInversion=*/true);
11375 case AMDGPU::S_CMP_GE_U32:
11376 case AMDGPU::S_CMPK_GE_U32:
11377 return optimizeCmpAnd(1, 32, false, false);
11378 case AMDGPU::S_CMP_GE_I32:
11379 case AMDGPU::S_CMPK_GE_I32:
11380 return optimizeCmpAnd(1, 32, false, true);
11381 case AMDGPU::S_CMP_EQ_U64:
11382 return optimizeCmpAnd(1, 64, true, false);
11383 case AMDGPU::S_CMP_LG_U32:
11384 case AMDGPU::S_CMP_LG_I32:
11385 case AMDGPU::S_CMPK_LG_U32:
11386 case AMDGPU::S_CMPK_LG_I32:
11387 return optimizeCmpAnd(0, 32, true, false) ||
11388 optimizeCmpSelect(/*NeedInversion=*/false);
11389 case AMDGPU::S_CMP_GT_U32:
11390 case AMDGPU::S_CMPK_GT_U32:
11391 return optimizeCmpAnd(0, 32, false, false);
11392 case AMDGPU::S_CMP_GT_I32:
11393 case AMDGPU::S_CMPK_GT_I32:
11394 return optimizeCmpAnd(0, 32, false, true);
11395 case AMDGPU::S_CMP_LG_U64:
11396 return optimizeCmpAnd(0, 64, true, false) ||
11397 optimizeCmpSelect(/*NeedInversion=*/false);
11398 }
11399
11400 return false;
11401}
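// End-to-end sketch of the optimizeCmpAnd path above (illustrative MIR):
//   %d = S_AND_B32 %src, 4              ; tests bit 2
//   S_CMP_LG_U32 %d, 0
//   S_CBRANCH_SCC1 %bb.then
// becomes, when %d has no other uses,
//   S_BITCMP1_B32 %src, 2
//   S_CBRANCH_SCC1 %bb.then
// If %d is still used elsewhere, only the compare is deleted and the
// S_AND_B32 keeps defining both %d and SCC.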
11402
11404 AMDGPU::OpName OpName) const {
11405 if (!ST.needsAlignedVGPRs())
11406 return;
11407
11408 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
11409 if (OpNo < 0)
11410 return;
11411 MachineOperand &Op = MI.getOperand(OpNo);
11412 if (getOpSize(MI, OpNo) > 4)
11413 return;
11414
11415 // Add implicit aligned super-reg to force alignment on the data operand.
11416 const DebugLoc &DL = MI.getDebugLoc();
11417 MachineBasicBlock *BB = MI.getParent();
11419 Register DataReg = Op.getReg();
11420 bool IsAGPR = RI.isAGPR(MRI, DataReg);
11422 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
11423 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
11424 Register NewVR =
11425 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
11426 : &AMDGPU::VReg_64_Align2RegClass);
11427 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
11428 .addReg(DataReg, {}, Op.getSubReg())
11429 .addImm(AMDGPU::sub0)
11430 .addReg(Undef)
11431 .addImm(AMDGPU::sub1);
11432 Op.setReg(NewVR);
11433 Op.setSubReg(AMDGPU::sub0);
11434 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
11435}
11436
11438 if (isIGLP(*MI))
11439 return false;
11440
11442}
11443
11445 if (!isWMMA(MI) && !isSWMMAC(MI))
11446 return false;
11447
11448 if (ST.hasGFX1250Insts())
11449 return AMDGPU::getWMMAIsXDL(MI.getOpcode());
11450
11451 return true;
11452}
11453
11455 unsigned Opcode = MI.getOpcode();
11456
11457 if (AMDGPU::isGFX12Plus(ST))
11458 return isDOT(MI) || isXDLWMMA(MI);
11459
11460 if (!isMAI(MI) || isDGEMM(Opcode) ||
11461 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
11462 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
11463 return false;
11464
11465 if (!ST.hasGFX940Insts())
11466 return true;
11467
11468 return AMDGPU::getMAIIsGFX940XDL(Opcode);
11469}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isUndef(const MachineInstr &MI)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={})
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static MachineBasicBlock * generateWaterFallLoop(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr, ArrayRef< Register > PhySGPRs={})
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static bool setsSCCIfResultIsZero(const MachineInstr &Def, bool &NeedInversion, unsigned &NewDefOpc)
static bool isSCCDeadOnExit(MachineBasicBlock *MBB)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static constexpr AMDGPU::OpName ModifierOpNames[]
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static bool foldableSelect(const MachineInstr &Def)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, AMDGPU::OpName OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
static bool canRemat(const MachineInstr &MI)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
#define LLVM_DEBUG(...)
Definition Debug.h:114
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static LLVM_ABI Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition APFloat.cpp:145
Class for arbitrary precision integers.
Definition APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
uint64_t getZExtValue() const
A debug info location.
Definition DebugLoc.h:123
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
constexpr unsigned getAddressSpace() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
static LocationSize precise(uint64_t Value)
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:418
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
unsigned getSize() const
Return the number of bytes in the encoding of this instruction, or zero if the encoding size cannot b...
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
unsigned getOpcode() const
Return the opcode number for this descriptor.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition MCInstrDesc.h:92
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
LLVM_ABI void setVariableValue(const MCExpr *Value)
Definition MCSymbol.cpp:50
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
MIRFormatter - Interface to format MIR operands based on target.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
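A minimal sketch (not code from this file) of the liveness query above: check that SCC is dead just before an insertion point so an SCC-clobbering SALU instruction can be placed there. MBB, I and TRI are assumed to be in scope.
  bool SCCIsDead =
      MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, I) ==
      MachineBasicBlock::LQR_Dead;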
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
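A minimal sketch (not code from this file) of the builder chain above: materialize a constant into DstReg. MBB, I, DL, TII and DstReg are assumed to be in scope; AMDGPU::S_MOV_B32 is a scalar move opcode.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(0x1234);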
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool isBundle() const
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully or partially defines) the specified register.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mop_range explicit_operands()
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
bool isMoveImmediate(QueryType Type=IgnoreBundle) const
Return true if this instruction is a move immediate (including conditional moves) instruction.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
LLVM_ABI void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
LLVM_ABI void clearRegisterKills(Register Reg, const TargetRegisterInfo *RegInfo)
Clear all kill flags affecting Reg.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
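A hedged sketch of operand rewriting with the mutators above: replace a register use with a known constant. MO and ImmVal are assumed to be in scope, and MO is assumed to be a non-tied use operand.
  if (MO.isReg() && !MO.isDef())
    MO.ChangeToImmediate(ImmVal);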
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI void moveOperands(MachineOperand *Dst, MachineOperand *Src, unsigned NumOps)
Move NumOps operands from Src to Dst, updating use-def lists as needed.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
LLVM_ABI void clearVirtRegs()
clearVirtRegs - Remove all virtual registers (after physreg assignment).
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
iterator_range< use_iterator > use_operands(Register Reg) const
LLVM_ABI void removeRegOperandFromUseList(MachineOperand *MO)
Remove MO from its use-def list.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI void addRegOperandToUseList(MachineOperand *MO)
Add MO to the linked list of operands for its register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
bool canAddToBBProlog(const MachineInstr &MI) const
static bool isDS(const MachineInstr &MI)
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
bool isXDLWMMA(const MachineInstr &MI) const
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
bool isSpill(uint32_t Opcode) const
uint64_t getDefaultRsrcDataFormat() const
static bool isSOPP(const MachineInstr &MI)
bool mayAccessScratch(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
static bool isFLATScratch(const MachineInstr &MI)
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI)
unsigned getOpSize(uint32_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo for the given opcode.
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isMTBUF(const MachineInstr &MI)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool setsSCCIfResultIsNonZero(const MachineInstr &MI)
const MIRFormatter * getMIRFormatter() const override
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
static bool isVOP2(const MachineInstr &MI)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
MachineInstr * getWholeWaveFunctionSetup(MachineFunction &MF) const
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
static bool isDOT(const MachineInstr &MI)
InstSizeVerifyMode getInstSizeVerifyMode(const MachineInstr &MI) const override
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
static bool isSWMMAC(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
bool isXDL(const MachineInstr &MI) const
Register isStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
static bool isVIMAGE(const MachineInstr &MI)
void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const
static bool isSOP2(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
bool hasRAWDependency(const MachineInstr &FirstMI, const MachineInstr &SecondMI) const
bool isLegalAV64PseudoImm(uint64_t Imm) const
Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isNeverCoissue(MachineInstr &MI) const
static bool isBUF(const MachineInstr &MI)
void handleCopyToPhysHelper(SIInstrWorklist &Worklist, Register DstReg, MachineInstr &Inst, MachineRegisterInfo &MRI, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, unsigned toIdx) const
static bool isFLATGlobal(const MachineInstr &MI)
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool isVOPDAntidependencyAllowed(const MachineInstr &MI) const
If OpX is multicycle, anti-dependencies are not allowed.
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isTRANS(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static bool isSOPK(const MachineInstr &MI)
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void createWaterFallForSiCall(MachineInstr *MI, MachineDominatorTree *MDT, ArrayRef< MachineOperand * > ScalarOps, ArrayRef< Register > PhySGPRs={}) const
Wrapper function for generating a waterfall loop for instruction MI. This function takes into consideration...
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
bool isReMaterializableImpl(const MachineInstr &MI) const override
static bool isVOP3(const MCInstrDesc &Desc)
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool physRegUsesConstantBus(const MachineOperand &Reg) const
static bool isF16PseudoScalarTrans(unsigned Opcode)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const
static bool isDPP(const MachineInstr &MI)
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
static bool isMFMA(const MachineInstr &MI)
bool isLowLatencyInstruction(const MachineInstr &MI) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
void mutateAndCleanupImplicit(MachineInstr &MI, const MCInstrDesc &NewDesc) const
ValueUniformity getGenericValueUniformity(const MachineInstr &MI) const
static bool isMAI(const MCInstrDesc &Desc)
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
static bool usesLGKM_CNT(const MachineInstr &MI)
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const
Fix operands in Inst for 16-bit SALU to VALU lowering.
bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo, const MachineOperand &MO) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
const MachineOperand & getCalleeOperand(const MachineInstr &MI) const override
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
bool isAlwaysGDS(uint32_t Opcode) const
static bool isVGPRSpill(const MachineInstr &MI)
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction with the giv...
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
int64_t getNamedImmOperand(const MachineInstr &MI, AMDGPU::OpName OperandName) const
Get required immediate operand.
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool regUsesConstantBus(const MachineOperand &Reg, const MachineRegisterInfo &MRI) const
static bool isMIMG(const MachineInstr &MI)
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIMachineFunctionInfo &MFI) const
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
void createReadFirstLaneFromCopyToPhysReg(MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, AMDGPU::OpName Src0OpName, MachineOperand &Src1, AMDGPU::OpName Src1OpName) const
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
static bool isAtomic(const MachineInstr &MI)
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
bool isLiteralOperandLegal(const MCInstrDesc &InstDesc, const MCOperandInfo &OpInfo) const
static bool isWWMRegSpillOpcode(uint32_t Opcode)
static bool sopkIsZext(unsigned Opcode)
static bool isSGPRSpill(const MachineInstr &MI)
static bool isWMMA(const MachineInstr &MI)
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
bool isBarrier(unsigned Opcode) const
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
bool isLegalGFX12PlusPackedMathFP32Operand(const MachineRegisterInfo &MRI, const MachineInstr &MI, unsigned SrcN, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand for gfx12+ packed math FP32 instructions.
static bool usesVM_CNT(const MachineInstr &MI)
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
static bool isFixedSize(const MachineInstr &MI)
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
ValueUniformity getValueUniformity(const MachineInstr &MI) const final
uint64_t getScratchRsrcWords23() const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
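A hedged sketch of the named-operand accessor above: look up the soffset operand of a buffer instruction by name instead of by index. MI is an assumed MUBUF MachineInstr and TII a pointer to SIInstrInfo.
  if (const MachineOperand *SOffset =
          TII->getNamedOperand(MI, AMDGPU::OpName::soffset)) {
    // The operand exists on this opcode; it may be a register or an immediate.
    bool UsesSGPR = SOffset->isReg();
    (void)UsesSGPR;
  }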
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO would be a legal operand if it were the OpIdx operand of MI.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< int64_t > getImmOrMaterializedImm(MachineOperand &Op) const
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst, DenseMap< MachineInstr *, V2PhysSCopyInfo > &WaterFalls, DenseMap< MachineInstr *, bool > &V2SPhyCopiesToErase) const
static bool isLDSDMA(const MachineInstr &MI)
static bool isVOP1(const MachineInstr &MI)
SIInstrInfo(const GCNSubtarget &ST)
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
unsigned getScratchReservedForDynamicVGPRs() const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
unsigned getChannelFromSubReg(unsigned SubReg) const
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:291
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual const MachineOperand & getCalleeOperand(const MachineInstr &MI) const
Returns the callee operand from the given MI.
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isPackedFP32Inst(unsigned Opc)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
bool isPKFMACF16InlineConstant(uint32_t Literal, bool IsGFX11Plus)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
bool getWMMAIsXDL(unsigned Opc)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int32_t getCommuteRev(uint32_t Opcode)
LLVM_READONLY int32_t getCommuteOrig(uint32_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
LLVM_READONLY int32_t getGlobalVaddrOp(uint32_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
bool getMAIIsGFX940XDL(unsigned Opc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getIfAddr64Inst(uint32_t Opcode)
Check if Opcode is an Addr64 opcode.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
const uint64_t RSRC_TID_ENABLE
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
bool isGenericAtomic(unsigned Opc)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
LLVM_READONLY int32_t getAddr64Inst(uint32_t Opcode)
int32_t getMCOpcode(uint32_t Opcode, unsigned Gen)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition SIDefines.h:204
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_C_BF16
Definition SIDefines.h:222
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:227
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_BF16
Definition SIDefines.h:208
@ OPERAND_REG_IMM_INT32
Operands with register, 32-bit, or 64-bit immediate.
Definition SIDefines.h:203
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:210
@ OPERAND_REG_IMM_FP16
Definition SIDefines.h:209
@ OPERAND_REG_IMM_V2FP16_SPLAT
Definition SIDefines.h:212
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:221
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition SIDefines.h:219
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:214
@ OPERAND_REG_IMM_FP64
Definition SIDefines.h:207
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:240
@ OPERAND_REG_IMM_V2INT32
Definition SIDefines.h:215
@ OPERAND_SDWA_VOPC_DST
Definition SIDefines.h:251
@ OPERAND_REG_IMM_FP32
Definition SIDefines.h:206
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:226
@ OPERAND_INLINE_C_AV64_PSEUDO
Definition SIDefines.h:245
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:216
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:241
@ OPERAND_REG_INLINE_C_FP16
Definition SIDefines.h:223
@ OPERAND_REG_IMM_INT16
Definition SIDefines.h:205
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition SIDefines.h:231
LLVM_READONLY int32_t getBasicFromSDWAOp(uint32_t Opcode)
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII, const MCSubtargetInfo &ST)
@ TI_SCRATCH_RSRC_DWORD1
Definition AMDGPU.h:614
@ TI_SCRATCH_RSRC_DWORD3
Definition AMDGPU.h:616
@ TI_SCRATCH_RSRC_DWORD0
Definition AMDGPU.h:613
@ TI_SCRATCH_RSRC_DWORD2
Definition AMDGPU.h:615
@ TI_CONSTDATA_START
Definition AMDGPU.h:612
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
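A hedged sketch of the inline-constant checks above: small integers in [-16, 64] encode as inline constants, while larger literals do not.
  bool A = AMDGPU::isInlinableLiteral64(63, /*HasInv2Pi=*/true);  // true
  bool B = AMDGPU::isInlinableLiteral64(999, /*HasInv2Pi=*/true); // false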
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ OPERAND_GENERIC_4
Definition MCInstrDesc.h:71
@ OPERAND_GENERIC_2
Definition MCInstrDesc.h:69
@ OPERAND_GENERIC_1
Definition MCInstrDesc.h:68
@ OPERAND_GENERIC_3
Definition MCInstrDesc.h:70
@ OPERAND_IMMEDIATE
Definition MCInstrDesc.h:61
@ OPERAND_GENERIC_0
Definition MCInstrDesc.h:67
@ OPERAND_GENERIC_5
Definition MCInstrDesc.h:72
Not(const Pred &P) -> Not< Pred >
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:557
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
constexpr RegState getKillRegState(bool B)
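A hedged sketch of the RegState helpers above: forward a source register while preserving its kill state. MBB, I, DL, TII, DstReg, SrcReg and the boolean KillSrc are assumed to be in scope.
  BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), DstReg)
      .addReg(SrcReg, getKillRegState(KillSrc));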
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
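A hedged sketch of the early-increment idiom above: delete trivially dead COPYs while walking a block; the range keeps iteration valid when the current instruction is erased. MBB and MRI are assumed to be in scope.
  for (MachineInstr &MI : make_early_inc_range(MBB)) {
    if (MI.isCopy() && MI.getOperand(0).getReg().isVirtual() &&
        MRI.use_nodbg_empty(MI.getOperand(0).getReg()))
      MI.eraseFromParent();
  }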
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:204
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, const MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI VirtRegInfo AnalyzeVirtRegInBundle(MachineInstr &MI, Register Reg, SmallVectorImpl< std::pair< MachineInstr *, unsigned > > *Ops=nullptr)
AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses a virtual register.
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
@ Xor
Bitwise or logical XOR of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
constexpr unsigned BitWidth
constexpr bool isIntN(unsigned N, int64_t x)
Checks if an signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
constexpr T reverseBits(T Val)
Reverse the bits in Val.
Definition MathExtras.h:118
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
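A hedged sketch combining several of the MathExtras helpers above on a concrete 64-bit value (assuming llvm/Support/MathExtras.h and the llvm namespace).
  uint64_t Imm = 0xFFFFFFFF00001234ULL;
  uint32_t Lo = Lo_32(Imm);                          // 0x00001234
  uint32_t Hi = Hi_32(Imm);                          // 0xFFFFFFFF
  bool FitsS16 = isInt<16>(SignExtend64<32>(Lo));    // true (0x1234 < 2^15)
  uint64_t LowBits = maskTrailingOnes<uint64_t>(12); // 0xFFF
  uint64_t Aligned = alignDown(Imm, 4096);           // clears the low 12 bits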
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
constexpr RegState getUndefRegState(bool B)
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
MachineCycleInfo::CycleT MachineCycle
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
Helper struct for the implementation of 3-address conversion to communicate updates made to instructi...
MachineInstr * RemoveMIUse
Other instruction whose def is no longer used by the converted instruction.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
constexpr bool all() const
Definition LaneBitmask.h:54
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
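A hedged sketch of creating a spill-slot memory operand with the helper above, mirroring the usual pattern for stack-slot stores. MF and FrameIndex are assumed to be in scope.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo::getFixedStack(MF, FrameIndex),
      MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
      MFI.getObjectAlign(FrameIndex));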
Utility to store machine instructions worklist.
Definition SIInstrInfo.h:67
MachineInstr * top() const
Definition SIInstrInfo.h:72
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition SIInstrInfo.h:91
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.
VirtRegInfo - Information about a virtual register used by a set of operands.
bool Reads
Reads - One of the operands read the virtual register.
bool Writes
Writes - One of the operands writes the virtual register.