1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "AMDGPU.h"
12#include "GCNSubtarget.h"
13#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14#include "SIMachineFunctionInfo.h"
15#include "llvm/ADT/DepthFirstIterator.h"
16#include "llvm/CodeGen/MachineFunctionPass.h"
18
19#define DEBUG_TYPE "si-fold-operands"
20using namespace llvm;
21
22namespace {
23
24struct FoldCandidate {
25 MachineInstr *UseMI;
26 union {
27 MachineOperand *OpToFold;
28 uint64_t ImmToFold;
29 int FrameIndexToFold;
30 };
31 int ShrinkOpcode;
32 unsigned UseOpNo;
33 MachineOperand::MachineOperandType Kind;
34 bool Commuted;
35
36 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
37 bool Commuted_ = false,
38 int ShrinkOp = -1) :
39 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
40 Kind(FoldOp->getType()),
41 Commuted(Commuted_) {
42 if (FoldOp->isImm()) {
43 ImmToFold = FoldOp->getImm();
44 } else if (FoldOp->isFI()) {
45 FrameIndexToFold = FoldOp->getIndex();
46 } else {
47 assert(FoldOp->isReg() || FoldOp->isGlobal());
48 OpToFold = FoldOp;
49 }
50 }
51
52 bool isFI() const {
53 return Kind == MachineOperand::MO_FrameIndex;
54 }
55
56 bool isImm() const {
57 return Kind == MachineOperand::MO_Immediate;
58 }
59
60 bool isReg() const {
61 return Kind == MachineOperand::MO_Register;
62 }
63
64 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
65
66 bool needsShrink() const { return ShrinkOpcode != -1; }
67};
68
69class SIFoldOperands : public MachineFunctionPass {
70public:
71 static char ID;
72 MachineRegisterInfo *MRI;
73 const SIInstrInfo *TII;
74 const SIRegisterInfo *TRI;
75 const GCNSubtarget *ST;
76 const SIMachineFunctionInfo *MFI;
77
78 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
79 const MachineOperand &OpToFold) const;
80
81 bool updateOperand(FoldCandidate &Fold) const;
82
83 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
84 MachineInstr *MI, unsigned OpNo,
85 MachineOperand *OpToFold) const;
86 bool isUseSafeToFold(const MachineInstr &MI,
87 const MachineOperand &UseMO) const;
88 bool
89 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
90 Register UseReg, uint8_t OpTy) const;
91 bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
92 unsigned UseOpIdx,
93 SmallVectorImpl<FoldCandidate> &FoldList) const;
94 void foldOperand(MachineOperand &OpToFold,
95 MachineInstr *UseMI,
96 int UseOpIdx,
97 SmallVectorImpl<FoldCandidate> &FoldList,
98 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
99
100 MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
101 bool tryConstantFoldOp(MachineInstr *MI) const;
102 bool tryFoldCndMask(MachineInstr &MI) const;
103 bool tryFoldZeroHighBits(MachineInstr &MI) const;
104 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
105 bool tryFoldFoldableCopy(MachineInstr &MI,
106 MachineOperand *&CurrentKnownM0Val) const;
107
108 const MachineOperand *isClamp(const MachineInstr &MI) const;
109 bool tryFoldClamp(MachineInstr &MI);
110
111 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
112 bool tryFoldOMod(MachineInstr &MI);
113 bool tryFoldRegSequence(MachineInstr &MI);
114 bool tryFoldPhiAGPR(MachineInstr &MI);
115 bool tryFoldLoad(MachineInstr &MI);
116
117 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
118
119public:
120 SIFoldOperands() : MachineFunctionPass(ID) {
121 initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
122 }
123
124 bool runOnMachineFunction(MachineFunction &MF) override;
125
126 StringRef getPassName() const override { return "SI Fold Operands"; }
127
128 void getAnalysisUsage(AnalysisUsage &AU) const override {
129 AU.setPreservesCFG();
130 MachineFunctionPass::getAnalysisUsage(AU);
131 }
132};
133
134} // End anonymous namespace.
135
136INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
137 "SI Fold Operands", false, false)
138
139char SIFoldOperands::ID = 0;
140
141char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
142
143static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
144 const SIRegisterInfo &TRI,
145 const MachineOperand &MO) {
146 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
147 if (const TargetRegisterClass *SubRC =
148 TRI.getSubRegisterClass(RC, MO.getSubReg()))
149 RC = SubRC;
150 return RC;
151}
152
153// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
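// For example, V_MAC_F32_e64 (whose src2 is tied to the destination) maps to
// V_MAD_F32_e64, whose src2 is untied and can therefore accept a folded
// operand; tryAddToFoldList relies on this when folding into src2.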
154static unsigned macToMad(unsigned Opc) {
155 switch (Opc) {
156 case AMDGPU::V_MAC_F32_e64:
157 return AMDGPU::V_MAD_F32_e64;
158 case AMDGPU::V_MAC_F16_e64:
159 return AMDGPU::V_MAD_F16_e64;
160 case AMDGPU::V_FMAC_F32_e64:
161 return AMDGPU::V_FMA_F32_e64;
162 case AMDGPU::V_FMAC_F16_e64:
163 return AMDGPU::V_FMA_F16_gfx9_e64;
164 case AMDGPU::V_FMAC_F16_t16_e64:
165 return AMDGPU::V_FMA_F16_gfx9_e64;
166 case AMDGPU::V_FMAC_LEGACY_F32_e64:
167 return AMDGPU::V_FMA_LEGACY_F32_e64;
168 case AMDGPU::V_FMAC_F64_e64:
169 return AMDGPU::V_FMA_F64_e64;
170 }
171 return AMDGPU::INSTRUCTION_LIST_END;
172}
173
174// TODO: Add heuristic that the frame index might not fit in the addressing mode
175// immediate offset to avoid materializing in loops.
176bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
177 const MachineOperand &OpToFold) const {
178 if (!OpToFold.isFI())
179 return false;
180
181 const unsigned Opc = UseMI.getOpcode();
182 if (TII->isMUBUF(UseMI))
183 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
184 if (!TII->isFLATScratch(UseMI))
185 return false;
186
187 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
188 if (OpNo == SIdx)
189 return true;
190
191 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
192 return OpNo == VIdx && SIdx == -1;
193}
194
195FunctionPass *llvm::createSIFoldOperandsPass() {
196 return new SIFoldOperands();
197}
198
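// Apply a prepared FoldCandidate: rewrite the use operand to the folded
// immediate, frame index, global address or register, shrinking a VOP3
// add/sub carry instruction to its 32-bit form first when the candidate
// requires it.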
199bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
200 MachineInstr *MI = Fold.UseMI;
201 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
202 assert(Old.isReg());
203
204
205 const uint64_t TSFlags = MI->getDesc().TSFlags;
206 if (Fold.isImm()) {
207 if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
208 (!ST->hasDOTOpSelHazard() || !(TSFlags & SIInstrFlags::IsDOT)) &&
209 AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
210 ST->hasInv2PiInlineImm())) {
211 // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
212 // already set.
213 unsigned Opcode = MI->getOpcode();
214 int OpNo = MI->getOperandNo(&Old);
215 int ModIdx = -1;
216 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
217 ModIdx = AMDGPU::OpName::src0_modifiers;
218 else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
219 ModIdx = AMDGPU::OpName::src1_modifiers;
220 else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
221 ModIdx = AMDGPU::OpName::src2_modifiers;
222 assert(ModIdx != -1);
223 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
224 MachineOperand &Mod = MI->getOperand(ModIdx);
225 unsigned Val = Mod.getImm();
226 if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
227 // Only apply the following transformation if that operand requires
228 // a packed immediate.
229 switch (TII->get(Opcode).operands()[OpNo].OperandType) {
230 case AMDGPU::OPERAND_REG_IMM_V2FP16:
231 case AMDGPU::OPERAND_REG_IMM_V2INT16:
232 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
233 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
234 // If upper part is all zero we do not need op_sel_hi.
235 if (!isUInt<16>(Fold.ImmToFold)) {
236 if (!(Fold.ImmToFold & 0xffff)) {
237 Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
238 Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
239 Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
240 return true;
241 }
242 Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
243 Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
244 return true;
245 }
246 break;
247 default:
248 break;
249 }
250 }
251 }
252 }
253
254 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
255 MachineBasicBlock *MBB = MI->getParent();
256 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
257 if (Liveness != MachineBasicBlock::LQR_Dead) {
258 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
259 return false;
260 }
261
262 int Op32 = Fold.ShrinkOpcode;
263 MachineOperand &Dst0 = MI->getOperand(0);
264 MachineOperand &Dst1 = MI->getOperand(1);
265 assert(Dst0.isDef() && Dst1.isDef());
266
267 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
268
269 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
270 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
271
272 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
273
274 if (HaveNonDbgCarryUse) {
275 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
276 Dst1.getReg())
277 .addReg(AMDGPU::VCC, RegState::Kill);
278 }
279
280 // Keep the old instruction around to avoid breaking iterators, but
281 // replace it with a dummy instruction to remove uses.
282 //
283 // FIXME: We should not invert how this pass looks at operands to avoid
284 // this. Should track set of foldable movs instead of looking for uses
285 // when looking at a use.
286 Dst0.setReg(NewReg0);
287 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
288 MI->removeOperand(I);
289 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
290
291 if (Fold.Commuted)
292 TII->commuteInstruction(*Inst32, false);
293 return true;
294 }
295
296 assert(!Fold.needsShrink() && "not handled");
297
298 if (Fold.isImm()) {
299 if (Old.isTied()) {
300 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
301 if (NewMFMAOpc == -1)
302 return false;
303 MI->setDesc(TII->get(NewMFMAOpc));
304 MI->untieRegOperand(0);
305 }
306 Old.ChangeToImmediate(Fold.ImmToFold);
307 return true;
308 }
309
310 if (Fold.isGlobal()) {
311 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
312 Fold.OpToFold->getTargetFlags());
313 return true;
314 }
315
316 if (Fold.isFI()) {
317 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
318 return true;
319 }
320
321 MachineOperand *New = Fold.OpToFold;
322 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
323 Old.setIsUndef(New->isUndef());
324 return true;
325}
326
327static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
328 const MachineInstr *MI) {
329 return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
330}
331
332static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
333 MachineInstr *MI, unsigned OpNo,
334 MachineOperand *FoldOp, bool Commuted = false,
335 int ShrinkOp = -1) {
336 // Skip additional folding on the same operand.
337 for (FoldCandidate &Fold : FoldList)
338 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
339 return;
340 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
341 << " operand " << OpNo << "\n " << *MI);
342 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
343}
344
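// Try to record folding OpToFold into operand OpNo of MI. If the fold is not
// immediately legal, attempt to make it legal by converting MAC/FMAC to the
// MAD/FMA form, S_FMAC_F32 to S_FMAAK/S_FMAMK, S_SETREG to its immediate
// form, or by commuting the instruction.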
345bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
346 MachineInstr *MI, unsigned OpNo,
347 MachineOperand *OpToFold) const {
348 const unsigned Opc = MI->getOpcode();
349
350 auto tryToFoldAsFMAAKorMK = [&]() {
351 if (!OpToFold->isImm())
352 return false;
353
354 const bool TryAK = OpNo == 3;
355 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
356 MI->setDesc(TII->get(NewOpc));
357
358 // We have to fold into operand which would be Imm not into OpNo.
359 bool FoldAsFMAAKorMK =
360 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
361 if (FoldAsFMAAKorMK) {
362 // Untie Src2 of fmac.
363 MI->untieRegOperand(3);
364 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
365 if (OpNo == 1) {
366 MachineOperand &Op1 = MI->getOperand(1);
367 MachineOperand &Op2 = MI->getOperand(2);
368 Register OldReg = Op1.getReg();
369 // Operand 2 might be an inlinable constant
370 if (Op2.isImm()) {
371 Op1.ChangeToImmediate(Op2.getImm());
372 Op2.ChangeToRegister(OldReg, false);
373 } else {
374 Op1.setReg(Op2.getReg());
375 Op2.setReg(OldReg);
376 }
377 }
378 return true;
379 }
380 MI->setDesc(TII->get(Opc));
381 return false;
382 };
383
384 if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
385 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
386 unsigned NewOpc = macToMad(Opc);
387 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
388 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
389 // to fold the operand.
390 MI->setDesc(TII->get(NewOpc));
391 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
392 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
393 if (AddOpSel)
394 MI->addOperand(MachineOperand::CreateImm(0));
395 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
396 if (FoldAsMAD) {
397 MI->untieRegOperand(OpNo);
398 return true;
399 }
400 if (AddOpSel)
401 MI->removeOperand(MI->getNumExplicitOperands() - 1);
402 MI->setDesc(TII->get(Opc));
403 }
404
405 // Special case for s_fmac_f32 if we are trying to fold into Src2.
406 // By transforming into fmaak we can untie Src2 and make folding legal.
407 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
408 if (tryToFoldAsFMAAKorMK())
409 return true;
410 }
411
412 // Special case for s_setreg_b32
413 if (OpToFold->isImm()) {
414 unsigned ImmOpc = 0;
415 if (Opc == AMDGPU::S_SETREG_B32)
416 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
417 else if (Opc == AMDGPU::S_SETREG_B32_mode)
418 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
419 if (ImmOpc) {
420 MI->setDesc(TII->get(ImmOpc));
421 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
422 return true;
423 }
424 }
425
426 // If we are already folding into another operand of MI, then
427 // we can't commute the instruction, otherwise we risk making the
428 // other fold illegal.
429 if (isUseMIInFoldList(FoldList, MI))
430 return false;
431
432 unsigned CommuteOpNo = OpNo;
433
434 // Operand is not legal, so try to commute the instruction to
435 // see if this makes it possible to fold.
436 unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
437 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
438 bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
439
440 if (CanCommute) {
441 if (CommuteIdx0 == OpNo)
442 CommuteOpNo = CommuteIdx1;
443 else if (CommuteIdx1 == OpNo)
444 CommuteOpNo = CommuteIdx0;
445 }
446
447
448 // One of operands might be an Imm operand, and OpNo may refer to it after
449 // the call of commuteInstruction() below. Such situations are avoided
450 // here explicitly as OpNo must be a register operand to be a candidate
451 // for memory folding.
452 if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
453 !MI->getOperand(CommuteIdx1).isReg()))
454 return false;
455
456 if (!CanCommute ||
457 !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
458 return false;
459
460 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
461 if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
462 Opc == AMDGPU::V_SUB_CO_U32_e64 ||
463 Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME
464 (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
465
466 // Verify the other operand is a VGPR, otherwise we would violate the
467 // constant bus restriction.
468 unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
469 MachineOperand &OtherOp = MI->getOperand(OtherIdx);
470 if (!OtherOp.isReg() ||
471 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
472 return false;
473
474 assert(MI->getOperand(1).isDef());
475
476 // Make sure to get the 32-bit version of the commuted opcode.
477 unsigned MaybeCommutedOpc = MI->getOpcode();
478 int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
479
480 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
481 return true;
482 }
483
484 TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
485 return false;
486 }
487
488 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
489 return true;
490 }
491
492 // Inlineable constant might have been folded into Imm operand of fmaak or
493 // fmamk and we are trying to fold a non-inlinable constant.
494 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
495 !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
496 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
497 MachineOperand &OpImm = MI->getOperand(ImmIdx);
498 if (!OpImm.isReg() &&
499 TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
500 return tryToFoldAsFMAAKorMK();
501 }
502
503 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
504 // By changing into fmamk we can untie Src2.
505 // If folding for Src0 happens first and it is identical operand to Src1 we
506 // should avoid transforming into fmamk which requires commuting as it would
507 // cause folding into Src1 to fail later on due to wrong OpNo used.
508 if (Opc == AMDGPU::S_FMAC_F32 &&
509 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
510 if (tryToFoldAsFMAAKorMK())
511 return true;
512 }
513
514 // Check the case where we might introduce a second constant operand to a
515 // scalar instruction
516 if (TII->isSALU(MI->getOpcode())) {
517 const MCInstrDesc &InstDesc = MI->getDesc();
518 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
519
520 // Fine if the operand can be encoded as an inline constant
521 if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
522 // Otherwise check for another constant
523 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
524 auto &Op = MI->getOperand(i);
525 if (OpNo != i && !Op.isReg() &&
526 !TII->isInlineConstant(Op, InstDesc.operands()[i]))
527 return false;
528 }
529 }
530 }
531
532 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
533 return true;
534}
535
536bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
537 const MachineOperand &UseMO) const {
538 // Operands of SDWA instructions must be registers.
539 return !TII->isSDWA(MI);
540}
541
542// Find a def of the UseReg, check if it is a reg_sequence and find initializers
543// for each subreg, tracking it to foldable inline immediate if possible.
544// Returns true on success.
545bool SIFoldOperands::getRegSeqInit(
546 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
547 Register UseReg, uint8_t OpTy) const {
548 MachineInstr *Def = MRI->getVRegDef(UseReg);
549 if (!Def || !Def->isRegSequence())
550 return false;
551
552 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
553 MachineOperand *Sub = &Def->getOperand(I);
554 assert(Sub->isReg());
555
556 for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
557 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
558 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
559 SubDef = MRI->getVRegDef(Sub->getReg())) {
560 MachineOperand *Op = &SubDef->getOperand(1);
561 if (Op->isImm()) {
562 if (TII->isInlineConstant(*Op, OpTy))
563 Sub = Op;
564 break;
565 }
566 if (!Op->isReg() || Op->getReg().isPhysical())
567 break;
568 Sub = Op;
569 }
570
571 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
572 }
573
574 return true;
575}
576
577bool SIFoldOperands::tryToFoldACImm(
578 const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
579 SmallVectorImpl<FoldCandidate> &FoldList) const {
580 const MCInstrDesc &Desc = UseMI->getDesc();
581 if (UseOpIdx >= Desc.getNumOperands())
582 return false;
583
584 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
585 if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
586 OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
587 (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
588 OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
589 return false;
590
591 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
592 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
593 UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
594 return true;
595 }
596
597 if (!OpToFold.isReg())
598 return false;
599
600 Register UseReg = OpToFold.getReg();
601 if (!UseReg.isVirtual())
602 return false;
603
604 if (isUseMIInFoldList(FoldList, UseMI))
605 return false;
606
607 // Maybe it is just a COPY of an immediate itself.
608 MachineInstr *Def = MRI->getVRegDef(UseReg);
609 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
610 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
611 MachineOperand &DefOp = Def->getOperand(1);
612 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
613 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
614 UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
615 return true;
616 }
617 }
618
619 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
620 if (!getRegSeqInit(Defs, UseReg, OpTy))
621 return false;
622
623 int32_t Imm;
624 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
625 const MachineOperand *Op = Defs[I].first;
626 if (!Op->isImm())
627 return false;
628
629 auto SubImm = Op->getImm();
630 if (!I) {
631 Imm = SubImm;
632 if (!TII->isInlineConstant(*Op, OpTy) ||
633 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
634 return false;
635
636 continue;
637 }
638 if (Imm != SubImm)
639 return false; // Can only fold splat constants
640 }
641
642 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
643 return true;
644}
645
646void SIFoldOperands::foldOperand(
647 MachineOperand &OpToFold,
648 MachineInstr *UseMI,
649 int UseOpIdx,
650 SmallVectorImpl<FoldCandidate> &FoldList,
651 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
652 const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
653
654 if (!isUseSafeToFold(*UseMI, UseOp))
655 return;
656
657 // FIXME: Fold operands with subregs.
658 if (UseOp.isReg() && OpToFold.isReg() &&
659 (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
660 return;
661
662 // Special case for REG_SEQUENCE: We can't fold literals into
663 // REG_SEQUENCE instructions, so we have to fold them into the
664 // uses of REG_SEQUENCE.
665 if (UseMI->isRegSequence()) {
666 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
667 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
668
669 for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
670 MachineInstr *RSUseMI = RSUse.getParent();
671
672 if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
673 RSUseMI->getOperandNo(&RSUse), FoldList))
674 continue;
675
676 if (RSUse.getSubReg() != RegSeqDstSubReg)
677 continue;
678
679 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
680 CopiesToReplace);
681 }
682
683 return;
684 }
685
686 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
687 return;
688
689 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
690 // Verify that this is a stack access.
691 // FIXME: Should probably use stack pseudos before frame lowering.
692
693 if (TII->isMUBUF(*UseMI)) {
694 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
695 MFI->getScratchRSrcReg())
696 return;
697
698 // Ensure this is either relative to the current frame or the current
699 // wave.
700 MachineOperand &SOff =
701 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
702 if (!SOff.isImm() || SOff.getImm() != 0)
703 return;
704 }
705
706 // A frame index will resolve to a positive constant, so it should always be
707 // safe to fold the addressing mode, even pre-GFX9.
708 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
709
710 const unsigned Opc = UseMI->getOpcode();
711 if (TII->isFLATScratch(*UseMI) &&
712 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
713 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
714 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
715 UseMI->setDesc(TII->get(NewOpc));
716 }
717
718 return;
719 }
720
721 bool FoldingImmLike =
722 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
723
724 if (FoldingImmLike && UseMI->isCopy()) {
725 Register DestReg = UseMI->getOperand(0).getReg();
726 Register SrcReg = UseMI->getOperand(1).getReg();
727 assert(SrcReg.isVirtual());
728
729 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
730
731 // Don't fold into a copy to a physical register with the same class. Doing
732 // so would interfere with the register coalescer's logic which would avoid
733 // redundant initializations.
734 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
735 return;
736
737 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
738 if (!DestReg.isPhysical()) {
739 if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
740 SmallVector<FoldCandidate, 4> CopyUses;
741 for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
742 // There's no point trying to fold into an implicit operand.
743 if (Use.isImplicit())
744 continue;
745
746 CopyUses.emplace_back(Use.getParent(),
747 Use.getParent()->getOperandNo(&Use),
748 &UseMI->getOperand(1));
749 }
750
751 for (auto &F : CopyUses) {
752 foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList,
753 CopiesToReplace);
754 }
755 }
756
757 if (DestRC == &AMDGPU::AGPR_32RegClass &&
758 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
759 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
760 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
761 CopiesToReplace.push_back(UseMI);
762 return;
763 }
764 }
765
766 // In order to fold immediates into copies, we need to change the
767 // copy to a MOV.
768
769 unsigned MovOp = TII->getMovOpcode(DestRC);
770 if (MovOp == AMDGPU::COPY)
771 return;
772
773 UseMI->setDesc(TII->get(MovOp));
774 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
775 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
776 while (ImpOpI != ImpOpE) {
777 MachineInstr::mop_iterator Tmp = ImpOpI;
778 ImpOpI++;
779 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
780 }
781 CopiesToReplace.push_back(UseMI);
782 } else {
783 if (UseMI->isCopy() && OpToFold.isReg() &&
784 UseMI->getOperand(0).getReg().isVirtual() &&
785 !UseMI->getOperand(1).getSubReg()) {
786 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
787 unsigned Size = TII->getOpSize(*UseMI, 1);
788 Register UseReg = OpToFold.getReg();
789 UseMI->getOperand(1).setReg(UseReg);
790 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
791 UseMI->getOperand(1).setIsKill(false);
792 CopiesToReplace.push_back(UseMI);
793 OpToFold.setIsKill(false);
794
795 // Remove kill flags as kills may now be out of order with uses.
796 MRI->clearKillFlags(OpToFold.getReg());
797
798 // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
799 // can only accept VGPR or inline immediate. Recreate a reg_sequence with
800 // its initializers right here, so we will rematerialize immediates and
801 // avoid copies via different reg classes.
802 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
803 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
804 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
805 const DebugLoc &DL = UseMI->getDebugLoc();
806 MachineBasicBlock &MBB = *UseMI->getParent();
807
808 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
809 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
810 UseMI->removeOperand(I);
811
812 MachineInstrBuilder B(*MBB.getParent(), UseMI);
813 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
814 SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
815 for (unsigned I = 0; I < Size / 4; ++I) {
816 MachineOperand *Def = Defs[I].first;
817 TargetInstrInfo::RegSubRegPair CopyToVGPR;
818 if (Def->isImm() &&
819 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
820 int64_t Imm = Def->getImm();
821
822 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
823 BuildMI(MBB, UseMI, DL,
824 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
825 B.addReg(Tmp);
826 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
827 auto Src = getRegSubRegPair(*Def);
828 Def->setIsKill(false);
829 if (!SeenAGPRs.insert(Src)) {
830 // We cannot build a reg_sequence out of the same registers, they
831 // must be copied. Better do it here before copyPhysReg() created
832 // several reads to do the AGPR->VGPR->AGPR copy.
833 CopyToVGPR = Src;
834 } else {
835 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
836 Src.SubReg);
837 }
838 } else {
839 assert(Def->isReg());
840 Def->setIsKill(false);
841 auto Src = getRegSubRegPair(*Def);
842
843 // Direct copy from SGPR to AGPR is not possible. To avoid creation
844 // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
845 // create a copy here and track if we already have such a copy.
846 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
847 CopyToVGPR = Src;
848 } else {
849 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
850 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
851 B.addReg(Tmp);
852 }
853 }
854
855 if (CopyToVGPR.Reg) {
856 Register Vgpr;
857 if (VGPRCopies.count(CopyToVGPR)) {
858 Vgpr = VGPRCopies[CopyToVGPR];
859 } else {
860 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
861 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
862 VGPRCopies[CopyToVGPR] = Vgpr;
863 }
864 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
865 BuildMI(MBB, UseMI, DL,
866 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
867 B.addReg(Tmp);
868 }
869
870 B.addImm(Defs[I].second);
871 }
872 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
873 return;
874 }
875
876 if (Size != 4)
877 return;
878
879 Register Reg0 = UseMI->getOperand(0).getReg();
880 Register Reg1 = UseMI->getOperand(1).getReg();
881 if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
882 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
883 else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
884 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
885 else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
886 TRI->isAGPR(*MRI, Reg1))
887 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
888 return;
889 }
890
891 unsigned UseOpc = UseMI->getOpcode();
892 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
893 (UseOpc == AMDGPU::V_READLANE_B32 &&
894 (int)UseOpIdx ==
895 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
896 // %vgpr = V_MOV_B32 imm
897 // %sgpr = V_READFIRSTLANE_B32 %vgpr
898 // =>
899 // %sgpr = S_MOV_B32 imm
900 if (FoldingImmLike) {
901 if (execMayBeModifiedBeforeUse(*MRI,
902 UseMI->getOperand(UseOpIdx).getReg(),
903 *OpToFold.getParent(),
904 *UseMI))
905 return;
906
907 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
908
909 if (OpToFold.isImm())
910 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
911 else
912 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
913 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
914 return;
915 }
916
917 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
918 if (execMayBeModifiedBeforeUse(*MRI,
919 UseMI->getOperand(UseOpIdx).getReg(),
920 *OpToFold.getParent(),
921 *UseMI))
922 return;
923
924 // %vgpr = COPY %sgpr0
925 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
926 // =>
927 // %sgpr1 = COPY %sgpr0
928 UseMI->setDesc(TII->get(AMDGPU::COPY));
929 UseMI->getOperand(1).setReg(OpToFold.getReg());
930 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
931 UseMI->getOperand(1).setIsKill(false);
932 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
933 return;
934 }
935 }
936
937 const MCInstrDesc &UseDesc = UseMI->getDesc();
938
939 // Don't fold into target independent nodes. Target independent opcodes
940 // don't have defined register classes.
941 if (UseDesc.isVariadic() || UseOp.isImplicit() ||
942 UseDesc.operands()[UseOpIdx].RegClass == -1)
943 return;
944 }
945
946 if (!FoldingImmLike) {
947 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
948 // Don't fold if OpToFold doesn't hold an aligned register.
949 const TargetRegisterClass *RC =
950 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
951 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
952 unsigned SubReg = OpToFold.getSubReg();
953 if (const TargetRegisterClass *SubRC =
954 TRI->getSubRegisterClass(RC, SubReg))
955 RC = SubRC;
956 }
957
958 if (!RC || !TRI->isProperlyAlignedRC(*RC))
959 return;
960 }
961
962 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
963
964 // FIXME: We could try to change the instruction from 64-bit to 32-bit
965 // to enable more folding opportunities. The shrink operands pass
966 // already does this.
967 return;
968 }
969
970
971 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
972 const TargetRegisterClass *FoldRC =
973 TRI->getRegClass(FoldDesc.operands()[0].RegClass);
974
975 // Split 64-bit constants into 32-bits for folding.
976 if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
977 Register UseReg = UseOp.getReg();
978 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
979 if (AMDGPU::getRegBitWidth(*UseRC) != 64)
980 return;
981
982 APInt Imm(64, OpToFold.getImm());
983 if (UseOp.getSubReg() == AMDGPU::sub0) {
984 Imm = Imm.getLoBits(32);
985 } else {
986 assert(UseOp.getSubReg() == AMDGPU::sub1);
987 Imm = Imm.getHiBits(32);
988 }
989
990 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
991 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
992 return;
993 }
994
995 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
996}
997
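// Constant-fold a 32-bit bitwise or shift opcode over two known operand
// values; returns false for opcodes this helper does not model.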
998static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
999 uint32_t LHS, uint32_t RHS) {
1000 switch (Opcode) {
1001 case AMDGPU::V_AND_B32_e64:
1002 case AMDGPU::V_AND_B32_e32:
1003 case AMDGPU::S_AND_B32:
1004 Result = LHS & RHS;
1005 return true;
1006 case AMDGPU::V_OR_B32_e64:
1007 case AMDGPU::V_OR_B32_e32:
1008 case AMDGPU::S_OR_B32:
1009 Result = LHS | RHS;
1010 return true;
1011 case AMDGPU::V_XOR_B32_e64:
1012 case AMDGPU::V_XOR_B32_e32:
1013 case AMDGPU::S_XOR_B32:
1014 Result = LHS ^ RHS;
1015 return true;
1016 case AMDGPU::S_XNOR_B32:
1017 Result = ~(LHS ^ RHS);
1018 return true;
1019 case AMDGPU::S_NAND_B32:
1020 Result = ~(LHS & RHS);
1021 return true;
1022 case AMDGPU::S_NOR_B32:
1023 Result = ~(LHS | RHS);
1024 return true;
1025 case AMDGPU::S_ANDN2_B32:
1026 Result = LHS & ~RHS;
1027 return true;
1028 case AMDGPU::S_ORN2_B32:
1029 Result = LHS | ~RHS;
1030 return true;
1031 case AMDGPU::V_LSHL_B32_e64:
1032 case AMDGPU::V_LSHL_B32_e32:
1033 case AMDGPU::S_LSHL_B32:
1034 // The instruction ignores the high bits for out of bounds shifts.
1035 Result = LHS << (RHS & 31);
1036 return true;
1037 case AMDGPU::V_LSHLREV_B32_e64:
1038 case AMDGPU::V_LSHLREV_B32_e32:
1039 Result = RHS << (LHS & 31);
1040 return true;
1041 case AMDGPU::V_LSHR_B32_e64:
1042 case AMDGPU::V_LSHR_B32_e32:
1043 case AMDGPU::S_LSHR_B32:
1044 Result = LHS >> (RHS & 31);
1045 return true;
1046 case AMDGPU::V_LSHRREV_B32_e64:
1047 case AMDGPU::V_LSHRREV_B32_e32:
1048 Result = RHS >> (LHS & 31);
1049 return true;
1050 case AMDGPU::V_ASHR_I32_e64:
1051 case AMDGPU::V_ASHR_I32_e32:
1052 case AMDGPU::S_ASHR_I32:
1053 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1054 return true;
1055 case AMDGPU::V_ASHRREV_I32_e64:
1056 case AMDGPU::V_ASHRREV_I32_e32:
1057 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1058 return true;
1059 default:
1060 return false;
1061 }
1062}
1063
1064static unsigned getMovOpc(bool IsScalar) {
1065 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1066}
1067
1068static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1069 MI.setDesc(NewDesc);
1070
1071 // Remove any leftover implicit operands from mutating the instruction. e.g.
1072 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1073 // anymore.
1074 const MCInstrDesc &Desc = MI.getDesc();
1075 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1076 Desc.implicit_defs().size();
1077
1078 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1079 MI.removeOperand(I);
1080}
1081
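// Look through a move-immediate: if Op is a plain virtual register whose
// definition materializes an immediate, return that immediate operand,
// otherwise return Op itself.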
1082MachineOperand *
1083SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
1084 // If this has a subregister, it obviously is a register source.
1085 if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
1086 !Op.getReg().isVirtual())
1087 return &Op;
1088
1089 MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1090 if (Def && Def->isMoveImmediate()) {
1091 MachineOperand &ImmSrc = Def->getOperand(1);
1092 if (ImmSrc.isImm())
1093 return &ImmSrc;
1094 }
1095
1096 return &Op;
1097}
1098
1099// Try to simplify operations with a constant that may appear after instruction
1100// selection.
1101// TODO: See if a frame index with a fixed offset can fold.
1102bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
1103 if (!MI->allImplicitDefsAreDead())
1104 return false;
1105
1106 unsigned Opc = MI->getOpcode();
1107
1108 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1109 if (Src0Idx == -1)
1110 return false;
1111 MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));
1112
1113 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1114 Opc == AMDGPU::S_NOT_B32) &&
1115 Src0->isImm()) {
1116 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1117 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1118 return true;
1119 }
1120
1121 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1122 if (Src1Idx == -1)
1123 return false;
1124 MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));
1125
1126 if (!Src0->isImm() && !Src1->isImm())
1127 return false;
1128
1129 // and k0, k1 -> v_mov_b32 (k0 & k1)
1130 // or k0, k1 -> v_mov_b32 (k0 | k1)
1131 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1132 if (Src0->isImm() && Src1->isImm()) {
1133 int32_t NewImm;
1134 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1135 return false;
1136
1137 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1138
1139 // Be careful to change the right operand, src0 may belong to a different
1140 // instruction.
1141 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1142 MI->removeOperand(Src1Idx);
1143 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1144 return true;
1145 }
1146
1147 if (!MI->isCommutable())
1148 return false;
1149
1150 if (Src0->isImm() && !Src1->isImm()) {
1151 std::swap(Src0, Src1);
1152 std::swap(Src0Idx, Src1Idx);
1153 }
1154
1155 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1156 if (Opc == AMDGPU::V_OR_B32_e64 ||
1157 Opc == AMDGPU::V_OR_B32_e32 ||
1158 Opc == AMDGPU::S_OR_B32) {
1159 if (Src1Val == 0) {
1160 // y = or x, 0 => y = copy x
1161 MI->removeOperand(Src1Idx);
1162 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1163 } else if (Src1Val == -1) {
1164 // y = or x, -1 => y = v_mov_b32 -1
1165 MI->removeOperand(Src1Idx);
1166 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1167 } else
1168 return false;
1169
1170 return true;
1171 }
1172
1173 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1174 Opc == AMDGPU::S_AND_B32) {
1175 if (Src1Val == 0) {
1176 // y = and x, 0 => y = v_mov_b32 0
1177 MI->removeOperand(Src0Idx);
1178 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1179 } else if (Src1Val == -1) {
1180 // y = and x, -1 => y = copy x
1181 MI->removeOperand(Src1Idx);
1182 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1183 } else
1184 return false;
1185
1186 return true;
1187 }
1188
1189 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1190 Opc == AMDGPU::S_XOR_B32) {
1191 if (Src1Val == 0) {
1192 // y = xor x, 0 => y = copy x
1193 MI->removeOperand(Src1Idx);
1194 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1195 return true;
1196 }
1197 }
1198
1199 return false;
1200}
1201
1202// Try to fold an instruction into a simpler one
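// For example, V_CNDMASK_B32_e64 0, %a, 0, %a, %cc (both sources identical
// and unmodified) is degenerate and can be rewritten as a copy of %a.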
1203bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
1204 unsigned Opc = MI.getOpcode();
1205 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1206 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1207 return false;
1208
1209 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1210 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1211 if (!Src1->isIdenticalTo(*Src0)) {
1212 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1213 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1214 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1215 return false;
1216 }
1217
1218 int Src1ModIdx =
1219 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1220 int Src0ModIdx =
1221 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1222 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1223 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1224 return false;
1225
1226 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1227 auto &NewDesc =
1228 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1229 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1230 if (Src2Idx != -1)
1231 MI.removeOperand(Src2Idx);
1232 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1233 if (Src1ModIdx != -1)
1234 MI.removeOperand(Src1ModIdx);
1235 if (Src0ModIdx != -1)
1236 MI.removeOperand(Src0ModIdx);
1237 mutateCopyOp(MI, NewDesc);
1238 LLVM_DEBUG(dbgs() << MI);
1239 return true;
1240}
1241
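// Fold away (V_AND_B32 0xffff, %x) when the instruction defining %x already
// zeroes the high 16 bits of its result; uses of the AND are rewritten to
// use %x directly.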
1242bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
1243 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1244 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1245 return false;
1246
1247 MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
1248 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1249 return false;
1250
1251 Register Src1 = MI.getOperand(2).getReg();
1252 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1253 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1254 return false;
1255
1256 Register Dst = MI.getOperand(0).getReg();
1257 MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
1258 MI.eraseFromParent();
1259 return true;
1260}
1261
1262bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
1263 MachineOperand &OpToFold) const {
1264 // We need mutate the operands of new mov instructions to add implicit
1265 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1266 // this.
1267 SmallVector<MachineInstr *, 4> CopiesToReplace;
1268 SmallVector<FoldCandidate, 4> FoldList;
1269 MachineOperand &Dst = MI.getOperand(0);
1270 bool Changed = false;
1271
1272 if (OpToFold.isImm()) {
1273 for (auto &UseMI :
1274 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1275 // Folding the immediate may reveal operations that can be constant
1276 // folded or replaced with a copy. This can happen for example after
1277 // frame indices are lowered to constants or from splitting 64-bit
1278 // constants.
1279 //
1280 // We may also encounter cases where one or both operands are
1281 // immediates materialized into a register, which would ordinarily not
1282 // be folded due to multiple uses or operand constraints.
1283 if (tryConstantFoldOp(&UseMI)) {
1284 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1285 Changed = true;
1286 }
1287 }
1288 }
1289
1290 SmallVector<MachineOperand *, 4> UsesToProcess;
1291 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1292 UsesToProcess.push_back(&Use);
1293 for (auto *U : UsesToProcess) {
1294 MachineInstr *UseMI = U->getParent();
1295 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1296 CopiesToReplace);
1297 }
1298
1299 if (CopiesToReplace.empty() && FoldList.empty())
1300 return Changed;
1301
1302 MachineFunction *MF = MI.getParent()->getParent();
1303 // Make sure we add EXEC uses to any new v_mov instructions created.
1304 for (MachineInstr *Copy : CopiesToReplace)
1305 Copy->addImplicitDefUseOperands(*MF);
1306
1307 for (FoldCandidate &Fold : FoldList) {
1308 assert(!Fold.isReg() || Fold.OpToFold);
1309 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1310 Register Reg = Fold.OpToFold->getReg();
1311 MachineInstr *DefMI = Fold.OpToFold->getParent();
1312 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1313 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1314 continue;
1315 }
1316 if (updateOperand(Fold)) {
1317 // Clear kill flags.
1318 if (Fold.isReg()) {
1319 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1320 // FIXME: Probably shouldn't bother trying to fold if not an
1321 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1322 // copies.
1323 MRI->clearKillFlags(Fold.OpToFold->getReg());
1324 }
1325 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1326 << static_cast<int>(Fold.UseOpNo) << " of "
1327 << *Fold.UseMI);
1328 } else if (Fold.Commuted) {
1329 // Restoring instruction's original operand order if fold has failed.
1330 TII->commuteInstruction(*Fold.UseMI, false);
1331 }
1332 }
1333 return true;
1334}
1335
1336bool SIFoldOperands::tryFoldFoldableCopy(
1337 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1338 // Specially track simple redefs of m0 to the same value in a block, so we
1339 // can erase the later ones.
1340 if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1341 MachineOperand &NewM0Val = MI.getOperand(1);
1342 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1343 MI.eraseFromParent();
1344 return true;
1345 }
1346
1347 // We aren't tracking other physical registers
1348 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1349 ? nullptr
1350 : &NewM0Val;
1351 return false;
1352 }
1353
1354 MachineOperand &OpToFold = MI.getOperand(1);
1355 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1356
1357 // FIXME: We could also be folding things like TargetIndexes.
1358 if (!FoldingImm && !OpToFold.isReg())
1359 return false;
1360
1361 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1362 return false;
1363
1364 // Prevent folding operands backwards in the function. For example,
1365 // the COPY opcode must not be replaced by 1 in this example:
1366 //
1367 // %3 = COPY %vgpr0; VGPR_32:%3
1368 // ...
1369 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1370 if (!MI.getOperand(0).getReg().isVirtual())
1371 return false;
1372
1373 bool Changed = foldInstOperand(MI, OpToFold);
1374
1375 // If we managed to fold all uses of this copy then we might as well
1376 // delete it now.
1377 // The only reason we need to follow chains of copies here is that
1378 // tryFoldRegSequence looks forward through copies before folding a
1379 // REG_SEQUENCE into its eventual users.
1380 auto *InstToErase = &MI;
1381 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1382 auto &SrcOp = InstToErase->getOperand(1);
1383 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1384 InstToErase->eraseFromParent();
1385 Changed = true;
1386 InstToErase = nullptr;
1387 if (!SrcReg || SrcReg.isPhysical())
1388 break;
1389 InstToErase = MRI->getVRegDef(SrcReg);
1390 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1391 break;
1392 }
1393
1394 if (InstToErase && InstToErase->isRegSequence() &&
1395 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1396 InstToErase->eraseFromParent();
1397 Changed = true;
1398 }
1399
1400 return Changed;
1401}
1402
1403// Clamp patterns are canonically selected to v_max_* instructions, so only
1404// handle them.
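// For example, %r = V_MAX_F32_e64 0, %x, 0, %x, clamp, 0 is a canonical
// clamp of %x; tryFoldClamp then moves the clamp bit onto the instruction
// defining %x and erases the max.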
1405const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1406 unsigned Op = MI.getOpcode();
1407 switch (Op) {
1408 case AMDGPU::V_MAX_F32_e64:
1409 case AMDGPU::V_MAX_F16_e64:
1410 case AMDGPU::V_MAX_F16_t16_e64:
1411 case AMDGPU::V_MAX_F64_e64:
1412 case AMDGPU::V_PK_MAX_F16: {
1413 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1414 return nullptr;
1415
1416 // Make sure sources are identical.
1417 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1418 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1419 if (!Src0->isReg() || !Src1->isReg() ||
1420 Src0->getReg() != Src1->getReg() ||
1421 Src0->getSubReg() != Src1->getSubReg() ||
1422 Src0->getSubReg() != AMDGPU::NoSubRegister)
1423 return nullptr;
1424
1425 // Can't fold up if we have modifiers.
1426 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1427 return nullptr;
1428
1429 unsigned Src0Mods
1430 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1431 unsigned Src1Mods
1432 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1433
1434 // Having a 0 op_sel_hi would require swizzling the output in the source
1435 // instruction, which we can't do.
1436 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1437 : 0u;
1438 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1439 return nullptr;
1440 return Src0;
1441 }
1442 default:
1443 return nullptr;
1444 }
1445}
1446
1447// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1448bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1449 const MachineOperand *ClampSrc = isClamp(MI);
1450 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1451 return false;
1452
1453 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1454
1455 // The type of clamp must be compatible.
1456 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1457 return false;
1458
1459 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1460 if (!DefClamp)
1461 return false;
1462
1463 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1464
1465 // Clamp is applied after omod, so it is OK if omod is set.
1466 DefClamp->setImm(1);
1467 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1468 MI.eraseFromParent();
1469
1470 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1471 // instruction, so we might as well convert it to the more flexible VOP3-only
1472 // mad/fma form.
1473 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1474 Def->eraseFromParent();
1475
1476 return true;
1477}
1478
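// Map the constant multiplier of a v_mul to its output-modifier encoding:
// 0.5 -> DIV2, 2.0 -> MUL2, 4.0 -> MUL4, anything else -> NONE.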
1479static int getOModValue(unsigned Opc, int64_t Val) {
1480 switch (Opc) {
1481 case AMDGPU::V_MUL_F64_e64: {
1482 switch (Val) {
1483 case 0x3fe0000000000000: // 0.5
1484 return SIOutMods::DIV2;
1485 case 0x4000000000000000: // 2.0
1486 return SIOutMods::MUL2;
1487 case 0x4010000000000000: // 4.0
1488 return SIOutMods::MUL4;
1489 default:
1490 return SIOutMods::NONE;
1491 }
1492 }
1493 case AMDGPU::V_MUL_F32_e64: {
1494 switch (static_cast<uint32_t>(Val)) {
1495 case 0x3f000000: // 0.5
1496 return SIOutMods::DIV2;
1497 case 0x40000000: // 2.0
1498 return SIOutMods::MUL2;
1499 case 0x40800000: // 4.0
1500 return SIOutMods::MUL4;
1501 default:
1502 return SIOutMods::NONE;
1503 }
1504 }
1505 case AMDGPU::V_MUL_F16_e64:
1506 case AMDGPU::V_MUL_F16_t16_e64: {
1507 switch (static_cast<uint16_t>(Val)) {
1508 case 0x3800: // 0.5
1509 return SIOutMods::DIV2;
1510 case 0x4000: // 2.0
1511 return SIOutMods::MUL2;
1512 case 0x4400: // 4.0
1513 return SIOutMods::MUL4;
1514 default:
1515 return SIOutMods::NONE;
1516 }
1517 }
1518 default:
1519 llvm_unreachable("invalid mul opcode");
1520 }
1521}
1522
1523// FIXME: Does this really not support denormals with f16?
1524// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1525// handled, so will anything other than that break?
1526std::pair<const MachineOperand *, int>
1527SIFoldOperands::isOMod(const MachineInstr &MI) const {
1528 unsigned Op = MI.getOpcode();
1529 switch (Op) {
1530 case AMDGPU::V_MUL_F64_e64:
1531 case AMDGPU::V_MUL_F32_e64:
1532 case AMDGPU::V_MUL_F16_t16_e64:
1533 case AMDGPU::V_MUL_F16_e64: {
1534 // If output denormals are enabled, omod is ignored.
1535 if ((Op == AMDGPU::V_MUL_F32_e64 &&
1536 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1537 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
1538 Op == AMDGPU::V_MUL_F16_t16_e64) &&
1539 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1540 return std::pair(nullptr, SIOutMods::NONE);
1541
1542 const MachineOperand *RegOp = nullptr;
1543 const MachineOperand *ImmOp = nullptr;
1544 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1545 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1546 if (Src0->isImm()) {
1547 ImmOp = Src0;
1548 RegOp = Src1;
1549 } else if (Src1->isImm()) {
1550 ImmOp = Src1;
1551 RegOp = Src0;
1552 } else
1553 return std::pair(nullptr, SIOutMods::NONE);
1554
1555 int OMod = getOModValue(Op, ImmOp->getImm());
1556 if (OMod == SIOutMods::NONE ||
1557 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1558 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1559 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1560 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1561 return std::pair(nullptr, SIOutMods::NONE);
1562
1563 return std::pair(RegOp, OMod);
1564 }
1565 case AMDGPU::V_ADD_F64_e64:
1566 case AMDGPU::V_ADD_F32_e64:
1567 case AMDGPU::V_ADD_F16_e64:
1568 case AMDGPU::V_ADD_F16_t16_e64: {
1569 // If output denormals are enabled, omod is ignored.
1570 if ((Op == AMDGPU::V_ADD_F32_e64 &&
1571 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1572 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
1573 Op == AMDGPU::V_ADD_F16_t16_e64) &&
1574 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1575 return std::pair(nullptr, SIOutMods::NONE);
1576
1577 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1578 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1579 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1580
1581 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1582 Src0->getSubReg() == Src1->getSubReg() &&
1583 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1584 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1585 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1586 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1587 return std::pair(Src0, SIOutMods::MUL2);
1588
1589 return std::pair(nullptr, SIOutMods::NONE);
1590 }
1591 default:
1592 return std::pair(nullptr, SIOutMods::NONE);
1593 }
1594}
1595
1596// FIXME: Does this need to check IEEE bit on function?
1597bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1598 const MachineOperand *RegOp;
1599 int OMod;
1600 std::tie(RegOp, OMod) = isOMod(MI);
1601 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1602 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1603 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1604 return false;
1605
1606 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1607 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1608 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1609 return false;
1610
1611 // Clamp is applied after omod. If the source already has clamp set, don't
1612 // fold it.
1613 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1614 return false;
1615
1616 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1617
1618 DefOMod->setImm(OMod);
1619 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1620 MI.eraseFromParent();
1621
1622 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1623 // instruction, so we might as well convert it to the more flexible VOP3-only
1624 // mad/fma form.
1625 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1626 Def->eraseFromParent();
1627
1628 return true;
1629}
1630
1631// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1632// instruction which can take an agpr. So far that means a store.
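// For example, a REG_SEQUENCE building a VGPR tuple purely from AGPR inputs
// (or from copies of AGPRs) whose single user accepts an AV operand is
// rewritten into an AGPR-class REG_SEQUENCE so the intervening copies can be
// removed.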
1633bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1634 assert(MI.isRegSequence());
1635 auto Reg = MI.getOperand(0).getReg();
1636
1637 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1638 !MRI->hasOneNonDBGUse(Reg))
1639 return false;
1640
1641 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
1642 if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1643 return false;
1644
1645 for (auto &Def : Defs) {
1646 const auto *Op = Def.first;
1647 if (!Op->isReg())
1648 return false;
1649 if (TRI->isAGPR(*MRI, Op->getReg()))
1650 continue;
1651 // Maybe this is a COPY from AREG
1652 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1653 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1654 return false;
1655 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1656 return false;
1657 }
1658
1659 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1660 MachineInstr *UseMI = Op->getParent();
1661 while (UseMI->isCopy() && !Op->getSubReg()) {
1662 Reg = UseMI->getOperand(0).getReg();
1663 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1664 return false;
1665 Op = &*MRI->use_nodbg_begin(Reg);
1666 UseMI = Op->getParent();
1667 }
1668
1669 if (Op->getSubReg())
1670 return false;
1671
1672 unsigned OpIdx = Op - &UseMI->getOperand(0);
1673 const MCInstrDesc &InstDesc = UseMI->getDesc();
1674 const TargetRegisterClass *OpRC =
1675 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1676 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1677 return false;
1678
1679 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1680 auto Dst = MRI->createVirtualRegister(NewDstRC);
1681 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1682 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1683
1684 for (unsigned I = 0; I < Defs.size(); ++I) {
1685 MachineOperand *Def = Defs[I].first;
1686 Def->setIsKill(false);
1687 if (TRI->isAGPR(*MRI, Def->getReg())) {
1688 RS.add(*Def);
1689 } else { // This is a copy
1690 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1691 SubDef->getOperand(1).setIsKill(false);
1692 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1693 }
1694 RS.addImm(Defs[I].second);
1695 }
1696
1697 Op->setReg(Dst);
1698 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1699 Op->setReg(Reg);
1700 RS->eraseFromParent();
1701 return false;
1702 }
1703
1704 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1705
1706 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1707 // in which case we can erase them all later in runOnMachineFunction.
1708 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1709 MI.eraseFromParent();
1710 return true;
1711}
1712
1713/// Checks whether \p Copy is a AGPR -> VGPR copy. Returns `true` on success and
1714/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
1715static bool isAGPRCopy(const SIRegisterInfo &TRI,
1716 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1717 Register &OutReg, unsigned &OutSubReg) {
1718 assert(Copy.isCopy());
1719
1720 const MachineOperand &CopySrc = Copy.getOperand(1);
1721 Register CopySrcReg = CopySrc.getReg();
1722 if (!CopySrcReg.isVirtual())
1723 return false;
1724
1725 // Common case: copy from AGPR directly, e.g.
1726 // %1:vgpr_32 = COPY %0:agpr_32
1727 if (TRI.isAGPR(MRI, CopySrcReg)) {
1728 OutReg = CopySrcReg;
1729 OutSubReg = CopySrc.getSubReg();
1730 return true;
1731 }
1732
1733 // Sometimes it can also involve two copies, e.g.
1734 // %1:vgpr_256 = COPY %0:agpr_256
1735 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
1736 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
1737 if (!CopySrcDef || !CopySrcDef->isCopy())
1738 return false;
1739
1740 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
1741 Register OtherCopySrcReg = OtherCopySrc.getReg();
1742 if (!OtherCopySrcReg.isVirtual() ||
1743 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
1744 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
1745 !TRI.isAGPR(MRI, OtherCopySrcReg))
1746 return false;
1747
1748 OutReg = OtherCopySrcReg;
1749 OutSubReg = CopySrc.getSubReg();
1750 return true;
1751}
1752
1753// Try to hoist an AGPR to VGPR copy across a PHI.
1754// This should allow folding of an AGPR into a consumer which may support it.
1755//
1756// Example 1: LCSSA PHI
1757// loop:
1758// %1:vreg = COPY %0:areg
1759// exit:
1760// %2:vreg = PHI %1:vreg, %loop
1761// =>
1762// loop:
1763// exit:
1764// %1:areg = PHI %0:areg, %loop
1765// %2:vreg = COPY %1:areg
1766//
1767// Example 2: PHI with multiple incoming values:
1768// entry:
1769// %1:vreg = GLOBAL_LOAD(..)
1770// loop:
1771// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
1772// %3:areg = COPY %2:vreg
1773// %4:areg = (instr using %3:areg)
1774// %5:vreg = COPY %4:areg
1775// =>
1776// entry:
1777// %1:vreg = GLOBAL_LOAD(..)
1778// %2:areg = COPY %1:vreg
1779// loop:
1780// %3:areg = PHI %2:areg, %entry, %X:areg,
1781// %4:areg = (instr using %3:areg)
1782bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
1783 assert(PHI.isPHI());
1784
1785 Register PhiOut = PHI.getOperand(0).getReg();
1786 if (!TRI->isVGPR(*MRI, PhiOut))
1787 return false;
1788
1789 // Iterate once over all incoming values of the PHI to check if this PHI is
1790 // eligible, and determine the exact AGPR RC we'll target.
1791 const TargetRegisterClass *ARC = nullptr;
1792 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1793 MachineOperand &MO = PHI.getOperand(K);
1794 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
1795 if (!Copy || !Copy->isCopy())
1796 continue;
1797
1798 Register AGPRSrc;
1799 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1800 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
1801 continue;
1802
1803 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
1804 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1805 CopyInRC = SubRC;
1806
1807 if (ARC && !ARC->hasSubClassEq(CopyInRC))
1808 return false;
1809 ARC = CopyInRC;
1810 }
1811
1812 if (!ARC)
1813 return false;
1814
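  // Every incoming value that comes from an AGPR copy agrees on (a subclass
  // of) ARC, so the PHI can be rewritten to produce an AGPR of that class.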
1815 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1816
1817 // Rewrite the PHI's incoming values to ARC.
1818 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
1819 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1820 MachineOperand &MO = PHI.getOperand(K);
1821 Register Reg = MO.getReg();
1822
1823    MachineBasicBlock::iterator InsertPt;
1824    MachineBasicBlock *InsertMBB = nullptr;
1825
1826 // Look at the def of Reg, ignoring all copies.
1827 unsigned CopyOpc = AMDGPU::COPY;
1828 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
1829
1830 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
1831 // the copy was single-use, it will be removed by DCE later.
1832 if (Def->isCopy()) {
1833 Register AGPRSrc;
1834 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1835 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
1836 MO.setReg(AGPRSrc);
1837 MO.setSubReg(AGPRSubReg);
1838 continue;
1839 }
1840
1841 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
1842        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
1843 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
1844 // is unlikely to be profitable.
1845 //
1846 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
1847 MachineOperand &CopyIn = Def->getOperand(1);
1848 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
1849 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
1850 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1851 }
1852
1853 InsertMBB = Def->getParent();
1854 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
1855 } else {
1856 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
1857 InsertPt = InsertMBB->getFirstTerminator();
1858 }
1859
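    // Materialize the incoming value in a register of class ARC, inserting the
    // copy right after its def (or at the end of the incoming block when the
    // virtual register has no defining instruction).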
1860 Register NewReg = MRI->createVirtualRegister(ARC);
1861 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
1862 TII->get(CopyOpc), NewReg)
1863 .addReg(Reg);
1864 MO.setReg(NewReg);
1865
1866 (void)MI;
1867 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
1868 }
1869
1870 // Replace the PHI's result with a new register.
1871 Register NewReg = MRI->createVirtualRegister(ARC);
1872 PHI.getOperand(0).setReg(NewReg);
1873
1874 // COPY that new register back to the original PhiOut register. This COPY will
1875 // usually be folded out later.
1876 MachineBasicBlock *MBB = PHI.getParent();
1877 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
1878 TII->get(AMDGPU::COPY), PhiOut)
1879 .addReg(NewReg);
1880
1881 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
1882 return true;
1883}
1884
1885// Attempt to convert a VGPR load to an AGPR load.
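// Illustrative sketch (assumed opcodes and classes, not from the original source):
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD %vaddr, 0, 0
//   %1:agpr_32 = COPY %0
// =>
//   %0:agpr_32 = GLOBAL_LOAD_DWORD %vaddr, 0, 0
//   %1:agpr_32 = COPY %0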
1886bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
1887 assert(MI.mayLoad());
1888 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
1889 return false;
1890
1891 MachineOperand &Def = MI.getOperand(0);
1892 if (!Def.isDef())
1893 return false;
1894
1895 Register DefReg = Def.getReg();
1896
1897 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
1898 return false;
1899
1900  SmallVector<const MachineInstr *, 8> Users;
1901  SmallVector<Register, 8> MoveRegs;
1902 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
1903 Users.push_back(&I);
1904
1905 if (Users.empty())
1906 return false;
1907
1908  // Check that every use is a copy to an agpr or a reg_sequence producing an agpr.
1909 while (!Users.empty()) {
1910 const MachineInstr *I = Users.pop_back_val();
1911 if (!I->isCopy() && !I->isRegSequence())
1912 return false;
1913 Register DstReg = I->getOperand(0).getReg();
1914    // Physical registers may have more than one defining instruction.
1915 if (DstReg.isPhysical())
1916 return false;
1917 if (TRI->isAGPR(*MRI, DstReg))
1918 continue;
1919 MoveRegs.push_back(DstReg);
1920 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
1921 Users.push_back(&U);
1922 }
1923
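  // All transitive users are copies or reg_sequences that feed AGPRs, so switch
  // the load result (and each intermediate virtual register) to the equivalent
  // AGPR class, provided the load remains legal with an AGPR destination.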
1924 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
1925 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
1926 if (!TII->isOperandLegal(MI, 0, &Def)) {
1927 MRI->setRegClass(DefReg, RC);
1928 return false;
1929 }
1930
1931 while (!MoveRegs.empty()) {
1932 Register Reg = MoveRegs.pop_back_val();
1933 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
1934 }
1935
1936 LLVM_DEBUG(dbgs() << "Folded " << MI);
1937
1938 return true;
1939}
1940
1941// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
1942// For GFX90A and later, this is pretty much always a good thing, but for GFX908
1943// there are cases where it can create many more AGPR-AGPR copies, which are
1944// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
1945//
1946// This function looks at all AGPR PHIs in a basic block and collects their
1947// operands. Then, it checks for registers that are used more than once across
1948// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
1949// having to create one VGPR temporary per use, which can get very messy if
1950// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
1951// element).
1952//
1953// Example
1954// a:
1955// %in:agpr_256 = COPY %foo:vgpr_256
1956// c:
1957// %x:agpr_32 = ..
1958// b:
1959// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
1960// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
1961// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
1962// =>
1963// a:
1964// %in:agpr_256 = COPY %foo:vgpr_256
1965// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
1966// %tmp_agpr:agpr_32 = COPY %tmp
1967// c:
1968// %x:agpr_32 = ..
1969// b:
1970// %0:areg = PHI %tmp_agpr, %a, %x, %c
1971// %1:areg = PHI %tmp_agpr, %a, %y, %c
1972// %2:areg = PHI %tmp_agpr, %a, %z, %c
1973bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
1974 // This is only really needed on GFX908 where AGPR-AGPR copies are
1975 // unreasonably difficult.
1976 if (ST->hasGFX90AInsts())
1977 return false;
1978
1979 // Look at all AGPR Phis and collect the register + subregister used.
1980 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
1981 RegToMO;
1982
1983 for (auto &MI : MBB) {
1984 if (!MI.isPHI())
1985 break;
1986
1987 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
1988 continue;
1989
1990 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
1991 MachineOperand &PhiMO = MI.getOperand(K);
1992 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
1993 }
1994 }
1995
1996  // For each (Reg, SubReg) pair that is used more than once, cache the value in
1997 // a VGPR.
1998 bool Changed = false;
1999 for (const auto &[Entry, MOs] : RegToMO) {
2000 if (MOs.size() == 1)
2001 continue;
2002
2003 const auto [Reg, SubReg] = Entry;
2004 MachineInstr *Def = MRI->getVRegDef(Reg);
2005 MachineBasicBlock *DefMBB = Def->getParent();
2006
2007 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2008 // out.
2009 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2010 Register TempVGPR =
2011 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2012 MachineInstr *VGPRCopy =
2013 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2014 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2015 .addReg(Reg, /* flags */ 0, SubReg);
2016
2017 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2018 Register TempAGPR = MRI->createVirtualRegister(ARC);
2019 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2020 TII->get(AMDGPU::COPY), TempAGPR)
2021 .addReg(TempVGPR);
2022
2023 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2024 for (MachineOperand *MO : MOs) {
2025 MO->setReg(TempAGPR);
2026 MO->setSubReg(AMDGPU::NoSubRegister);
2027 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2028 }
2029
2030 Changed = true;
2031 }
2032
2033 return Changed;
2034}
2035
2036bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
2037 if (skipFunction(MF.getFunction()))
2038 return false;
2039
2040 MRI = &MF.getRegInfo();
2041 ST = &MF.getSubtarget<GCNSubtarget>();
2042 TII = ST->getInstrInfo();
2043 TRI = &TII->getRegisterInfo();
2044 MFI = MF.getInfo<SIMachineFunctionInfo>();
2045
2046  // omod is ignored by the hardware if the IEEE bit is enabled. omod also does not
2047 // correctly handle signed zeros.
2048 //
2049 // FIXME: Also need to check strictfp
2050 bool IsIEEEMode = MFI->getMode().IEEE;
2051 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2052
2053 bool Changed = false;
2054 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2055 MachineOperand *CurrentKnownM0Val = nullptr;
2056 for (auto &MI : make_early_inc_range(*MBB)) {
2057 Changed |= tryFoldCndMask(MI);
2058
2059 if (tryFoldZeroHighBits(MI)) {
2060 Changed = true;
2061 continue;
2062 }
2063
2064 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2065 Changed = true;
2066 continue;
2067 }
2068
2069 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2070 Changed = true;
2071 continue;
2072 }
2073
2074 if (MI.mayLoad() && tryFoldLoad(MI)) {
2075 Changed = true;
2076 continue;
2077 }
2078
2079 if (TII->isFoldableCopy(MI)) {
2080 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2081 continue;
2082 }
2083
2084 // Saw an unknown clobber of m0, so we no longer know what it is.
2085 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2086 CurrentKnownM0Val = nullptr;
2087
2088 // TODO: Omod might be OK if there is NSZ only on the source
2089 // instruction, and not the omod multiply.
2090 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2091 !tryFoldOMod(MI))
2092 Changed |= tryFoldClamp(MI);
2093 }
2094
2095 Changed |= tryOptimizeAGPRPhis(*MBB);
2096 }
2097
2098 return Changed;
2099}