SIFoldOperands.cpp
1//===-- SIFoldOperands.cpp - Fold operands ---------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
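/// This pass folds immediates, frame indexes, global addresses and registers
/// defined by mov-like instructions into their uses, and also folds clamp and
/// output modifiers back into the defining VALU instructions.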
8//===----------------------------------------------------------------------===//
9//
10
11#include "AMDGPU.h"
12#include "GCNSubtarget.h"
18
19#define DEBUG_TYPE "si-fold-operands"
20using namespace llvm;
21
22namespace {
23
24struct FoldCandidate {
25 MachineInstr *UseMI;
26 union {
27 MachineOperand *OpToFold;
28 uint64_t ImmToFold;
29 int FrameIndexToFold;
30 };
31 int ShrinkOpcode;
32 unsigned UseOpNo;
33 MachineOperand::MachineOperandType Kind;
34 bool Commuted;
35
36 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
37 bool Commuted_ = false,
38 int ShrinkOp = -1) :
39 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
40 Kind(FoldOp->getType()),
41 Commuted(Commuted_) {
42 if (FoldOp->isImm()) {
43 ImmToFold = FoldOp->getImm();
44 } else if (FoldOp->isFI()) {
45 FrameIndexToFold = FoldOp->getIndex();
46 } else {
47 assert(FoldOp->isReg() || FoldOp->isGlobal());
48 OpToFold = FoldOp;
49 }
50 }
51
52 bool isFI() const {
53 return Kind == MachineOperand::MO_FrameIndex;
54 }
55
56 bool isImm() const {
57 return Kind == MachineOperand::MO_Immediate;
58 }
59
60 bool isReg() const {
61 return Kind == MachineOperand::MO_Register;
62 }
63
64 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
65
66 bool needsShrink() const { return ShrinkOpcode != -1; }
67};
68
69class SIFoldOperands : public MachineFunctionPass {
70public:
71 static char ID;
73 const SIInstrInfo *TII;
74 const SIRegisterInfo *TRI;
75 const GCNSubtarget *ST;
76 const SIMachineFunctionInfo *MFI;
77
78 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
79 const MachineOperand &OpToFold) const;
80
81 bool updateOperand(FoldCandidate &Fold) const;
82
83 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
84 MachineInstr *MI, unsigned OpNo,
85 MachineOperand *OpToFold) const;
86 bool isUseSafeToFold(const MachineInstr &MI,
87 const MachineOperand &UseMO) const;
88 bool
89 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
90 Register UseReg, uint8_t OpTy) const;
91 bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
92 unsigned UseOpIdx,
93 SmallVectorImpl<FoldCandidate> &FoldList) const;
94 void foldOperand(MachineOperand &OpToFold,
95 MachineInstr *UseMI,
96 int UseOpIdx,
97 SmallVectorImpl<FoldCandidate> &FoldList,
98 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
99
100 MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
101 bool tryConstantFoldOp(MachineInstr *MI) const;
102 bool tryFoldCndMask(MachineInstr &MI) const;
103 bool tryFoldZeroHighBits(MachineInstr &MI) const;
104 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
105 bool tryFoldFoldableCopy(MachineInstr &MI,
106 MachineOperand *&CurrentKnownM0Val) const;
107
108 const MachineOperand *isClamp(const MachineInstr &MI) const;
109 bool tryFoldClamp(MachineInstr &MI);
110
111 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
112 bool tryFoldOMod(MachineInstr &MI);
113 bool tryFoldRegSequence(MachineInstr &MI);
114 bool tryFoldPhiAGPR(MachineInstr &MI);
115 bool tryFoldLoad(MachineInstr &MI);
116
117 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
118
119public:
120 SIFoldOperands() : MachineFunctionPass(ID) {
121 initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
122 }
123
124 bool runOnMachineFunction(MachineFunction &MF) override;
125
126 StringRef getPassName() const override { return "SI Fold Operands"; }
127
128 void getAnalysisUsage(AnalysisUsage &AU) const override {
129 AU.setPreservesCFG();
130 MachineFunctionPass::getAnalysisUsage(AU);
131 }
132};
133
134} // End anonymous namespace.
135
136INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
137 "SI Fold Operands", false, false)
138
139char SIFoldOperands::ID = 0;
140
141char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
142
143static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
144 const TargetRegisterInfo &TRI,
145 const MachineOperand &MO) {
146 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
147 if (const TargetRegisterClass *SubRC =
148 TRI.getSubRegisterClass(RC, MO.getSubReg()))
149 RC = SubRC;
150 return RC;
151}
152
153// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
154static unsigned macToMad(unsigned Opc) {
155 switch (Opc) {
156 case AMDGPU::V_MAC_F32_e64:
157 return AMDGPU::V_MAD_F32_e64;
158 case AMDGPU::V_MAC_F16_e64:
159 return AMDGPU::V_MAD_F16_e64;
160 case AMDGPU::V_FMAC_F32_e64:
161 return AMDGPU::V_FMA_F32_e64;
162 case AMDGPU::V_FMAC_F16_e64:
163 return AMDGPU::V_FMA_F16_gfx9_e64;
164 case AMDGPU::V_FMAC_F16_t16_e64:
165 return AMDGPU::V_FMA_F16_gfx9_e64;
166 case AMDGPU::V_FMAC_LEGACY_F32_e64:
167 return AMDGPU::V_FMA_LEGACY_F32_e64;
168 case AMDGPU::V_FMAC_F64_e64:
169 return AMDGPU::V_FMA_F64_e64;
170 }
171 return AMDGPU::INSTRUCTION_LIST_END;
172}
173
174// TODO: Add heuristic that the frame index might not fit in the addressing mode
175// immediate offset to avoid materializing in loops.
176bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
177 const MachineOperand &OpToFold) const {
178 if (!OpToFold.isFI())
179 return false;
180
181 const unsigned Opc = UseMI.getOpcode();
182 if (TII->isMUBUF(UseMI))
183 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
184 if (!TII->isFLATScratch(UseMI))
185 return false;
186
187 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
188 if (OpNo == SIdx)
189 return true;
190
191 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
192 return OpNo == VIdx && SIdx == -1;
193}
194
195FunctionPass *llvm::createSIFoldOperandsPass() {
196 return new SIFoldOperands();
197}
198
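// Apply a single FoldCandidate to its use instruction: rewrite the use operand
// in place as an immediate, frame index, global address or register, handling
// the special cases of packed 16-bit literals (op_sel adjustment) and VOP2
// carry-out instructions that must first be shrunk to their 32-bit encoding.
// Returns false if the fold turns out not to be possible.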
199bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
200 MachineInstr *MI = Fold.UseMI;
201 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
202 assert(Old.isReg());
203
204
205 const uint64_t TSFlags = MI->getDesc().TSFlags;
206 if (Fold.isImm()) {
207 if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
208 (!ST->hasDOTOpSelHazard() || !(TSFlags & SIInstrFlags::IsDOT)) &&
209 AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
210 ST->hasInv2PiInlineImm())) {
211 // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
212 // already set.
213 unsigned Opcode = MI->getOpcode();
214 int OpNo = MI->getOperandNo(&Old);
215 int ModIdx = -1;
216 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
217 ModIdx = AMDGPU::OpName::src0_modifiers;
218 else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
219 ModIdx = AMDGPU::OpName::src1_modifiers;
220 else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
221 ModIdx = AMDGPU::OpName::src2_modifiers;
222 assert(ModIdx != -1);
223 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
224 MachineOperand &Mod = MI->getOperand(ModIdx);
225 unsigned Val = Mod.getImm();
226 if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
227 // Only apply the following transformation if that operand requires
228 // a packed immediate.
229 switch (TII->get(Opcode).operands()[OpNo].OperandType) {
230 case AMDGPU::OPERAND_REG_IMM_V2FP16:
231 case AMDGPU::OPERAND_REG_IMM_V2INT16:
232 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
233 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
234 // If upper part is all zero we do not need op_sel_hi.
235 if (!isUInt<16>(Fold.ImmToFold)) {
236 if (!(Fold.ImmToFold & 0xffff)) {
237 Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
238 Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
239 Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
240 return true;
241 }
242 Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
243 Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
244 return true;
245 }
246 break;
247 default:
248 break;
249 }
250 }
251 }
252 }
253
254 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
255 MachineBasicBlock *MBB = MI->getParent();
256 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
257 if (Liveness != MachineBasicBlock::LQR_Dead) {
258 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
259 return false;
260 }
261
262 int Op32 = Fold.ShrinkOpcode;
263 MachineOperand &Dst0 = MI->getOperand(0);
264 MachineOperand &Dst1 = MI->getOperand(1);
265 assert(Dst0.isDef() && Dst1.isDef());
266
267 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
268
269 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
270 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
271
272 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
273
274 if (HaveNonDbgCarryUse) {
275 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
276 Dst1.getReg())
277 .addReg(AMDGPU::VCC, RegState::Kill);
278 }
279
280 // Keep the old instruction around to avoid breaking iterators, but
281 // replace it with a dummy instruction to remove uses.
282 //
283 // FIXME: We should not invert how this pass looks at operands to avoid
284 // this. Should track set of foldable movs instead of looking for uses
285 // when looking at a use.
286 Dst0.setReg(NewReg0);
287 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
288 MI->removeOperand(I);
289 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
290
291 if (Fold.Commuted)
292 TII->commuteInstruction(*Inst32, false);
293 return true;
294 }
295
296 assert(!Fold.needsShrink() && "not handled");
297
298 if (Fold.isImm()) {
299 if (Old.isTied()) {
300 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
301 if (NewMFMAOpc == -1)
302 return false;
303 MI->setDesc(TII->get(NewMFMAOpc));
304 MI->untieRegOperand(0);
305 }
306 Old.ChangeToImmediate(Fold.ImmToFold);
307 return true;
308 }
309
310 if (Fold.isGlobal()) {
311 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
312 Fold.OpToFold->getTargetFlags());
313 return true;
314 }
315
316 if (Fold.isFI()) {
317 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
318 return true;
319 }
320
321 MachineOperand *New = Fold.OpToFold;
322 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
323 Old.setIsUndef(New->isUndef());
324 return true;
325}
326
327static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
328 const MachineInstr *MI) {
329 return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
330}
331
332static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
333 MachineInstr *MI, unsigned OpNo,
334 MachineOperand *FoldOp, bool Commuted = false,
335 int ShrinkOp = -1) {
336 // Skip additional folding on the same operand.
337 for (FoldCandidate &Fold : FoldList)
338 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
339 return;
340 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
341 << " operand " << OpNo << "\n " << *MI);
342 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
343}
344
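// Try to record OpToFold as a fold candidate for operand OpNo of MI. If the
// operand is not legal in place, this attempts mac->mad conversion,
// s_setreg_b32 -> s_setreg_imm32_b32 conversion, or commuting the instruction
// before giving up.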
345bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
346 MachineInstr *MI, unsigned OpNo,
347 MachineOperand *OpToFold) const {
348 if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
349 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
350 unsigned Opc = MI->getOpcode();
351 unsigned NewOpc = macToMad(Opc);
352 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
353 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
354 // to fold the operand.
355 MI->setDesc(TII->get(NewOpc));
356 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
357 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
358 if (AddOpSel)
359 MI->addOperand(MachineOperand::CreateImm(0));
360 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
361 if (FoldAsMAD) {
362 MI->untieRegOperand(OpNo);
363 return true;
364 }
365 if (AddOpSel)
366 MI->removeOperand(MI->getNumExplicitOperands() - 1);
367 MI->setDesc(TII->get(Opc));
368 }
369
370 // Special case for s_setreg_b32
371 if (OpToFold->isImm()) {
372 unsigned ImmOpc = 0;
373 if (Opc == AMDGPU::S_SETREG_B32)
374 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
375 else if (Opc == AMDGPU::S_SETREG_B32_mode)
376 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
377 if (ImmOpc) {
378 MI->setDesc(TII->get(ImmOpc));
379 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
380 return true;
381 }
382 }
383
384 // If we are already folding into another operand of MI, then
385 // we can't commute the instruction, otherwise we risk making the
386 // other fold illegal.
387 if (isUseMIInFoldList(FoldList, MI))
388 return false;
389
390 unsigned CommuteOpNo = OpNo;
391
392 // Operand is not legal, so try to commute the instruction to
393 // see if this makes it possible to fold.
394 unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
395 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
396 bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
397
398 if (CanCommute) {
399 if (CommuteIdx0 == OpNo)
400 CommuteOpNo = CommuteIdx1;
401 else if (CommuteIdx1 == OpNo)
402 CommuteOpNo = CommuteIdx0;
403 }
404
405
406 // One of operands might be an Imm operand, and OpNo may refer to it after
407 // the call of commuteInstruction() below. Such situations are avoided
408 // here explicitly as OpNo must be a register operand to be a candidate
409 // for memory folding.
410 if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
411 !MI->getOperand(CommuteIdx1).isReg()))
412 return false;
413
414 if (!CanCommute ||
415 !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
416 return false;
417
418 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
419 if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
420 Opc == AMDGPU::V_SUB_CO_U32_e64 ||
421 Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME
422 (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
423
424 // Verify the other operand is a VGPR, otherwise we would violate the
425 // constant bus restriction.
426 unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
427 MachineOperand &OtherOp = MI->getOperand(OtherIdx);
428 if (!OtherOp.isReg() ||
429 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
430 return false;
431
432 assert(MI->getOperand(1).isDef());
433
434 // Make sure to get the 32-bit version of the commuted opcode.
435 unsigned MaybeCommutedOpc = MI->getOpcode();
436 int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
437
438 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
439 return true;
440 }
441
442 TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
443 return false;
444 }
445
446 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
447 return true;
448 }
449
450 // Check the case where we might introduce a second constant operand to a
451 // scalar instruction
452 if (TII->isSALU(MI->getOpcode())) {
453 const MCInstrDesc &InstDesc = MI->getDesc();
454 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
455
456 // Fine if the operand can be encoded as an inline constant
457 if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
458 // Otherwise check for another constant
459 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
460 auto &Op = MI->getOperand(i);
461 if (OpNo != i && !Op.isReg() && !TII->isInlineConstant(Op, OpInfo))
462 return false;
463 }
464 }
465 }
466
467 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
468 return true;
469}
470
471bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
472 const MachineOperand &UseMO) const {
473 // Operands of SDWA instructions must be registers.
474 return !TII->isSDWA(MI);
475}
476
477// Find a def of the UseReg, check if it is a reg_sequence and find initializers
478// for each subreg, tracking it to foldable inline immediate if possible.
479// Returns true on success.
480bool SIFoldOperands::getRegSeqInit(
481 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
482 Register UseReg, uint8_t OpTy) const {
483 MachineInstr *Def = MRI->getVRegDef(UseReg);
484 if (!Def || !Def->isRegSequence())
485 return false;
486
487 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
488 MachineOperand *Sub = &Def->getOperand(I);
489 assert(Sub->isReg());
490
491 for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
492 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
493 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
494 SubDef = MRI->getVRegDef(Sub->getReg())) {
495 MachineOperand *Op = &SubDef->getOperand(1);
496 if (Op->isImm()) {
497 if (TII->isInlineConstant(*Op, OpTy))
498 Sub = Op;
499 break;
500 }
501 if (!Op->isReg() || Op->getReg().isPhysical())
502 break;
503 Sub = Op;
504 }
505
506 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
507 }
508
509 return true;
510}
511
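// Try to fold an inline immediate into an operand that accepts inline
// constants (the OPERAND_REG_INLINE_AC_* / OPERAND_REG_INLINE_C_* types),
// looking through copies of immediates and splat REG_SEQUENCE initializers.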
512bool SIFoldOperands::tryToFoldACImm(
513 const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
514 SmallVectorImpl<FoldCandidate> &FoldList) const {
515 const MCInstrDesc &Desc = UseMI->getDesc();
516 if (UseOpIdx >= Desc.getNumOperands())
517 return false;
518
519 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
520 if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
521 OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
522 (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
523 OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
524 return false;
525
526 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
527 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
528 appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
529 return true;
530 }
531
532 if (!OpToFold.isReg())
533 return false;
534
535 Register UseReg = OpToFold.getReg();
536 if (!UseReg.isVirtual())
537 return false;
538
539 if (isUseMIInFoldList(FoldList, UseMI))
540 return false;
541
542 // Maybe it is just a COPY of an immediate itself.
543 MachineInstr *Def = MRI->getVRegDef(UseReg);
544 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
545 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
546 MachineOperand &DefOp = Def->getOperand(1);
547 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
548 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
549 appendFoldCandidate(FoldList, UseMI, UseOpIdx, &DefOp);
550 return true;
551 }
552 }
553
554 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
555 if (!getRegSeqInit(Defs, UseReg, OpTy))
556 return false;
557
558 int32_t Imm;
559 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
560 const MachineOperand *Op = Defs[I].first;
561 if (!Op->isImm())
562 return false;
563
564 auto SubImm = Op->getImm();
565 if (!I) {
566 Imm = SubImm;
567 if (!TII->isInlineConstant(*Op, OpTy) ||
568 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
569 return false;
570
571 continue;
572 }
573 if (Imm != SubImm)
574 return false; // Can only fold splat constants
575 }
576
577 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
578 return true;
579}
580
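// Process a single use of OpToFold: either rewrite the use instruction
// directly (copies, REG_SEQUENCE users, frame indexes in MUBUF/scratch
// accesses, readfirstlane/readlane of an immediate or SGPR) or append a
// FoldCandidate to FoldList for later application by foldInstOperand.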
581void SIFoldOperands::foldOperand(
582 MachineOperand &OpToFold,
583 MachineInstr *UseMI,
584 int UseOpIdx,
585 SmallVectorImpl<FoldCandidate> &FoldList,
586 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
587 const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
588
589 if (!isUseSafeToFold(*UseMI, UseOp))
590 return;
591
592 // FIXME: Fold operands with subregs.
593 if (UseOp.isReg() && OpToFold.isReg() &&
594 (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
595 return;
596
597 // Special case for REG_SEQUENCE: We can't fold literals into
598 // REG_SEQUENCE instructions, so we have to fold them into the
599 // uses of REG_SEQUENCE.
600 if (UseMI->isRegSequence()) {
601 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
602 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
603
604 for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
605 MachineInstr *RSUseMI = RSUse.getParent();
606
607 if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
608 RSUseMI->getOperandNo(&RSUse), FoldList))
609 continue;
610
611 if (RSUse.getSubReg() != RegSeqDstSubReg)
612 continue;
613
614 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
615 CopiesToReplace);
616 }
617
618 return;
619 }
620
621 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
622 return;
623
624 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
625 // Verify that this is a stack access.
626 // FIXME: Should probably use stack pseudos before frame lowering.
627
628 if (TII->isMUBUF(*UseMI)) {
629 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
630 MFI->getScratchRSrcReg())
631 return;
632
633 // Ensure this is either relative to the current frame or the current
634 // wave.
635 MachineOperand &SOff =
636 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
637 if (!SOff.isImm() || SOff.getImm() != 0)
638 return;
639 }
640
641 // A frame index will resolve to a positive constant, so it should always be
642 // safe to fold the addressing mode, even pre-GFX9.
643 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
644
645 const unsigned Opc = UseMI->getOpcode();
646 if (TII->isFLATScratch(*UseMI) &&
647 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
648 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
649 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
650 UseMI->setDesc(TII->get(NewOpc));
651 }
652
653 return;
654 }
655
656 bool FoldingImmLike =
657 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
658
659 if (FoldingImmLike && UseMI->isCopy()) {
660 Register DestReg = UseMI->getOperand(0).getReg();
661 Register SrcReg = UseMI->getOperand(1).getReg();
662 assert(SrcReg.isVirtual());
663
664 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
665
666 // Don't fold into a copy to a physical register with the same class. Doing
667 // so would interfere with the register coalescer's logic which would avoid
668 // redundant initializations.
669 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
670 return;
671
672 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
673 if (!DestReg.isPhysical()) {
674 if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
675 SmallVector<FoldCandidate, 4> CopyUses;
676 for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
677 // There's no point trying to fold into an implicit operand.
678 if (Use.isImplicit())
679 continue;
680
681 CopyUses.emplace_back(Use.getParent(),
682 Use.getParent()->getOperandNo(&Use),
683 &UseMI->getOperand(1));
684 }
685
686 for (auto &F : CopyUses) {
687 foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList,
688 CopiesToReplace);
689 }
690 }
691
692 if (DestRC == &AMDGPU::AGPR_32RegClass &&
693 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
694 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
695 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
696 CopiesToReplace.push_back(UseMI);
697 return;
698 }
699 }
700
701 // In order to fold immediates into copies, we need to change the
702 // copy to a MOV.
703
704 unsigned MovOp = TII->getMovOpcode(DestRC);
705 if (MovOp == AMDGPU::COPY)
706 return;
707
708 UseMI->setDesc(TII->get(MovOp));
709 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
710 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
711 while (ImpOpI != ImpOpE) {
712 MachineInstr::mop_iterator Tmp = ImpOpI;
713 ImpOpI++;
714 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
715 }
716 CopiesToReplace.push_back(UseMI);
717 } else {
718 if (UseMI->isCopy() && OpToFold.isReg() &&
719 UseMI->getOperand(0).getReg().isVirtual() &&
720 !UseMI->getOperand(1).getSubReg()) {
721 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
722 unsigned Size = TII->getOpSize(*UseMI, 1);
723 Register UseReg = OpToFold.getReg();
724 UseMI->getOperand(1).setReg(UseReg);
725 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
726 UseMI->getOperand(1).setIsKill(false);
727 CopiesToReplace.push_back(UseMI);
728 OpToFold.setIsKill(false);
729
730 // Remove kill flags as kills may now be out of order with uses.
731 MRI->clearKillFlags(OpToFold.getReg());
732
733 // It is very tricky to store a value into an AGPR: v_accvgpr_write_b32
734 // can only accept VGPR or inline immediate. Recreate a reg_sequence with
735 // its initializers right here, so we will rematerialize immediates and
736 // avoid copies via different reg classes.
737 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
738 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
739 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
740 const DebugLoc &DL = UseMI->getDebugLoc();
741 MachineBasicBlock &MBB = *UseMI->getParent();
742
743 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
744 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
745 UseMI->removeOperand(I);
746
747 MachineInstrBuilder B(*MBB.getParent(), UseMI);
748 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
749 SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
750 for (unsigned I = 0; I < Size / 4; ++I) {
751 MachineOperand *Def = Defs[I].first;
752 TargetInstrInfo::RegSubRegPair CopyToVGPR;
753 if (Def->isImm() &&
754 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
755 int64_t Imm = Def->getImm();
756
757 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
758 BuildMI(MBB, UseMI, DL,
759 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
760 B.addReg(Tmp);
761 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
762 auto Src = getRegSubRegPair(*Def);
763 Def->setIsKill(false);
764 if (!SeenAGPRs.insert(Src)) {
765 // We cannot build a reg_sequence out of the same registers, they
766 // must be copied. Better do it here before copyPhysReg() created
767 // several reads to do the AGPR->VGPR->AGPR copy.
768 CopyToVGPR = Src;
769 } else {
770 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
771 Src.SubReg);
772 }
773 } else {
774 assert(Def->isReg());
775 Def->setIsKill(false);
776 auto Src = getRegSubRegPair(*Def);
777
778 // Direct copy from SGPR to AGPR is not possible. To avoid creation
779 // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
780 // create a copy here and track if we already have such a copy.
781 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
782 CopyToVGPR = Src;
783 } else {
784 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
785 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
786 B.addReg(Tmp);
787 }
788 }
789
790 if (CopyToVGPR.Reg) {
791 Register Vgpr;
792 if (VGPRCopies.count(CopyToVGPR)) {
793 Vgpr = VGPRCopies[CopyToVGPR];
794 } else {
795 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
796 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
797 VGPRCopies[CopyToVGPR] = Vgpr;
798 }
799 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
800 BuildMI(MBB, UseMI, DL,
801 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
802 B.addReg(Tmp);
803 }
804
805 B.addImm(Defs[I].second);
806 }
807 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
808 return;
809 }
810
811 if (Size != 4)
812 return;
813
814 Register Reg0 = UseMI->getOperand(0).getReg();
815 Register Reg1 = UseMI->getOperand(1).getReg();
816 if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
817 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
818 else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
819 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
820 else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
821 TRI->isAGPR(*MRI, Reg1))
822 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
823 return;
824 }
825
826 unsigned UseOpc = UseMI->getOpcode();
827 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
828 (UseOpc == AMDGPU::V_READLANE_B32 &&
829 (int)UseOpIdx ==
830 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
831 // %vgpr = V_MOV_B32 imm
832 // %sgpr = V_READFIRSTLANE_B32 %vgpr
833 // =>
834 // %sgpr = S_MOV_B32 imm
835 if (FoldingImmLike) {
836 if (execMayBeModifiedBeforeUse(*MRI,
837 UseMI->getOperand(UseOpIdx).getReg(),
838 *OpToFold.getParent(),
839 *UseMI))
840 return;
841
842 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
843
844 if (OpToFold.isImm())
845 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
846 else
847 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
848 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
849 return;
850 }
851
852 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
853 if (execMayBeModifiedBeforeUse(*MRI,
854 UseMI->getOperand(UseOpIdx).getReg(),
855 *OpToFold.getParent(),
856 *UseMI))
857 return;
858
859 // %vgpr = COPY %sgpr0
860 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
861 // =>
862 // %sgpr1 = COPY %sgpr0
863 UseMI->setDesc(TII->get(AMDGPU::COPY));
864 UseMI->getOperand(1).setReg(OpToFold.getReg());
865 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
866 UseMI->getOperand(1).setIsKill(false);
867 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
868 return;
869 }
870 }
871
872 const MCInstrDesc &UseDesc = UseMI->getDesc();
873
874 // Don't fold into target independent nodes. Target independent opcodes
875 // don't have defined register classes.
876 if (UseDesc.isVariadic() || UseOp.isImplicit() ||
877 UseDesc.operands()[UseOpIdx].RegClass == -1)
878 return;
879 }
880
881 if (!FoldingImmLike) {
882 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
883 // Don't fold if OpToFold doesn't hold an aligned register.
884 const TargetRegisterClass *RC =
885 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
886 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
887 unsigned SubReg = OpToFold.getSubReg();
888 if (const TargetRegisterClass *SubRC =
889 TRI->getSubRegisterClass(RC, SubReg))
890 RC = SubRC;
891 }
892
893 if (!RC || !TRI->isProperlyAlignedRC(*RC))
894 return;
895 }
896
897 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
898
899 // FIXME: We could try to change the instruction from 64-bit to 32-bit
900 // to enable more folding opportunities. The shrink operands pass
901 // already does this.
902 return;
903 }
904
905
906 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
907 const TargetRegisterClass *FoldRC =
908 TRI->getRegClass(FoldDesc.operands()[0].RegClass);
909
910 // Split 64-bit constants into 32-bits for folding.
911 if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
912 Register UseReg = UseOp.getReg();
913 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
914
915 if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
916 return;
917
918 APInt Imm(64, OpToFold.getImm());
919 if (UseOp.getSubReg() == AMDGPU::sub0) {
920 Imm = Imm.getLoBits(32);
921 } else {
922 assert(UseOp.getSubReg() == AMDGPU::sub1);
923 Imm = Imm.getHiBits(32);
924 }
925
926 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
927 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
928 return;
929 }
930
931 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
932}
933
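// Evaluate a 32-bit bitwise or shift instruction with both inputs known,
// writing the folded value to Result. Returns false for unhandled opcodes.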
934static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
935 uint32_t LHS, uint32_t RHS) {
936 switch (Opcode) {
937 case AMDGPU::V_AND_B32_e64:
938 case AMDGPU::V_AND_B32_e32:
939 case AMDGPU::S_AND_B32:
940 Result = LHS & RHS;
941 return true;
942 case AMDGPU::V_OR_B32_e64:
943 case AMDGPU::V_OR_B32_e32:
944 case AMDGPU::S_OR_B32:
945 Result = LHS | RHS;
946 return true;
947 case AMDGPU::V_XOR_B32_e64:
948 case AMDGPU::V_XOR_B32_e32:
949 case AMDGPU::S_XOR_B32:
950 Result = LHS ^ RHS;
951 return true;
952 case AMDGPU::S_XNOR_B32:
953 Result = ~(LHS ^ RHS);
954 return true;
955 case AMDGPU::S_NAND_B32:
956 Result = ~(LHS & RHS);
957 return true;
958 case AMDGPU::S_NOR_B32:
959 Result = ~(LHS | RHS);
960 return true;
961 case AMDGPU::S_ANDN2_B32:
962 Result = LHS & ~RHS;
963 return true;
964 case AMDGPU::S_ORN2_B32:
965 Result = LHS | ~RHS;
966 return true;
967 case AMDGPU::V_LSHL_B32_e64:
968 case AMDGPU::V_LSHL_B32_e32:
969 case AMDGPU::S_LSHL_B32:
970 // The instruction ignores the high bits for out of bounds shifts.
971 Result = LHS << (RHS & 31);
972 return true;
973 case AMDGPU::V_LSHLREV_B32_e64:
974 case AMDGPU::V_LSHLREV_B32_e32:
975 Result = RHS << (LHS & 31);
976 return true;
977 case AMDGPU::V_LSHR_B32_e64:
978 case AMDGPU::V_LSHR_B32_e32:
979 case AMDGPU::S_LSHR_B32:
980 Result = LHS >> (RHS & 31);
981 return true;
982 case AMDGPU::V_LSHRREV_B32_e64:
983 case AMDGPU::V_LSHRREV_B32_e32:
984 Result = RHS >> (LHS & 31);
985 return true;
986 case AMDGPU::V_ASHR_I32_e64:
987 case AMDGPU::V_ASHR_I32_e32:
988 case AMDGPU::S_ASHR_I32:
989 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
990 return true;
991 case AMDGPU::V_ASHRREV_I32_e64:
992 case AMDGPU::V_ASHRREV_I32_e32:
993 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
994 return true;
995 default:
996 return false;
997 }
998}
999
1000static unsigned getMovOpc(bool IsScalar) {
1001 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1002}
1003
1004static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1005 MI.setDesc(NewDesc);
1006
1007 // Remove any leftover implicit operands from mutating the instruction. e.g.
1008 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1009 // anymore.
1010 const MCInstrDesc &Desc = MI.getDesc();
1011 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1012 Desc.implicit_defs().size();
1013
1014 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1015 MI.removeOperand(I);
1016}
1017
1018MachineOperand *
1019SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
1020 // If this has a subregister, it obviously is a register source.
1021 if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
1022 !Op.getReg().isVirtual())
1023 return &Op;
1024
1025 MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1026 if (Def && Def->isMoveImmediate()) {
1027 MachineOperand &ImmSrc = Def->getOperand(1);
1028 if (ImmSrc.isImm())
1029 return &ImmSrc;
1030 }
1031
1032 return &Op;
1033}
1034
1035// Try to simplify operations with a constant that may appear after instruction
1036// selection.
1037// TODO: See if a frame index with a fixed offset can fold.
1038bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
1039 unsigned Opc = MI->getOpcode();
1040
1041 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1042 if (Src0Idx == -1)
1043 return false;
1044 MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));
1045
1046 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1047 Opc == AMDGPU::S_NOT_B32) &&
1048 Src0->isImm()) {
1049 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1050 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1051 return true;
1052 }
1053
1054 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1055 if (Src1Idx == -1)
1056 return false;
1057 MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));
1058
1059 if (!Src0->isImm() && !Src1->isImm())
1060 return false;
1061
1062 // and k0, k1 -> v_mov_b32 (k0 & k1)
1063 // or k0, k1 -> v_mov_b32 (k0 | k1)
1064 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1065 if (Src0->isImm() && Src1->isImm()) {
1066 int32_t NewImm;
1067 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1068 return false;
1069
1070 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1071
1072 // Be careful to change the right operand, src0 may belong to a different
1073 // instruction.
1074 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1075 MI->removeOperand(Src1Idx);
1076 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1077 return true;
1078 }
1079
1080 if (!MI->isCommutable())
1081 return false;
1082
1083 if (Src0->isImm() && !Src1->isImm()) {
1084 std::swap(Src0, Src1);
1085 std::swap(Src0Idx, Src1Idx);
1086 }
1087
1088 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1089 if (Opc == AMDGPU::V_OR_B32_e64 ||
1090 Opc == AMDGPU::V_OR_B32_e32 ||
1091 Opc == AMDGPU::S_OR_B32) {
1092 if (Src1Val == 0) {
1093 // y = or x, 0 => y = copy x
1094 MI->removeOperand(Src1Idx);
1095 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1096 } else if (Src1Val == -1) {
1097 // y = or x, -1 => y = v_mov_b32 -1
1098 MI->removeOperand(Src1Idx);
1099 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1100 } else
1101 return false;
1102
1103 return true;
1104 }
1105
1106 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1107 Opc == AMDGPU::S_AND_B32) {
1108 if (Src1Val == 0) {
1109 // y = and x, 0 => y = v_mov_b32 0
1110 MI->removeOperand(Src0Idx);
1111 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1112 } else if (Src1Val == -1) {
1113 // y = and x, -1 => y = copy x
1114 MI->removeOperand(Src1Idx);
1115 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1116 } else
1117 return false;
1118
1119 return true;
1120 }
1121
1122 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1123 Opc == AMDGPU::S_XOR_B32) {
1124 if (Src1Val == 0) {
1125 // y = xor x, 0 => y = copy x
1126 MI->removeOperand(Src1Idx);
1127 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1128 return true;
1129 }
1130 }
1131
1132 return false;
1133}
1134
1135// Try to fold an instruction into a simpler one
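// e.g. v_cndmask_b32 dst, x, x, cc -> copy/v_mov_b32 dst, x when both source
// operands are identical and no source modifiers are set.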
1136bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
1137 unsigned Opc = MI.getOpcode();
1138 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1139 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1140 return false;
1141
1142 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1143 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1144 if (!Src1->isIdenticalTo(*Src0)) {
1145 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1146 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1147 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1148 return false;
1149 }
1150
1151 int Src1ModIdx =
1152 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1153 int Src0ModIdx =
1154 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1155 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1156 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1157 return false;
1158
1159 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1160 auto &NewDesc =
1161 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1162 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1163 if (Src2Idx != -1)
1164 MI.removeOperand(Src2Idx);
1165 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1166 if (Src1ModIdx != -1)
1167 MI.removeOperand(Src1ModIdx);
1168 if (Src0ModIdx != -1)
1169 MI.removeOperand(Src0ModIdx);
1170 mutateCopyOp(MI, NewDesc);
1171 LLVM_DEBUG(dbgs() << MI);
1172 return true;
1173}
1174
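// Fold away a mask of the low 16 bits when the other operand is produced by
// an instruction already known to zero the high 16 bits of its result,
// e.g. (illustrative MIR):
//   %1:vgpr_32 = <op that zeroes bits 16-31> ...
//   %2:vgpr_32 = V_AND_B32_e32 0xffff, %1
// =>
//   all uses of %2 are replaced with %1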
1175bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
1176 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1177 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1178 return false;
1179
1180 MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
1181 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1182 return false;
1183
1184 Register Src1 = MI.getOperand(2).getReg();
1185 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1186 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1187 return false;
1188
1189 Register Dst = MI.getOperand(0).getReg();
1190 MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
1191 MI.eraseFromParent();
1192 return true;
1193}
1194
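// Fold OpToFold, the source of a foldable copy/mov defining MI's destination,
// into every non-debug use of that destination register: constant-fold users
// where possible, collect FoldCandidates via foldOperand, and then apply them
// with updateOperand.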
1195bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
1196 MachineOperand &OpToFold) const {
1197 // We need to mutate the operands of new mov instructions to add implicit
1198 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1199 // this.
1200 SmallVector<MachineInstr *, 4> CopiesToReplace;
1201 SmallVector<FoldCandidate, 4> FoldList;
1202 MachineOperand &Dst = MI.getOperand(0);
1203 bool Changed = false;
1204
1205 if (OpToFold.isImm()) {
1206 for (auto &UseMI :
1207 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1208 // Folding the immediate may reveal operations that can be constant
1209 // folded or replaced with a copy. This can happen for example after
1210 // frame indices are lowered to constants or from splitting 64-bit
1211 // constants.
1212 //
1213 // We may also encounter cases where one or both operands are
1214 // immediates materialized into a register, which would ordinarily not
1215 // be folded due to multiple uses or operand constraints.
1216 if (tryConstantFoldOp(&UseMI)) {
1217 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1218 Changed = true;
1219 }
1220 }
1221 }
1222
1223 SmallVector<MachineOperand *, 4> UsesToProcess;
1224 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1225 UsesToProcess.push_back(&Use);
1226 for (auto *U : UsesToProcess) {
1227 MachineInstr *UseMI = U->getParent();
1228 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1229 CopiesToReplace);
1230 }
1231
1232 if (CopiesToReplace.empty() && FoldList.empty())
1233 return Changed;
1234
1235 MachineFunction *MF = MI.getParent()->getParent();
1236 // Make sure we add EXEC uses to any new v_mov instructions created.
1237 for (MachineInstr *Copy : CopiesToReplace)
1238 Copy->addImplicitDefUseOperands(*MF);
1239
1240 for (FoldCandidate &Fold : FoldList) {
1241 assert(!Fold.isReg() || Fold.OpToFold);
1242 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1243 Register Reg = Fold.OpToFold->getReg();
1244 MachineInstr *DefMI = Fold.OpToFold->getParent();
1245 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1246 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1247 continue;
1248 }
1249 if (updateOperand(Fold)) {
1250 // Clear kill flags.
1251 if (Fold.isReg()) {
1252 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1253 // FIXME: Probably shouldn't bother trying to fold if not an
1254 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1255 // copies.
1256 MRI->clearKillFlags(Fold.OpToFold->getReg());
1257 }
1258 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1259 << static_cast<int>(Fold.UseOpNo) << " of "
1260 << *Fold.UseMI);
1261 } else if (Fold.Commuted) {
1262 // Restoring instruction's original operand order if fold has failed.
1263 TII->commuteInstruction(*Fold.UseMI, false);
1264 }
1265 }
1266 return true;
1267}
1268
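// Handle one foldable copy-like instruction: drop redundant writes of the same
// value to m0, fold the copy's source into all users, and erase the copy (and
// any now-dead chain of copies feeding it) once it has no remaining uses.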
1269bool SIFoldOperands::tryFoldFoldableCopy(
1270 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1271 // Specially track simple redefs of m0 to the same value in a block, so we
1272 // can erase the later ones.
1273 if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1274 MachineOperand &NewM0Val = MI.getOperand(1);
1275 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1276 MI.eraseFromParent();
1277 return true;
1278 }
1279
1280 // We aren't tracking other physical registers
1281 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1282 ? nullptr
1283 : &NewM0Val;
1284 return false;
1285 }
1286
1287 MachineOperand &OpToFold = MI.getOperand(1);
1288 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1289
1290 // FIXME: We could also be folding things like TargetIndexes.
1291 if (!FoldingImm && !OpToFold.isReg())
1292 return false;
1293
1294 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1295 return false;
1296
1297 // Prevent folding operands backwards in the function. For example,
1298 // the COPY opcode must not be replaced by 1 in this example:
1299 //
1300 // %3 = COPY %vgpr0; VGPR_32:%3
1301 // ...
1302 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1303 if (!MI.getOperand(0).getReg().isVirtual())
1304 return false;
1305
1306 bool Changed = foldInstOperand(MI, OpToFold);
1307
1308 // If we managed to fold all uses of this copy then we might as well
1309 // delete it now.
1310 // The only reason we need to follow chains of copies here is that
1311 // tryFoldRegSequence looks forward through copies before folding a
1312 // REG_SEQUENCE into its eventual users.
1313 auto *InstToErase = &MI;
1314 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1315 auto &SrcOp = InstToErase->getOperand(1);
1316 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1317 InstToErase->eraseFromParent();
1318 Changed = true;
1319 InstToErase = nullptr;
1320 if (!SrcReg || SrcReg.isPhysical())
1321 break;
1322 InstToErase = MRI->getVRegDef(SrcReg);
1323 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1324 break;
1325 }
1326
1327 if (InstToErase && InstToErase->isRegSequence() &&
1328 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1329 InstToErase->eraseFromParent();
1330 Changed = true;
1331 }
1332
1333 return Changed;
1334}
1335
1336// Clamp patterns are canonically selected to v_max_* instructions, so only
1337// handle them.
1338const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1339 unsigned Op = MI.getOpcode();
1340 switch (Op) {
1341 case AMDGPU::V_MAX_F32_e64:
1342 case AMDGPU::V_MAX_F16_e64:
1343 case AMDGPU::V_MAX_F16_t16_e64:
1344 case AMDGPU::V_MAX_F64_e64:
1345 case AMDGPU::V_PK_MAX_F16: {
1346 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1347 return nullptr;
1348
1349 // Make sure sources are identical.
1350 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1351 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1352 if (!Src0->isReg() || !Src1->isReg() ||
1353 Src0->getReg() != Src1->getReg() ||
1354 Src0->getSubReg() != Src1->getSubReg() ||
1355 Src0->getSubReg() != AMDGPU::NoSubRegister)
1356 return nullptr;
1357
1358 // Can't fold up if we have modifiers.
1359 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1360 return nullptr;
1361
1362 unsigned Src0Mods
1363 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1364 unsigned Src1Mods
1365 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1366
1367 // Having a 0 op_sel_hi would require swizzling the output in the source
1368 // instruction, which we can't do.
1369 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1370 : 0u;
1371 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1372 return nullptr;
1373 return Src0;
1374 }
1375 default:
1376 return nullptr;
1377 }
1378}
1379
1380// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1381bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1382 const MachineOperand *ClampSrc = isClamp(MI);
1383 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1384 return false;
1385
1386 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1387
1388 // The type of clamp must be compatible.
1389 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1390 return false;
1391
1392 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1393 if (!DefClamp)
1394 return false;
1395
1396 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1397
1398 // Clamp is applied after omod, so it is OK if omod is set.
1399 DefClamp->setImm(1);
1400 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1401 MI.eraseFromParent();
1402
1403 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1404 // instruction, so we might as well convert it to the more flexible VOP3-only
1405 // mad/fma form.
1406 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1407 Def->eraseFromParent();
1408
1409 return true;
1410}
1411
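// Map a multiplier immediate (0.5, 2.0 or 4.0 in the multiply's operand
// format) to the corresponding output-modifier encoding, or SIOutMods::NONE.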
1412static int getOModValue(unsigned Opc, int64_t Val) {
1413 switch (Opc) {
1414 case AMDGPU::V_MUL_F64_e64: {
1415 switch (Val) {
1416 case 0x3fe0000000000000: // 0.5
1417 return SIOutMods::DIV2;
1418 case 0x4000000000000000: // 2.0
1419 return SIOutMods::MUL2;
1420 case 0x4010000000000000: // 4.0
1421 return SIOutMods::MUL4;
1422 default:
1423 return SIOutMods::NONE;
1424 }
1425 }
1426 case AMDGPU::V_MUL_F32_e64: {
1427 switch (static_cast<uint32_t>(Val)) {
1428 case 0x3f000000: // 0.5
1429 return SIOutMods::DIV2;
1430 case 0x40000000: // 2.0
1431 return SIOutMods::MUL2;
1432 case 0x40800000: // 4.0
1433 return SIOutMods::MUL4;
1434 default:
1435 return SIOutMods::NONE;
1436 }
1437 }
1438 case AMDGPU::V_MUL_F16_e64:
1439 case AMDGPU::V_MUL_F16_t16_e64: {
1440 switch (static_cast<uint16_t>(Val)) {
1441 case 0x3800: // 0.5
1442 return SIOutMods::DIV2;
1443 case 0x4000: // 2.0
1444 return SIOutMods::MUL2;
1445 case 0x4400: // 4.0
1446 return SIOutMods::MUL4;
1447 default:
1448 return SIOutMods::NONE;
1449 }
1450 }
1451 default:
1452 llvm_unreachable("invalid mul opcode");
1453 }
1454}
1455
1456// FIXME: Does this really not support denormals with f16?
1457// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1458// handled, so will anything other than that break?
1459std::pair<const MachineOperand *, int>
1460SIFoldOperands::isOMod(const MachineInstr &MI) const {
1461 unsigned Op = MI.getOpcode();
1462 switch (Op) {
1463 case AMDGPU::V_MUL_F64_e64:
1464 case AMDGPU::V_MUL_F32_e64:
1465 case AMDGPU::V_MUL_F16_t16_e64:
1466 case AMDGPU::V_MUL_F16_e64: {
1467 // If output denormals are enabled, omod is ignored.
1468 if ((Op == AMDGPU::V_MUL_F32_e64 &&
1469 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1470 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
1471 Op == AMDGPU::V_MUL_F16_t16_e64) &&
1472 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1473 return std::pair(nullptr, SIOutMods::NONE);
1474
1475 const MachineOperand *RegOp = nullptr;
1476 const MachineOperand *ImmOp = nullptr;
1477 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1478 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1479 if (Src0->isImm()) {
1480 ImmOp = Src0;
1481 RegOp = Src1;
1482 } else if (Src1->isImm()) {
1483 ImmOp = Src1;
1484 RegOp = Src0;
1485 } else
1486 return std::pair(nullptr, SIOutMods::NONE);
1487
1488 int OMod = getOModValue(Op, ImmOp->getImm());
1489 if (OMod == SIOutMods::NONE ||
1490 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1491 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1492 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1493 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1494 return std::pair(nullptr, SIOutMods::NONE);
1495
1496 return std::pair(RegOp, OMod);
1497 }
1498 case AMDGPU::V_ADD_F64_e64:
1499 case AMDGPU::V_ADD_F32_e64:
1500 case AMDGPU::V_ADD_F16_e64:
1501 case AMDGPU::V_ADD_F16_t16_e64: {
1502 // If output denormals are enabled, omod is ignored.
1503 if ((Op == AMDGPU::V_ADD_F32_e64 &&
1504 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1505 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
1506 Op == AMDGPU::V_ADD_F16_t16_e64) &&
1507 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1508 return std::pair(nullptr, SIOutMods::NONE);
1509
1510 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1511 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1512 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1513
1514 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1515 Src0->getSubReg() == Src1->getSubReg() &&
1516 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1517 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1518 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1519 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1520 return std::pair(Src0, SIOutMods::MUL2);
1521
1522 return std::pair(nullptr, SIOutMods::NONE);
1523 }
1524 default:
1525 return std::pair(nullptr, SIOutMods::NONE);
1526 }
1527}
1528
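// Fold a multiply or add matched by isOMod() into the output modifier of the
// instruction defining its register source, e.g.:
//   %1 = V_ADD_F32_e64 ...
//   %2 = V_MUL_F32_e64 2.0, %1
// =>
//   %1 = V_ADD_F32_e64 ... (with omod set to *2); uses of %2 replaced by %1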
1529// FIXME: Does this need to check IEEE bit on function?
1530bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1531 const MachineOperand *RegOp;
1532 int OMod;
1533 std::tie(RegOp, OMod) = isOMod(MI);
1534 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1535 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1536 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1537 return false;
1538
1539 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1540 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1541 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1542 return false;
1543
1544 // Clamp is applied after omod. If the source already has clamp set, don't
1545 // fold it.
1546 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1547 return false;
1548
1549 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1550
1551 DefOMod->setImm(OMod);
1552 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1553 MI.eraseFromParent();
1554
1555 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1556 // instruction, so we might as well convert it to the more flexible VOP3-only
1557 // mad/fma form.
1558 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1559 Def->eraseFromParent();
1560
1561 return true;
1562}
1563
1564// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1565// instruction which can take an agpr. So far that means a store.
1566bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1567 assert(MI.isRegSequence());
1568 auto Reg = MI.getOperand(0).getReg();
1569
1570 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1571 !MRI->hasOneNonDBGUse(Reg))
1572 return false;
1573
1574 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
1575 if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1576 return false;
1577
1578 for (auto &Def : Defs) {
1579 const auto *Op = Def.first;
1580 if (!Op->isReg())
1581 return false;
1582 if (TRI->isAGPR(*MRI, Op->getReg()))
1583 continue;
1584 // Maybe this is a COPY from AREG
1585 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1586 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1587 return false;
1588 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1589 return false;
1590 }
1591
1592 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1593 MachineInstr *UseMI = Op->getParent();
1594 while (UseMI->isCopy() && !Op->getSubReg()) {
1595 Reg = UseMI->getOperand(0).getReg();
1596 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1597 return false;
1598 Op = &*MRI->use_nodbg_begin(Reg);
1599 UseMI = Op->getParent();
1600 }
1601
1602 if (Op->getSubReg())
1603 return false;
1604
1605 unsigned OpIdx = Op - &UseMI->getOperand(0);
1606 const MCInstrDesc &InstDesc = UseMI->getDesc();
1607 const TargetRegisterClass *OpRC =
1608 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1609 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1610 return false;
1611
1612 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1613 auto Dst = MRI->createVirtualRegister(NewDstRC);
1614 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1615 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1616
1617 for (unsigned I = 0; I < Defs.size(); ++I) {
1618 MachineOperand *Def = Defs[I].first;
1619 Def->setIsKill(false);
1620 if (TRI->isAGPR(*MRI, Def->getReg())) {
1621 RS.add(*Def);
1622 } else { // This is a copy
1623 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1624 SubDef->getOperand(1).setIsKill(false);
1625 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1626 }
1627 RS.addImm(Defs[I].second);
1628 }
1629
1630 Op->setReg(Dst);
1631 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1632 Op->setReg(Reg);
1633 RS->eraseFromParent();
1634 return false;
1635 }
1636
1637 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1638
1639 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1640 // in which case we can erase them all later in runOnMachineFunction.
1641 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1642 MI.eraseFromParent();
1643 return true;
1644}
1645
1646// Try to hoist an AGPR to VGPR copy across a PHI.
1647// This should allow folding of an AGPR into a consumer which may support it.
1648//
1649// Example 1: LCSSA PHI
1650// loop:
1651// %1:vreg = COPY %0:areg
1652// exit:
1653// %2:vreg = PHI %1:vreg, %loop
1654// =>
1655// loop:
1656// exit:
1657// %1:areg = PHI %0:areg, %loop
1658// %2:vreg = COPY %1:areg
1659//
1660// Example 2: PHI with multiple incoming values:
1661// entry:
1662// %1:vreg = GLOBAL_LOAD(..)
1663// loop:
1664// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
1665// %3:areg = COPY %2:vreg
1666// %4:areg = (instr using %3:areg)
1667// %5:vreg = COPY %4:areg
1668// =>
1669// entry:
1670// %1:vreg = GLOBAL_LOAD(..)
1671// %2:areg = COPY %1:vreg
1672// loop:
1673// %3:areg = PHI %2:areg, %entry, %X:areg,
1674// %4:areg = (instr using %3:areg)
1675bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
1676 assert(PHI.isPHI());
1677
1678 Register PhiOut = PHI.getOperand(0).getReg();
1679 if (!TRI->isVGPR(*MRI, PhiOut))
1680 return false;
1681
1682 // Iterate once over all incoming values of the PHI to check if this PHI is
1683 // eligible, and determine the exact AGPR RC we'll target.
1684 const TargetRegisterClass *ARC = nullptr;
1685 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1686 MachineOperand &MO = PHI.getOperand(K);
1687
1688 Register PhiIn = MO.getReg();
1689 if (MO.getSubReg() || !TRI->isVGPR(*MRI, PhiIn))
1690 return false;
1691
1692 MachineInstr *Copy = MRI->getVRegDef(PhiIn);
1693 if (!Copy || !Copy->isCopy())
1694 continue;
1695
1696 Register CopyIn = Copy->getOperand(1).getReg();
1697 if (CopyIn.isVirtual() && TRI->isAGPR(*MRI, CopyIn)) {
1698 const TargetRegisterClass *CopyInRC =
1699 getRegOpRC(*MRI, *TRI, Copy->getOperand(1));
1700 if (ARC && !ARC->hasSubClassEq(CopyInRC))
1701 return false;
1702 ARC = CopyInRC;
1703 }
1704 }
1705
1706 if (!ARC)
1707 return false;
1708
1709 // Rewrite the PHI's incoming values to ARC.
1710 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
1711 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1712 MachineOperand &MO = PHI.getOperand(K);
1713 Register Reg = MO.getReg();
1714
1715 MachineBasicBlock::iterator InsertPt;
1716 MachineBasicBlock *InsertMBB = nullptr;
1717
1718 // Look at the def of Reg, ignoring all copies.
1719 bool UseAccVGPRWrite = false;
1720 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
1721
1722 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
1723 // the copy was single-use, it will be removed by DCE later.
1724 if (Def->isCopy()) {
1725 MachineOperand &CopyIn = Def->getOperand(1);
1726 if (CopyIn.getReg().isVirtual() &&
1727 getRegOpRC(*MRI, *TRI, CopyIn)->hasSubClassEq(ARC)) {
1728 MO.setReg(CopyIn.getReg());
1729 MO.setSubReg(CopyIn.getSubReg());
1730 continue;
1731 }
1732
1733 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
1734 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
1735 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
1736 // is unlikely to be profitable.
1737 if (!ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
1738 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
1739 UseAccVGPRWrite = true;
1740 }
1741
1742 InsertPt = ++Def->getIterator();
1743 InsertMBB = Def->getParent();
1744 } else {
1745 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
1746 InsertPt = InsertMBB->getFirstTerminator();
1747 }
1748
1749 const unsigned CopyOpc =
1750 UseAccVGPRWrite ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY;
1751 Register NewReg = MRI->createVirtualRegister(ARC);
1752 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
1753 TII->get(CopyOpc), NewReg)
1754 .addReg(Reg);
1755 MO.setReg(NewReg);
1756
1757 (void)MI;
1758 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
1759 }
1760
1761 // Replace the PHI's result with a new register.
1762 Register NewReg = MRI->createVirtualRegister(ARC);
1763 PHI.getOperand(0).setReg(NewReg);
1764
1765 // COPY that new register back to the original PhiOut register. This COPY will
1766 // usually be folded out later.
1767 MachineBasicBlock *MBB = PHI.getParent();
1768 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
1769 TII->get(AMDGPU::COPY), PhiOut)
1770 .addReg(NewReg);
1771
1772 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
1773 return true;
1774}
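
// A hedged sketch of the GFX908 multi-use SGPR case handled above (block and
// register names are illustrative; the PHI is assumed to be eligible, i.e. at
// least one other incoming value is a copy out of an AGPR). When an incoming
// value is a multi-use SGPR -> VGPR copy, the new AGPR input is materialized
// with V_ACCVGPR_WRITE_B32_e64 rather than a COPY, so a later fold cannot turn
// it into an SGPR -> AGPR copy:
//
//   bb.0:
//     %s:sgpr_32 = ...
//     %v:vgpr_32 = COPY %s            ; %v has further uses elsewhere
//   bb.1:
//     %out:vgpr_32 = PHI %v, %bb.0, ...
// =>
//   bb.0:
//     %s:sgpr_32 = ...
//     %v:vgpr_32 = COPY %s
//     %a:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %v, implicit $exec
//   bb.1:
//     %p:agpr_32 = PHI %a, %bb.0, ...
//     %out:vgpr_32 = COPY %p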
1775
1776// Attempt to convert VGPR load to an AGPR load.
1777bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
1778 assert(MI.mayLoad());
1779 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
1780 return false;
1781
1782 MachineOperand &Def = MI.getOperand(0);
1783 if (!Def.isDef())
1784 return false;
1785
1786 Register DefReg = Def.getReg();
1787
1788 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
1789 return false;
1790
1791 SmallVector<const MachineInstr *, 8> Users;
1792 SmallVector<Register, 8> MoveRegs;
1793 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
1794 Users.push_back(&I);
1795
1796 if (Users.empty())
1797 return false;
1798
1799 // Check that all uses are copies to an agpr or a reg_sequence producing an agpr.
1800 while (!Users.empty()) {
1801 const MachineInstr *I = Users.pop_back_val();
1802 if (!I->isCopy() && !I->isRegSequence())
1803 return false;
1804 Register DstReg = I->getOperand(0).getReg();
1805 // Physical registers may have more than one defining instruction.
1806 if (DstReg.isPhysical())
1807 return false;
1808 if (TRI->isAGPR(*MRI, DstReg))
1809 continue;
1810 MoveRegs.push_back(DstReg);
1811 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
1812 Users.push_back(&U);
1813 }
1814
1815 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
1816 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
1817 if (!TII->isOperandLegal(MI, 0, &Def)) {
1818 MRI->setRegClass(DefReg, RC);
1819 return false;
1820 }
1821
1822 while (!MoveRegs.empty()) {
1823 Register Reg = MoveRegs.pop_back_val();
1824 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
1825 }
1826
1827 LLVM_DEBUG(dbgs() << "Folded " << MI);
1828
1829 return true;
1830}
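
// A hedged, MIR-style sketch of the load rewrite above (opcode and register
// names are illustrative): the load result and every copy / reg_sequence
// destination reached from it are re-classed to the equivalent AGPR class, so
// the load writes the AGPR directly and the remaining AGPR-to-AGPR copies can
// be removed by later passes. If the AGPR-typed result is not legal for the
// load, the register class change is reverted.
//
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, 0, 0, implicit $exec
//   %1:agpr_32 = COPY %0
// =>
//   %0:agpr_32 = GLOBAL_LOAD_DWORD %ptr, 0, 0, implicit $exec
//   %1:agpr_32 = COPY %0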
1831
1832// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
1833// For GFX90A and later, this is pretty much always a good thing, but for GFX908
1834 // there are cases where it can create many more AGPR-AGPR copies, which are
1835// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
1836//
1837// This function looks at all AGPR PHIs in a basic block and collects their
1838 // operands. Then, it checks for registers that are used more than once across
1839// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
1840// having to create one VGPR temporary per use, which can get very messy if
1841// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
1842// element).
1843//
1844// Example
1845// a:
1846// %in:agpr_256 = COPY %foo:vgpr_256
1847// c:
1848// %x:agpr_32 = ..
1849// b:
1850// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
1851// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
1852// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
1853// =>
1854// a:
1855// %in:agpr_256 = COPY %foo:vgpr_256
1856// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
1857// %tmp_agpr:agpr_32 = COPY %tmp
1858// c:
1859// %x:agpr_32 = ..
1860// b:
1861// %0:areg = PHI %tmp_agpr, %a, %x, %c
1862// %1:areg = PHI %tmp_agpr, %a, %y, %c
1863// %2:areg = PHI %tmp_agpr, %a, %z, %c
1864bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
1865 // This is only really needed on GFX908, where AGPR-AGPR copies are
1866 // expensive: without V_ACCVGPR_MOV they must go through a VGPR temporary.
1867 if (ST->hasGFX90AInsts())
1868 return false;
1869
1870 // Look at all AGPR Phis and collect the register + subregister used.
1871 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
1872 RegToMO;
1873
1874 for (auto &MI : MBB) {
1875 if (!MI.isPHI())
1876 break;
1877
1878 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
1879 continue;
1880
1881 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
1882 MachineOperand &PhiMO = MI.getOperand(K);
1883 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
1884 }
1885 }
1886
1887 // For all (Reg, SubReg) pairs that are used more than once, cache the value in
1888 // a VGPR.
1889 bool Changed = false;
1890 for (const auto &[Entry, MOs] : RegToMO) {
1891 if (MOs.size() == 1)
1892 continue;
1893
1894 const auto [Reg, SubReg] = Entry;
1895 MachineInstr *Def = MRI->getVRegDef(Reg);
1896 MachineBasicBlock *DefMBB = Def->getParent();
1897
1898 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
1899 // out.
1900 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
1901 Register TempVGPR =
1902 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
1903 MachineInstr *VGPRCopy =
1904 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
1905 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
1906 .addReg(Reg, /* flags */ 0, SubReg);
1907
1908 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
1909 Register TempAGPR = MRI->createVirtualRegister(ARC);
1910 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
1911 TII->get(AMDGPU::COPY), TempAGPR)
1912 .addReg(TempVGPR);
1913
1914 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
1915 for (MachineOperand *MO : MOs) {
1916 MO->setReg(TempAGPR);
1917 MO->setSubReg(AMDGPU::NoSubRegister);
1918 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
1919 }
1920
1921 Changed = true;
1922 }
1923
1924 return Changed;
1925}
1926
1927bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
1928 if (skipFunction(MF.getFunction()))
1929 return false;
1930
1931 MRI = &MF.getRegInfo();
1932 ST = &MF.getSubtarget<GCNSubtarget>();
1933 TII = ST->getInstrInfo();
1934 TRI = &TII->getRegisterInfo();
1935 MFI = MF.getInfo<SIMachineFunctionInfo>();
1936
1937 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
1938 // correctly handle signed zeros.
1939 //
1940 // FIXME: Also need to check strictfp
1941 bool IsIEEEMode = MFI->getMode().IEEE;
1942 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
1943
1944 bool Changed = false;
1945 for (MachineBasicBlock *MBB : depth_first(&MF)) {
1946 MachineOperand *CurrentKnownM0Val = nullptr;
1947 for (auto &MI : make_early_inc_range(*MBB)) {
1948 Changed |= tryFoldCndMask(MI);
1949
1950 if (tryFoldZeroHighBits(MI)) {
1951 Changed = true;
1952 continue;
1953 }
1954
1955 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
1956 Changed = true;
1957 continue;
1958 }
1959
1960 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
1961 Changed = true;
1962 continue;
1963 }
1964
1965 if (MI.mayLoad() && tryFoldLoad(MI)) {
1966 Changed = true;
1967 continue;
1968 }
1969
1970 if (TII->isFoldableCopy(MI)) {
1971 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
1972 continue;
1973 }
1974
1975 // Saw an unknown clobber of m0, so we no longer know what it is.
1976 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
1977 CurrentKnownM0Val = nullptr;
1978
1979 // TODO: Omod might be OK if there is NSZ only on the source
1980 // instruction, and not the omod multiply.
1981 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
1982 !tryFoldOMod(MI))
1983 Changed |= tryFoldClamp(MI);
1984 }
1985
1986 Changed |= tryOptimizeAGPRPhis(*MBB);
1987 }
1988
1989 return Changed;
1990}
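
// Usage note (an assumption based on the registered pass name
// "si-fold-operands"; input.mir is a placeholder): the pass can typically be
// exercised on its own from a MIR test with llc, e.g.
//
//   llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-fold-operands \
//       -verify-machineinstrs -o - input.mir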