LLVM 19.0.0git
SIFoldOperands.cpp
Go to the documentation of this file.
1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "AMDGPU.h"
12#include "GCNSubtarget.h"
18
19#define DEBUG_TYPE "si-fold-operands"
20using namespace llvm;
21
22namespace {
23
24struct FoldCandidate {
26 union {
27 MachineOperand *OpToFold;
28 uint64_t ImmToFold;
29 int FrameIndexToFold;
30 };
31 int ShrinkOpcode;
32 unsigned UseOpNo;
34 bool Commuted;
35
36 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
37 bool Commuted_ = false,
38 int ShrinkOp = -1) :
39 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
40 Kind(FoldOp->getType()),
41 Commuted(Commuted_) {
42 if (FoldOp->isImm()) {
43 ImmToFold = FoldOp->getImm();
44 } else if (FoldOp->isFI()) {
45 FrameIndexToFold = FoldOp->getIndex();
46 } else {
47 assert(FoldOp->isReg() || FoldOp->isGlobal());
48 OpToFold = FoldOp;
49 }
50 }
51
52 bool isFI() const {
53 return Kind == MachineOperand::MO_FrameIndex;
54 }
55
56 bool isImm() const {
57 return Kind == MachineOperand::MO_Immediate;
58 }
59
60 bool isReg() const {
61 return Kind == MachineOperand::MO_Register;
62 }
63
64 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
65
66 bool needsShrink() const { return ShrinkOpcode != -1; }
67};
68
69class SIFoldOperands : public MachineFunctionPass {
70public:
71 static char ID;
73 const SIInstrInfo *TII;
74 const SIRegisterInfo *TRI;
75 const GCNSubtarget *ST;
76 const SIMachineFunctionInfo *MFI;
77
78 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
79 const MachineOperand &OpToFold) const;
80
81 bool updateOperand(FoldCandidate &Fold) const;
82
83 bool canUseImmWithOpSel(FoldCandidate &Fold) const;
84
85 bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
86
87 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
88 MachineInstr *MI, unsigned OpNo,
89 MachineOperand *OpToFold) const;
90 bool isUseSafeToFold(const MachineInstr &MI,
91 const MachineOperand &UseMO) const;
92 bool
93 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
94 Register UseReg, uint8_t OpTy) const;
95 bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
96 unsigned UseOpIdx,
97 SmallVectorImpl<FoldCandidate> &FoldList) const;
98 void foldOperand(MachineOperand &OpToFold,
100 int UseOpIdx,
102 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
103
104 MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
105 bool tryConstantFoldOp(MachineInstr *MI) const;
106 bool tryFoldCndMask(MachineInstr &MI) const;
107 bool tryFoldZeroHighBits(MachineInstr &MI) const;
108 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
109 bool tryFoldFoldableCopy(MachineInstr &MI,
110 MachineOperand *&CurrentKnownM0Val) const;
111
112 const MachineOperand *isClamp(const MachineInstr &MI) const;
113 bool tryFoldClamp(MachineInstr &MI);
114
115 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
116 bool tryFoldOMod(MachineInstr &MI);
117 bool tryFoldRegSequence(MachineInstr &MI);
118 bool tryFoldPhiAGPR(MachineInstr &MI);
119 bool tryFoldLoad(MachineInstr &MI);
120
121 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
122
123public:
124 SIFoldOperands() : MachineFunctionPass(ID) {
126 }
127
128 bool runOnMachineFunction(MachineFunction &MF) override;
129
130 StringRef getPassName() const override { return "SI Fold Operands"; }
131
132 void getAnalysisUsage(AnalysisUsage &AU) const override {
133 AU.setPreservesCFG();
135 }
136};
137
138} // End anonymous namespace.
139
140INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
141 "SI Fold Operands", false, false)
142
143char SIFoldOperands::ID = 0;
144
145char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
146
149 const MachineOperand &MO) {
150 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
151 if (const TargetRegisterClass *SubRC =
152 TRI.getSubRegisterClass(RC, MO.getSubReg()))
153 RC = SubRC;
154 return RC;
155}
156
157// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
158static unsigned macToMad(unsigned Opc) {
159 switch (Opc) {
160 case AMDGPU::V_MAC_F32_e64:
161 return AMDGPU::V_MAD_F32_e64;
162 case AMDGPU::V_MAC_F16_e64:
163 return AMDGPU::V_MAD_F16_e64;
164 case AMDGPU::V_FMAC_F32_e64:
165 return AMDGPU::V_FMA_F32_e64;
166 case AMDGPU::V_FMAC_F16_e64:
167 return AMDGPU::V_FMA_F16_gfx9_e64;
168 case AMDGPU::V_FMAC_F16_t16_e64:
169 return AMDGPU::V_FMA_F16_gfx9_e64;
170 case AMDGPU::V_FMAC_LEGACY_F32_e64:
171 return AMDGPU::V_FMA_LEGACY_F32_e64;
172 case AMDGPU::V_FMAC_F64_e64:
173 return AMDGPU::V_FMA_F64_e64;
174 }
175 return AMDGPU::INSTRUCTION_LIST_END;
176}
177
178// TODO: Add heuristic that the frame index might not fit in the addressing mode
179// immediate offset to avoid materializing in loops.
180bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
181 const MachineOperand &OpToFold) const {
182 if (!OpToFold.isFI())
183 return false;
184
185 const unsigned Opc = UseMI.getOpcode();
186 if (TII->isMUBUF(UseMI))
187 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
188 if (!TII->isFLATScratch(UseMI))
189 return false;
190
191 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
192 if (OpNo == SIdx)
193 return true;
194
195 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
196 return OpNo == VIdx && SIdx == -1;
197}
198
200 return new SIFoldOperands();
201}
202
203bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
204 MachineInstr *MI = Fold.UseMI;
205 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
206 const uint64_t TSFlags = MI->getDesc().TSFlags;
207
208 assert(Old.isReg() && Fold.isImm());
209
210 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
211 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
212 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
213 return false;
214
215 unsigned Opcode = MI->getOpcode();
216 int OpNo = MI->getOperandNo(&Old);
217 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
218 switch (OpType) {
219 default:
220 return false;
227 break;
228 }
229
230 return true;
231}
232
233bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
234 MachineInstr *MI = Fold.UseMI;
235 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
236 unsigned Opcode = MI->getOpcode();
237 int OpNo = MI->getOperandNo(&Old);
238 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
239
240 // If the literal can be inlined as-is, apply it and short-circuit the
241 // tests below. The main motivation for this is to avoid unintuitive
242 // uses of opsel.
243 if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
244 Old.ChangeToImmediate(Fold.ImmToFold);
245 return true;
246 }
247
248 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
249 // op_sel in a way that allows an inline constant.
250 int ModIdx = -1;
251 unsigned SrcIdx = ~0;
252 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
253 ModIdx = AMDGPU::OpName::src0_modifiers;
254 SrcIdx = 0;
255 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
256 ModIdx = AMDGPU::OpName::src1_modifiers;
257 SrcIdx = 1;
258 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
259 ModIdx = AMDGPU::OpName::src2_modifiers;
260 SrcIdx = 2;
261 }
262 assert(ModIdx != -1);
263 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
264 MachineOperand &Mod = MI->getOperand(ModIdx);
265 unsigned ModVal = Mod.getImm();
266
267 uint16_t ImmLo = static_cast<uint16_t>(
268 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
269 uint16_t ImmHi = static_cast<uint16_t>(
270 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
271 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
272 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
273
274 // Helper function that attempts to inline the given value with a newly
275 // chosen opsel pattern.
276 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
277 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
278 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
279 Old.ChangeToImmediate(Imm);
280 return true;
281 }
282
283 // Try to shuffle the halves around and leverage opsel to get an inline
284 // constant.
285 uint16_t Lo = static_cast<uint16_t>(Imm);
286 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
287 if (Lo == Hi) {
288 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
289 Mod.setImm(NewModVal);
291 return true;
292 }
293
294 if (static_cast<int16_t>(Lo) < 0) {
295 int32_t SExt = static_cast<int16_t>(Lo);
296 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
297 Mod.setImm(NewModVal);
298 Old.ChangeToImmediate(SExt);
299 return true;
300 }
301 }
302
303 // This check is only useful for integer instructions
304 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
306 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
307 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
308 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
309 return true;
310 }
311 }
312 } else {
313 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
314 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
315 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
316 Old.ChangeToImmediate(Swapped);
317 return true;
318 }
319 }
320
321 return false;
322 };
323
324 if (tryFoldToInline(Imm))
325 return true;
326
327 // Replace integer addition by subtraction and vice versa if it allows
328 // folding the immediate to an inline constant.
329 //
330 // We should only ever get here for SrcIdx == 1 due to canonicalization
331 // earlier in the pipeline, but we double-check here to be safe / fully
332 // general.
333 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
334 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
335 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
336 unsigned ClampIdx =
337 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
338 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
339
340 if (!Clamp) {
341 uint16_t NegLo = -static_cast<uint16_t>(Imm);
342 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
343 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
344
345 if (tryFoldToInline(NegImm)) {
346 unsigned NegOpcode =
347 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
348 MI->setDesc(TII->get(NegOpcode));
349 return true;
350 }
351 }
352 }
353
354 return false;
355}
356
357bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
358 MachineInstr *MI = Fold.UseMI;
359 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
360 assert(Old.isReg());
361
362 if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
363 if (tryFoldImmWithOpSel(Fold))
364 return true;
365
366 // We can't represent the candidate as an inline constant. Try as a literal
367 // with the original opsel, checking constant bus limitations.
369 int OpNo = MI->getOperandNo(&Old);
370 if (!TII->isOperandLegal(*MI, OpNo, &New))
371 return false;
372 Old.ChangeToImmediate(Fold.ImmToFold);
373 return true;
374 }
375
376 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
377 MachineBasicBlock *MBB = MI->getParent();
378 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
379 if (Liveness != MachineBasicBlock::LQR_Dead) {
380 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
381 return false;
382 }
383
384 int Op32 = Fold.ShrinkOpcode;
385 MachineOperand &Dst0 = MI->getOperand(0);
386 MachineOperand &Dst1 = MI->getOperand(1);
387 assert(Dst0.isDef() && Dst1.isDef());
388
389 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
390
391 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
392 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
393
394 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
395
396 if (HaveNonDbgCarryUse) {
397 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
398 Dst1.getReg())
399 .addReg(AMDGPU::VCC, RegState::Kill);
400 }
401
402 // Keep the old instruction around to avoid breaking iterators, but
403 // replace it with a dummy instruction to remove uses.
404 //
405 // FIXME: We should not invert how this pass looks at operands to avoid
406 // this. Should track set of foldable movs instead of looking for uses
407 // when looking at a use.
408 Dst0.setReg(NewReg0);
409 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
410 MI->removeOperand(I);
411 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
412
413 if (Fold.Commuted)
414 TII->commuteInstruction(*Inst32, false);
415 return true;
416 }
417
418 assert(!Fold.needsShrink() && "not handled");
419
420 if (Fold.isImm()) {
421 if (Old.isTied()) {
422 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
423 if (NewMFMAOpc == -1)
424 return false;
425 MI->setDesc(TII->get(NewMFMAOpc));
426 MI->untieRegOperand(0);
427 }
428 Old.ChangeToImmediate(Fold.ImmToFold);
429 return true;
430 }
431
432 if (Fold.isGlobal()) {
433 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
434 Fold.OpToFold->getTargetFlags());
435 return true;
436 }
437
438 if (Fold.isFI()) {
439 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
440 return true;
441 }
442
443 MachineOperand *New = Fold.OpToFold;
444 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
445 Old.setIsUndef(New->isUndef());
446 return true;
447}
448
450 const MachineInstr *MI) {
451 return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
452}
453
455 MachineInstr *MI, unsigned OpNo,
456 MachineOperand *FoldOp, bool Commuted = false,
457 int ShrinkOp = -1) {
458 // Skip additional folding on the same operand.
459 for (FoldCandidate &Fold : FoldList)
460 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
461 return;
462 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
463 << " operand " << OpNo << "\n " << *MI);
464 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
465}
466
467bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
468 MachineInstr *MI, unsigned OpNo,
469 MachineOperand *OpToFold) const {
470 const unsigned Opc = MI->getOpcode();
471
472 auto tryToFoldAsFMAAKorMK = [&]() {
473 if (!OpToFold->isImm())
474 return false;
475
476 const bool TryAK = OpNo == 3;
477 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
478 MI->setDesc(TII->get(NewOpc));
479
480 // We have to fold into operand which would be Imm not into OpNo.
481 bool FoldAsFMAAKorMK =
482 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
483 if (FoldAsFMAAKorMK) {
484 // Untie Src2 of fmac.
485 MI->untieRegOperand(3);
486 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
487 if (OpNo == 1) {
488 MachineOperand &Op1 = MI->getOperand(1);
489 MachineOperand &Op2 = MI->getOperand(2);
490 Register OldReg = Op1.getReg();
491 // Operand 2 might be an inlinable constant
492 if (Op2.isImm()) {
493 Op1.ChangeToImmediate(Op2.getImm());
494 Op2.ChangeToRegister(OldReg, false);
495 } else {
496 Op1.setReg(Op2.getReg());
497 Op2.setReg(OldReg);
498 }
499 }
500 return true;
501 }
502 MI->setDesc(TII->get(Opc));
503 return false;
504 };
505
506 bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
507 if (!IsLegal && OpToFold->isImm()) {
508 FoldCandidate Fold(MI, OpNo, OpToFold);
509 IsLegal = canUseImmWithOpSel(Fold);
510 }
511
512 if (!IsLegal) {
513 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
514 unsigned NewOpc = macToMad(Opc);
515 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
516 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
517 // to fold the operand.
518 MI->setDesc(TII->get(NewOpc));
519 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
520 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
521 if (AddOpSel)
522 MI->addOperand(MachineOperand::CreateImm(0));
523 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
524 if (FoldAsMAD) {
525 MI->untieRegOperand(OpNo);
526 return true;
527 }
528 if (AddOpSel)
529 MI->removeOperand(MI->getNumExplicitOperands() - 1);
530 MI->setDesc(TII->get(Opc));
531 }
532
533 // Special case for s_fmac_f32 if we are trying to fold into Src2.
534 // By transforming into fmaak we can untie Src2 and make folding legal.
535 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
536 if (tryToFoldAsFMAAKorMK())
537 return true;
538 }
539
540 // Special case for s_setreg_b32
541 if (OpToFold->isImm()) {
542 unsigned ImmOpc = 0;
543 if (Opc == AMDGPU::S_SETREG_B32)
544 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
545 else if (Opc == AMDGPU::S_SETREG_B32_mode)
546 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
547 if (ImmOpc) {
548 MI->setDesc(TII->get(ImmOpc));
549 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
550 return true;
551 }
552 }
553
554 // If we are already folding into another operand of MI, then
555 // we can't commute the instruction, otherwise we risk making the
556 // other fold illegal.
557 if (isUseMIInFoldList(FoldList, MI))
558 return false;
559
560 // Operand is not legal, so try to commute the instruction to
561 // see if this makes it possible to fold.
562 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
563 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
564 if (!CanCommute)
565 return false;
566
567 // One of operands might be an Imm operand, and OpNo may refer to it after
568 // the call of commuteInstruction() below. Such situations are avoided
569 // here explicitly as OpNo must be a register operand to be a candidate
570 // for memory folding.
571 if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
572 return false;
573
574 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
575 return false;
576
577 int Op32 = -1;
578 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
579 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
580 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
581 (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
582 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
583 return false;
584 }
585
586 // Verify the other operand is a VGPR, otherwise we would violate the
587 // constant bus restriction.
588 MachineOperand &OtherOp = MI->getOperand(OpNo);
589 if (!OtherOp.isReg() ||
590 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
591 return false;
592
593 assert(MI->getOperand(1).isDef());
594
595 // Make sure to get the 32-bit version of the commuted opcode.
596 unsigned MaybeCommutedOpc = MI->getOpcode();
597 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
598 }
599
600 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
601 return true;
602 }
603
604 // Inlineable constant might have been folded into Imm operand of fmaak or
605 // fmamk and we are trying to fold a non-inlinable constant.
606 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
607 !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
608 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
609 MachineOperand &OpImm = MI->getOperand(ImmIdx);
610 if (!OpImm.isReg() &&
611 TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
612 return tryToFoldAsFMAAKorMK();
613 }
614
615 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
616 // By changing into fmamk we can untie Src2.
617 // If folding for Src0 happens first and it is identical operand to Src1 we
618 // should avoid transforming into fmamk which requires commuting as it would
619 // cause folding into Src1 to fail later on due to wrong OpNo used.
620 if (Opc == AMDGPU::S_FMAC_F32 &&
621 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
622 if (tryToFoldAsFMAAKorMK())
623 return true;
624 }
625
626 // Check the case where we might introduce a second constant operand to a
627 // scalar instruction
628 if (TII->isSALU(MI->getOpcode())) {
629 const MCInstrDesc &InstDesc = MI->getDesc();
630 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
631
632 // Fine if the operand can be encoded as an inline constant
633 if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
634 // Otherwise check for another constant
635 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
636 auto &Op = MI->getOperand(i);
637 if (OpNo != i && !Op.isReg() &&
638 !TII->isInlineConstant(Op, InstDesc.operands()[i]))
639 return false;
640 }
641 }
642 }
643
644 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
645 return true;
646}
647
648bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
649 const MachineOperand &UseMO) const {
650 // Operands of SDWA instructions must be registers.
651 return !TII->isSDWA(MI);
652}
653
654// Find a def of the UseReg, check if it is a reg_sequence and find initializers
655// for each subreg, tracking it to foldable inline immediate if possible.
656// Returns true on success.
657bool SIFoldOperands::getRegSeqInit(
658 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
659 Register UseReg, uint8_t OpTy) const {
660 MachineInstr *Def = MRI->getVRegDef(UseReg);
661 if (!Def || !Def->isRegSequence())
662 return false;
663
664 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
665 MachineOperand *Sub = &Def->getOperand(I);
666 assert(Sub->isReg());
667
668 for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
669 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
670 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
671 SubDef = MRI->getVRegDef(Sub->getReg())) {
672 MachineOperand *Op = &SubDef->getOperand(1);
673 if (Op->isImm()) {
674 if (TII->isInlineConstant(*Op, OpTy))
675 Sub = Op;
676 break;
677 }
678 if (!Op->isReg() || Op->getReg().isPhysical())
679 break;
680 Sub = Op;
681 }
682
683 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
684 }
685
686 return true;
687}
688
689bool SIFoldOperands::tryToFoldACImm(
690 const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
691 SmallVectorImpl<FoldCandidate> &FoldList) const {
692 const MCInstrDesc &Desc = UseMI->getDesc();
693 if (UseOpIdx >= Desc.getNumOperands())
694 return false;
695
697 return false;
698
699 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
700 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
701 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
702 UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
703 return true;
704 }
705
706 if (!OpToFold.isReg())
707 return false;
708
709 Register UseReg = OpToFold.getReg();
710 if (!UseReg.isVirtual())
711 return false;
712
713 if (isUseMIInFoldList(FoldList, UseMI))
714 return false;
715
716 // Maybe it is just a COPY of an immediate itself.
717 MachineInstr *Def = MRI->getVRegDef(UseReg);
718 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
719 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
720 MachineOperand &DefOp = Def->getOperand(1);
721 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
722 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
723 UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
724 return true;
725 }
726 }
727
729 if (!getRegSeqInit(Defs, UseReg, OpTy))
730 return false;
731
732 int32_t Imm;
733 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
734 const MachineOperand *Op = Defs[I].first;
735 if (!Op->isImm())
736 return false;
737
738 auto SubImm = Op->getImm();
739 if (!I) {
740 Imm = SubImm;
741 if (!TII->isInlineConstant(*Op, OpTy) ||
742 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
743 return false;
744
745 continue;
746 }
747 if (Imm != SubImm)
748 return false; // Can only fold splat constants
749 }
750
751 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
752 return true;
753}
754
755void SIFoldOperands::foldOperand(
756 MachineOperand &OpToFold,
758 int UseOpIdx,
760 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
761 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
762
763 if (!isUseSafeToFold(*UseMI, *UseOp))
764 return;
765
766 // FIXME: Fold operands with subregs.
767 if (UseOp->isReg() && OpToFold.isReg() &&
768 (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
769 return;
770
771 // Special case for REG_SEQUENCE: We can't fold literals into
772 // REG_SEQUENCE instructions, so we have to fold them into the
773 // uses of REG_SEQUENCE.
774 if (UseMI->isRegSequence()) {
775 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
776 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
777
778 for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
779 MachineInstr *RSUseMI = RSUse.getParent();
780
781 if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
782 RSUseMI->getOperandNo(&RSUse), FoldList))
783 continue;
784
785 if (RSUse.getSubReg() != RegSeqDstSubReg)
786 continue;
787
788 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
789 CopiesToReplace);
790 }
791
792 return;
793 }
794
795 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
796 return;
797
798 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
799 // Verify that this is a stack access.
800 // FIXME: Should probably use stack pseudos before frame lowering.
801
802 if (TII->isMUBUF(*UseMI)) {
803 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
804 MFI->getScratchRSrcReg())
805 return;
806
807 // Ensure this is either relative to the current frame or the current
808 // wave.
809 MachineOperand &SOff =
810 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
811 if (!SOff.isImm() || SOff.getImm() != 0)
812 return;
813 }
814
815 // A frame index will resolve to a positive constant, so it should always be
816 // safe to fold the addressing mode, even pre-GFX9.
817 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
818
819 const unsigned Opc = UseMI->getOpcode();
820 if (TII->isFLATScratch(*UseMI) &&
821 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
822 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
823 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
824 UseMI->setDesc(TII->get(NewOpc));
825 }
826
827 return;
828 }
829
830 bool FoldingImmLike =
831 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
832
833 if (FoldingImmLike && UseMI->isCopy()) {
834 Register DestReg = UseMI->getOperand(0).getReg();
835 Register SrcReg = UseMI->getOperand(1).getReg();
836 assert(SrcReg.isVirtual());
837
838 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
839
840 // Don't fold into a copy to a physical register with the same class. Doing
841 // so would interfere with the register coalescer's logic which would avoid
842 // redundant initializations.
843 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
844 return;
845
846 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
847 if (!DestReg.isPhysical()) {
848 if (DestRC == &AMDGPU::AGPR_32RegClass &&
849 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
850 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
852 CopiesToReplace.push_back(UseMI);
853 return;
854 }
855 }
856
857 // In order to fold immediates into copies, we need to change the
858 // copy to a MOV.
859
860 unsigned MovOp = TII->getMovOpcode(DestRC);
861 if (MovOp == AMDGPU::COPY)
862 return;
863
866 while (ImpOpI != ImpOpE) {
867 MachineInstr::mop_iterator Tmp = ImpOpI;
868 ImpOpI++;
870 }
871 UseMI->setDesc(TII->get(MovOp));
872
873 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
874 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
875 MachineOperand NewSrcOp(SrcOp);
878 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
879 UseMI->addOperand(NewSrcOp); // src0
880 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
881 UseOpIdx = 2;
882 UseOp = &UseMI->getOperand(UseOpIdx);
883 }
884 CopiesToReplace.push_back(UseMI);
885 } else {
886 if (UseMI->isCopy() && OpToFold.isReg() &&
888 !UseMI->getOperand(1).getSubReg()) {
889 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
890 unsigned Size = TII->getOpSize(*UseMI, 1);
891 Register UseReg = OpToFold.getReg();
893 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
894 UseMI->getOperand(1).setIsKill(false);
895 CopiesToReplace.push_back(UseMI);
896 OpToFold.setIsKill(false);
897
898 // Remove kill flags as kills may now be out of order with uses.
899 MRI->clearKillFlags(OpToFold.getReg());
900
901 // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
902 // can only accept VGPR or inline immediate. Recreate a reg_sequence with
903 // its initializers right here, so we will rematerialize immediates and
904 // avoid copies via different reg classes.
906 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
907 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
908 const DebugLoc &DL = UseMI->getDebugLoc();
910
911 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
912 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
914
918 for (unsigned I = 0; I < Size / 4; ++I) {
919 MachineOperand *Def = Defs[I].first;
921 if (Def->isImm() &&
922 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
923 int64_t Imm = Def->getImm();
924
925 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
927 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
928 B.addReg(Tmp);
929 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
930 auto Src = getRegSubRegPair(*Def);
931 Def->setIsKill(false);
932 if (!SeenAGPRs.insert(Src)) {
933 // We cannot build a reg_sequence out of the same registers, they
934 // must be copied. Better do it here before copyPhysReg() created
935 // several reads to do the AGPR->VGPR->AGPR copy.
936 CopyToVGPR = Src;
937 } else {
938 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
939 Src.SubReg);
940 }
941 } else {
942 assert(Def->isReg());
943 Def->setIsKill(false);
944 auto Src = getRegSubRegPair(*Def);
945
946 // Direct copy from SGPR to AGPR is not possible. To avoid creation
947 // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
948 // create a copy here and track if we already have such a copy.
949 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
950 CopyToVGPR = Src;
951 } else {
952 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
953 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
954 B.addReg(Tmp);
955 }
956 }
957
958 if (CopyToVGPR.Reg) {
959 Register Vgpr;
960 if (VGPRCopies.count(CopyToVGPR)) {
961 Vgpr = VGPRCopies[CopyToVGPR];
962 } else {
963 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
964 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
965 VGPRCopies[CopyToVGPR] = Vgpr;
966 }
967 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
969 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
970 B.addReg(Tmp);
971 }
972
973 B.addImm(Defs[I].second);
974 }
975 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
976 return;
977 }
978
979 if (Size != 4)
980 return;
981
982 Register Reg0 = UseMI->getOperand(0).getReg();
983 Register Reg1 = UseMI->getOperand(1).getReg();
984 if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
985 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
986 else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
987 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
988 else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
989 TRI->isAGPR(*MRI, Reg1))
990 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
991 return;
992 }
993
994 unsigned UseOpc = UseMI->getOpcode();
995 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
996 (UseOpc == AMDGPU::V_READLANE_B32 &&
997 (int)UseOpIdx ==
998 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
999 // %vgpr = V_MOV_B32 imm
1000 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1001 // =>
1002 // %sgpr = S_MOV_B32 imm
1003 if (FoldingImmLike) {
1005 UseMI->getOperand(UseOpIdx).getReg(),
1006 *OpToFold.getParent(),
1007 *UseMI))
1008 return;
1009
1010 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1011
1012 if (OpToFold.isImm())
1013 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1014 else
1016 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1017 return;
1018 }
1019
1020 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1022 UseMI->getOperand(UseOpIdx).getReg(),
1023 *OpToFold.getParent(),
1024 *UseMI))
1025 return;
1026
1027 // %vgpr = COPY %sgpr0
1028 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1029 // =>
1030 // %sgpr1 = COPY %sgpr0
1031 UseMI->setDesc(TII->get(AMDGPU::COPY));
1032 UseMI->getOperand(1).setReg(OpToFold.getReg());
1033 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1034 UseMI->getOperand(1).setIsKill(false);
1035 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1036 return;
1037 }
1038 }
1039
1040 const MCInstrDesc &UseDesc = UseMI->getDesc();
1041
1042 // Don't fold into target independent nodes. Target independent opcodes
1043 // don't have defined register classes.
1044 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1045 UseDesc.operands()[UseOpIdx].RegClass == -1)
1046 return;
1047 }
1048
1049 if (!FoldingImmLike) {
1050 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
1051 // Don't fold if OpToFold doesn't hold an aligned register.
1052 const TargetRegisterClass *RC =
1053 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
1054 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
1055 unsigned SubReg = OpToFold.getSubReg();
1056 if (const TargetRegisterClass *SubRC =
1057 TRI->getSubRegisterClass(RC, SubReg))
1058 RC = SubRC;
1059 }
1060
1061 if (!RC || !TRI->isProperlyAlignedRC(*RC))
1062 return;
1063 }
1064
1065 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1066
1067 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1068 // to enable more folding opportunities. The shrink operands pass
1069 // already does this.
1070 return;
1071 }
1072
1073
1074 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
1075 const TargetRegisterClass *FoldRC =
1076 TRI->getRegClass(FoldDesc.operands()[0].RegClass);
1077
1078 // Split 64-bit constants into 32-bits for folding.
1079 if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
1080 Register UseReg = UseOp->getReg();
1081 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
1082 if (AMDGPU::getRegBitWidth(*UseRC) != 64)
1083 return;
1084
1085 APInt Imm(64, OpToFold.getImm());
1086 if (UseOp->getSubReg() == AMDGPU::sub0) {
1087 Imm = Imm.getLoBits(32);
1088 } else {
1089 assert(UseOp->getSubReg() == AMDGPU::sub1);
1090 Imm = Imm.getHiBits(32);
1091 }
1092
1093 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
1094 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
1095 return;
1096 }
1097
1098 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1099}
1100
1101static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1102 uint32_t LHS, uint32_t RHS) {
1103 switch (Opcode) {
1104 case AMDGPU::V_AND_B32_e64:
1105 case AMDGPU::V_AND_B32_e32:
1106 case AMDGPU::S_AND_B32:
1107 Result = LHS & RHS;
1108 return true;
1109 case AMDGPU::V_OR_B32_e64:
1110 case AMDGPU::V_OR_B32_e32:
1111 case AMDGPU::S_OR_B32:
1112 Result = LHS | RHS;
1113 return true;
1114 case AMDGPU::V_XOR_B32_e64:
1115 case AMDGPU::V_XOR_B32_e32:
1116 case AMDGPU::S_XOR_B32:
1117 Result = LHS ^ RHS;
1118 return true;
1119 case AMDGPU::S_XNOR_B32:
1120 Result = ~(LHS ^ RHS);
1121 return true;
1122 case AMDGPU::S_NAND_B32:
1123 Result = ~(LHS & RHS);
1124 return true;
1125 case AMDGPU::S_NOR_B32:
1126 Result = ~(LHS | RHS);
1127 return true;
1128 case AMDGPU::S_ANDN2_B32:
1129 Result = LHS & ~RHS;
1130 return true;
1131 case AMDGPU::S_ORN2_B32:
1132 Result = LHS | ~RHS;
1133 return true;
1134 case AMDGPU::V_LSHL_B32_e64:
1135 case AMDGPU::V_LSHL_B32_e32:
1136 case AMDGPU::S_LSHL_B32:
1137 // The instruction ignores the high bits for out of bounds shifts.
1138 Result = LHS << (RHS & 31);
1139 return true;
1140 case AMDGPU::V_LSHLREV_B32_e64:
1141 case AMDGPU::V_LSHLREV_B32_e32:
1142 Result = RHS << (LHS & 31);
1143 return true;
1144 case AMDGPU::V_LSHR_B32_e64:
1145 case AMDGPU::V_LSHR_B32_e32:
1146 case AMDGPU::S_LSHR_B32:
1147 Result = LHS >> (RHS & 31);
1148 return true;
1149 case AMDGPU::V_LSHRREV_B32_e64:
1150 case AMDGPU::V_LSHRREV_B32_e32:
1151 Result = RHS >> (LHS & 31);
1152 return true;
1153 case AMDGPU::V_ASHR_I32_e64:
1154 case AMDGPU::V_ASHR_I32_e32:
1155 case AMDGPU::S_ASHR_I32:
1156 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1157 return true;
1158 case AMDGPU::V_ASHRREV_I32_e64:
1159 case AMDGPU::V_ASHRREV_I32_e32:
1160 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1161 return true;
1162 default:
1163 return false;
1164 }
1165}
1166
1167static unsigned getMovOpc(bool IsScalar) {
1168 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1169}
1170
1171static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1172 MI.setDesc(NewDesc);
1173
1174 // Remove any leftover implicit operands from mutating the instruction. e.g.
1175 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1176 // anymore.
1177 const MCInstrDesc &Desc = MI.getDesc();
1178 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1179 Desc.implicit_defs().size();
1180
1181 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1182 MI.removeOperand(I);
1183}
1184
1186SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
1187 // If this has a subregister, it obviously is a register source.
1188 if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
1189 !Op.getReg().isVirtual())
1190 return &Op;
1191
1192 MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1193 if (Def && Def->isMoveImmediate()) {
1194 MachineOperand &ImmSrc = Def->getOperand(1);
1195 if (ImmSrc.isImm())
1196 return &ImmSrc;
1197 }
1198
1199 return &Op;
1200}
1201
1202// Try to simplify operations with a constant that may appear after instruction
1203// selection.
1204// TODO: See if a frame index with a fixed offset can fold.
1205bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
1206 if (!MI->allImplicitDefsAreDead())
1207 return false;
1208
1209 unsigned Opc = MI->getOpcode();
1210
1211 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1212 if (Src0Idx == -1)
1213 return false;
1214 MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));
1215
1216 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1217 Opc == AMDGPU::S_NOT_B32) &&
1218 Src0->isImm()) {
1219 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1220 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1221 return true;
1222 }
1223
1224 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1225 if (Src1Idx == -1)
1226 return false;
1227 MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));
1228
1229 if (!Src0->isImm() && !Src1->isImm())
1230 return false;
1231
1232 // and k0, k1 -> v_mov_b32 (k0 & k1)
1233 // or k0, k1 -> v_mov_b32 (k0 | k1)
1234 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1235 if (Src0->isImm() && Src1->isImm()) {
1236 int32_t NewImm;
1237 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1238 return false;
1239
1240 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1241
1242 // Be careful to change the right operand, src0 may belong to a different
1243 // instruction.
1244 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1245 MI->removeOperand(Src1Idx);
1246 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1247 return true;
1248 }
1249
1250 if (!MI->isCommutable())
1251 return false;
1252
1253 if (Src0->isImm() && !Src1->isImm()) {
1254 std::swap(Src0, Src1);
1255 std::swap(Src0Idx, Src1Idx);
1256 }
1257
1258 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1259 if (Opc == AMDGPU::V_OR_B32_e64 ||
1260 Opc == AMDGPU::V_OR_B32_e32 ||
1261 Opc == AMDGPU::S_OR_B32) {
1262 if (Src1Val == 0) {
1263 // y = or x, 0 => y = copy x
1264 MI->removeOperand(Src1Idx);
1265 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1266 } else if (Src1Val == -1) {
1267 // y = or x, -1 => y = v_mov_b32 -1
1268 MI->removeOperand(Src1Idx);
1269 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1270 } else
1271 return false;
1272
1273 return true;
1274 }
1275
1276 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1277 Opc == AMDGPU::S_AND_B32) {
1278 if (Src1Val == 0) {
1279 // y = and x, 0 => y = v_mov_b32 0
1280 MI->removeOperand(Src0Idx);
1281 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1282 } else if (Src1Val == -1) {
1283 // y = and x, -1 => y = copy x
1284 MI->removeOperand(Src1Idx);
1285 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1286 } else
1287 return false;
1288
1289 return true;
1290 }
1291
1292 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1293 Opc == AMDGPU::S_XOR_B32) {
1294 if (Src1Val == 0) {
1295 // y = xor x, 0 => y = copy x
1296 MI->removeOperand(Src1Idx);
1297 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1298 return true;
1299 }
1300 }
1301
1302 return false;
1303}
1304
1305// Try to fold an instruction into a simpler one
1306bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
1307 unsigned Opc = MI.getOpcode();
1308 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1309 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1310 return false;
1311
1312 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1313 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1314 if (!Src1->isIdenticalTo(*Src0)) {
1315 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1316 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1317 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1318 return false;
1319 }
1320
1321 int Src1ModIdx =
1322 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1323 int Src0ModIdx =
1324 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1325 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1326 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1327 return false;
1328
1329 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1330 auto &NewDesc =
1331 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1332 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1333 if (Src2Idx != -1)
1334 MI.removeOperand(Src2Idx);
1335 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1336 if (Src1ModIdx != -1)
1337 MI.removeOperand(Src1ModIdx);
1338 if (Src0ModIdx != -1)
1339 MI.removeOperand(Src0ModIdx);
1340 mutateCopyOp(MI, NewDesc);
1341 LLVM_DEBUG(dbgs() << MI);
1342 return true;
1343}
1344
1345bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
1346 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1347 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1348 return false;
1349
1350 MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
1351 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1352 return false;
1353
1354 Register Src1 = MI.getOperand(2).getReg();
1355 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1356 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1357 return false;
1358
1359 Register Dst = MI.getOperand(0).getReg();
1360 MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
1361 MI.eraseFromParent();
1362 return true;
1363}
1364
1365bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
1366 MachineOperand &OpToFold) const {
1367 // We need mutate the operands of new mov instructions to add implicit
1368 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1369 // this.
1370 SmallVector<MachineInstr *, 4> CopiesToReplace;
1372 MachineOperand &Dst = MI.getOperand(0);
1373 bool Changed = false;
1374
1375 if (OpToFold.isImm()) {
1376 for (auto &UseMI :
1377 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1378 // Folding the immediate may reveal operations that can be constant
1379 // folded or replaced with a copy. This can happen for example after
1380 // frame indices are lowered to constants or from splitting 64-bit
1381 // constants.
1382 //
1383 // We may also encounter cases where one or both operands are
1384 // immediates materialized into a register, which would ordinarily not
1385 // be folded due to multiple uses or operand constraints.
1386 if (tryConstantFoldOp(&UseMI)) {
1387 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1388 Changed = true;
1389 }
1390 }
1391 }
1392
1394 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1395 UsesToProcess.push_back(&Use);
1396 for (auto *U : UsesToProcess) {
1397 MachineInstr *UseMI = U->getParent();
1398 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1399 CopiesToReplace);
1400 }
1401
1402 if (CopiesToReplace.empty() && FoldList.empty())
1403 return Changed;
1404
1405 MachineFunction *MF = MI.getParent()->getParent();
1406 // Make sure we add EXEC uses to any new v_mov instructions created.
1407 for (MachineInstr *Copy : CopiesToReplace)
1408 Copy->addImplicitDefUseOperands(*MF);
1409
1410 for (FoldCandidate &Fold : FoldList) {
1411 assert(!Fold.isReg() || Fold.OpToFold);
1412 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1413 Register Reg = Fold.OpToFold->getReg();
1414 MachineInstr *DefMI = Fold.OpToFold->getParent();
1415 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1416 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1417 continue;
1418 }
1419 if (updateOperand(Fold)) {
1420 // Clear kill flags.
1421 if (Fold.isReg()) {
1422 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1423 // FIXME: Probably shouldn't bother trying to fold if not an
1424 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1425 // copies.
1426 MRI->clearKillFlags(Fold.OpToFold->getReg());
1427 }
1428 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1429 << static_cast<int>(Fold.UseOpNo) << " of "
1430 << *Fold.UseMI);
1431 } else if (Fold.Commuted) {
1432 // Restoring instruction's original operand order if fold has failed.
1433 TII->commuteInstruction(*Fold.UseMI, false);
1434 }
1435 }
1436 return true;
1437}
1438
1439bool SIFoldOperands::tryFoldFoldableCopy(
1440 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1441 // Specially track simple redefs of m0 to the same value in a block, so we
1442 // can erase the later ones.
1443 if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1444 MachineOperand &NewM0Val = MI.getOperand(1);
1445 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1446 MI.eraseFromParent();
1447 return true;
1448 }
1449
1450 // We aren't tracking other physical registers
1451 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1452 ? nullptr
1453 : &NewM0Val;
1454 return false;
1455 }
1456
1457 MachineOperand &OpToFold = MI.getOperand(1);
1458 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1459
1460 // FIXME: We could also be folding things like TargetIndexes.
1461 if (!FoldingImm && !OpToFold.isReg())
1462 return false;
1463
1464 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1465 return false;
1466
1467 // Prevent folding operands backwards in the function. For example,
1468 // the COPY opcode must not be replaced by 1 in this example:
1469 //
1470 // %3 = COPY %vgpr0; VGPR_32:%3
1471 // ...
1472 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1473 if (!MI.getOperand(0).getReg().isVirtual())
1474 return false;
1475
1476 bool Changed = foldInstOperand(MI, OpToFold);
1477
1478 // If we managed to fold all uses of this copy then we might as well
1479 // delete it now.
1480 // The only reason we need to follow chains of copies here is that
1481 // tryFoldRegSequence looks forward through copies before folding a
1482 // REG_SEQUENCE into its eventual users.
1483 auto *InstToErase = &MI;
1484 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1485 auto &SrcOp = InstToErase->getOperand(1);
1486 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1487 InstToErase->eraseFromParent();
1488 Changed = true;
1489 InstToErase = nullptr;
1490 if (!SrcReg || SrcReg.isPhysical())
1491 break;
1492 InstToErase = MRI->getVRegDef(SrcReg);
1493 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1494 break;
1495 }
1496
1497 if (InstToErase && InstToErase->isRegSequence() &&
1498 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1499 InstToErase->eraseFromParent();
1500 Changed = true;
1501 }
1502
1503 return Changed;
1504}
1505
1506// Clamp patterns are canonically selected to v_max_* instructions, so only
1507// handle them.
1508const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1509 unsigned Op = MI.getOpcode();
1510 switch (Op) {
1511 case AMDGPU::V_MAX_F32_e64:
1512 case AMDGPU::V_MAX_F16_e64:
1513 case AMDGPU::V_MAX_F16_t16_e64:
1514 case AMDGPU::V_MAX_F16_fake16_e64:
1515 case AMDGPU::V_MAX_F64_e64:
1516 case AMDGPU::V_MAX_NUM_F64_e64:
1517 case AMDGPU::V_PK_MAX_F16: {
1518 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1519 return nullptr;
1520
1521 // Make sure sources are identical.
1522 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1523 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1524 if (!Src0->isReg() || !Src1->isReg() ||
1525 Src0->getReg() != Src1->getReg() ||
1526 Src0->getSubReg() != Src1->getSubReg() ||
1527 Src0->getSubReg() != AMDGPU::NoSubRegister)
1528 return nullptr;
1529
1530 // Can't fold up if we have modifiers.
1531 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1532 return nullptr;
1533
1534 unsigned Src0Mods
1535 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1536 unsigned Src1Mods
1537 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1538
1539 // Having a 0 op_sel_hi would require swizzling the output in the source
1540 // instruction, which we can't do.
1541 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1542 : 0u;
1543 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1544 return nullptr;
1545 return Src0;
1546 }
1547 default:
1548 return nullptr;
1549 }
1550}
1551
1552// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1553bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1554 const MachineOperand *ClampSrc = isClamp(MI);
1555 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1556 return false;
1557
1558 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1559
1560 // The type of clamp must be compatible.
1561 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1562 return false;
1563
1564 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1565 if (!DefClamp)
1566 return false;
1567
1568 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1569
1570 // Clamp is applied after omod, so it is OK if omod is set.
1571 DefClamp->setImm(1);
1572 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1573 MI.eraseFromParent();
1574
1575 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1576 // instruction, so we might as well convert it to the more flexible VOP3-only
1577 // mad/fma form.
1578 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1579 Def->eraseFromParent();
1580
1581 return true;
1582}
1583
1584static int getOModValue(unsigned Opc, int64_t Val) {
1585 switch (Opc) {
1586 case AMDGPU::V_MUL_F64_e64:
1587 case AMDGPU::V_MUL_F64_pseudo_e64: {
1588 switch (Val) {
1589 case 0x3fe0000000000000: // 0.5
1590 return SIOutMods::DIV2;
1591 case 0x4000000000000000: // 2.0
1592 return SIOutMods::MUL2;
1593 case 0x4010000000000000: // 4.0
1594 return SIOutMods::MUL4;
1595 default:
1596 return SIOutMods::NONE;
1597 }
1598 }
1599 case AMDGPU::V_MUL_F32_e64: {
1600 switch (static_cast<uint32_t>(Val)) {
1601 case 0x3f000000: // 0.5
1602 return SIOutMods::DIV2;
1603 case 0x40000000: // 2.0
1604 return SIOutMods::MUL2;
1605 case 0x40800000: // 4.0
1606 return SIOutMods::MUL4;
1607 default:
1608 return SIOutMods::NONE;
1609 }
1610 }
1611 case AMDGPU::V_MUL_F16_e64:
1612 case AMDGPU::V_MUL_F16_t16_e64:
1613 case AMDGPU::V_MUL_F16_fake16_e64: {
1614 switch (static_cast<uint16_t>(Val)) {
1615 case 0x3800: // 0.5
1616 return SIOutMods::DIV2;
1617 case 0x4000: // 2.0
1618 return SIOutMods::MUL2;
1619 case 0x4400: // 4.0
1620 return SIOutMods::MUL4;
1621 default:
1622 return SIOutMods::NONE;
1623 }
1624 }
1625 default:
1626 llvm_unreachable("invalid mul opcode");
1627 }
1628}
1629
1630// FIXME: Does this really not support denormals with f16?
1631// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1632// handled, so will anything other than that break?
1633std::pair<const MachineOperand *, int>
1634SIFoldOperands::isOMod(const MachineInstr &MI) const {
1635 unsigned Op = MI.getOpcode();
1636 switch (Op) {
1637 case AMDGPU::V_MUL_F64_e64:
1638 case AMDGPU::V_MUL_F64_pseudo_e64:
1639 case AMDGPU::V_MUL_F32_e64:
1640 case AMDGPU::V_MUL_F16_t16_e64:
1641 case AMDGPU::V_MUL_F16_fake16_e64:
1642 case AMDGPU::V_MUL_F16_e64: {
1643 // If output denormals are enabled, omod is ignored.
1644 if ((Op == AMDGPU::V_MUL_F32_e64 &&
1645 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1646 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1647 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
1648 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1649 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1650 return std::pair(nullptr, SIOutMods::NONE);
1651
1652 const MachineOperand *RegOp = nullptr;
1653 const MachineOperand *ImmOp = nullptr;
1654 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1655 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1656 if (Src0->isImm()) {
1657 ImmOp = Src0;
1658 RegOp = Src1;
1659 } else if (Src1->isImm()) {
1660 ImmOp = Src1;
1661 RegOp = Src0;
1662 } else
1663 return std::pair(nullptr, SIOutMods::NONE);
1664
1665 int OMod = getOModValue(Op, ImmOp->getImm());
1666 if (OMod == SIOutMods::NONE ||
1667 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1668 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1669 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1670 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1671 return std::pair(nullptr, SIOutMods::NONE);
1672
1673 return std::pair(RegOp, OMod);
1674 }
1675 case AMDGPU::V_ADD_F64_e64:
1676 case AMDGPU::V_ADD_F64_pseudo_e64:
1677 case AMDGPU::V_ADD_F32_e64:
1678 case AMDGPU::V_ADD_F16_e64:
1679 case AMDGPU::V_ADD_F16_t16_e64:
1680 case AMDGPU::V_ADD_F16_fake16_e64: {
1681 // If output denormals are enabled, omod is ignored.
1682 if ((Op == AMDGPU::V_ADD_F32_e64 &&
1683 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1684 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1685 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
1686 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1687 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1688 return std::pair(nullptr, SIOutMods::NONE);
1689
1690 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1691 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1692 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1693
1694 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1695 Src0->getSubReg() == Src1->getSubReg() &&
1696 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1697 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1698 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1699 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1700 return std::pair(Src0, SIOutMods::MUL2);
1701
1702 return std::pair(nullptr, SIOutMods::NONE);
1703 }
1704 default:
1705 return std::pair(nullptr, SIOutMods::NONE);
1706 }
1707}
1708
1709// FIXME: Does this need to check IEEE bit on function?
1710bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1711 const MachineOperand *RegOp;
1712 int OMod;
1713 std::tie(RegOp, OMod) = isOMod(MI);
1714 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1715 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1716 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1717 return false;
1718
1719 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1720 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1721 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1722 return false;
1723
1724 // Clamp is applied after omod. If the source already has clamp set, don't
1725 // fold it.
1726 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1727 return false;
1728
1729 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1730
1731 DefOMod->setImm(OMod);
1732 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1733 MI.eraseFromParent();
1734
1735 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1736 // instruction, so we might as well convert it to the more flexible VOP3-only
1737 // mad/fma form.
1738 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1739 Def->eraseFromParent();
1740
1741 return true;
1742}
1743
1744// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1745// instruction which can take an agpr. So far that means a store.
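// Roughly (illustrative MIR, operand details elided):
//   %1:vreg = COPY %0:areg
//   %2:vreg = REG_SEQUENCE %1:vreg, sub0, ...
//   GLOBAL_STORE ..., %2:vreg, ...
// =>
//   %2:areg = REG_SEQUENCE %0:areg, sub0, ...
//   GLOBAL_STORE ..., %2:areg, ...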
1746bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1747 assert(MI.isRegSequence());
1748 auto Reg = MI.getOperand(0).getReg();
1749
1750 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1751 !MRI->hasOneNonDBGUse(Reg))
1752 return false;
1753
1754 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
1755 if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1756 return false;
1757
1758 for (auto &Def : Defs) {
1759 const auto *Op = Def.first;
1760 if (!Op->isReg())
1761 return false;
1762 if (TRI->isAGPR(*MRI, Op->getReg()))
1763 continue;
1764 // Maybe this is a COPY from AREG
1765 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1766 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1767 return false;
1768 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1769 return false;
1770 }
1771
1772 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1773 MachineInstr *UseMI = Op->getParent();
1774 while (UseMI->isCopy() && !Op->getSubReg()) {
1775 Reg = UseMI->getOperand(0).getReg();
1776 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1777 return false;
1778 Op = &*MRI->use_nodbg_begin(Reg);
1779 UseMI = Op->getParent();
1780 }
1781
1782 if (Op->getSubReg())
1783 return false;
1784
1785 unsigned OpIdx = Op - &UseMI->getOperand(0);
1786 const MCInstrDesc &InstDesc = UseMI->getDesc();
1787 const TargetRegisterClass *OpRC =
1788 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1789 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1790 return false;
1791
1792 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1793 auto Dst = MRI->createVirtualRegister(NewDstRC);
1794 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1795 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1796
1797 for (unsigned I = 0; I < Defs.size(); ++I) {
1798 MachineOperand *Def = Defs[I].first;
1799 Def->setIsKill(false);
1800 if (TRI->isAGPR(*MRI, Def->getReg())) {
1801 RS.add(*Def);
1802 } else { // This is a copy
1803 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1804 SubDef->getOperand(1).setIsKill(false);
1805 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1806 }
1807 RS.addImm(Defs[I].second);
1808 }
1809
1810 Op->setReg(Dst);
1811 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1812 Op->setReg(Reg);
1813 RS->eraseFromParent();
1814 return false;
1815 }
1816
1817 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1818
1819 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1820 // in which case we can erase them all later in runOnMachineFunction.
1821 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1822 MI.eraseFromParent();
1823 return true;
1824}
1825
1826/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success and
1827/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
1828static bool isAGPRCopy(const SIRegisterInfo &TRI,
1829 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1830 Register &OutReg, unsigned &OutSubReg) {
1831 assert(Copy.isCopy());
1832
1833 const MachineOperand &CopySrc = Copy.getOperand(1);
1834 Register CopySrcReg = CopySrc.getReg();
1835 if (!CopySrcReg.isVirtual())
1836 return false;
1837
1838 // Common case: copy from AGPR directly, e.g.
1839 // %1:vgpr_32 = COPY %0:agpr_32
1840 if (TRI.isAGPR(MRI, CopySrcReg)) {
1841 OutReg = CopySrcReg;
1842 OutSubReg = CopySrc.getSubReg();
1843 return true;
1844 }
1845
1846 // Sometimes it can also involve two copies, e.g.
1847 // %1:vgpr_256 = COPY %0:agpr_256
1848 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
1849 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
1850 if (!CopySrcDef || !CopySrcDef->isCopy())
1851 return false;
1852
1853 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
1854 Register OtherCopySrcReg = OtherCopySrc.getReg();
1855 if (!OtherCopySrcReg.isVirtual() ||
1856 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
1857 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
1858 !TRI.isAGPR(MRI, OtherCopySrcReg))
1859 return false;
1860
1861 OutReg = OtherCopySrcReg;
1862 OutSubReg = CopySrc.getSubReg();
1863 return true;
1864}
1865
1866// Try to hoist an AGPR to VGPR copy across a PHI.
1867// This should allow folding of an AGPR into a consumer which may support it.
1868//
1869// Example 1: LCSSA PHI
1870// loop:
1871// %1:vreg = COPY %0:areg
1872// exit:
1873// %2:vreg = PHI %1:vreg, %loop
1874// =>
1875// loop:
1876// exit:
1877// %1:areg = PHI %0:areg, %loop
1878// %2:vreg = COPY %1:areg
1879//
1880// Example 2: PHI with multiple incoming values:
1881// entry:
1882// %1:vreg = GLOBAL_LOAD(..)
1883// loop:
1884// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
1885// %3:areg = COPY %2:vreg
1886// %4:areg = (instr using %3:areg)
1887// %5:vreg = COPY %4:areg
1888// =>
1889// entry:
1890// %1:vreg = GLOBAL_LOAD(..)
1891// %2:areg = COPY %1:vreg
1892// loop:
1893// %3:areg = PHI %2:areg, %entry, %X:areg, %loop
1894// %4:areg = (instr using %3:areg)
1895bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
1896 assert(PHI.isPHI());
1897
1898 Register PhiOut = PHI.getOperand(0).getReg();
1899 if (!TRI->isVGPR(*MRI, PhiOut))
1900 return false;
1901
1902 // Iterate once over all incoming values of the PHI to check if this PHI is
1903 // eligible, and determine the exact AGPR RC we'll target.
1904 const TargetRegisterClass *ARC = nullptr;
1905 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1906 MachineOperand &MO = PHI.getOperand(K);
1907 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
1908 if (!Copy || !Copy->isCopy())
1909 continue;
1910
1911 Register AGPRSrc;
1912 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1913 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
1914 continue;
1915
1916 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
1917 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1918 CopyInRC = SubRC;
1919
1920 if (ARC && !ARC->hasSubClassEq(CopyInRC))
1921 return false;
1922 ARC = CopyInRC;
1923 }
1924
1925 if (!ARC)
1926 return false;
1927
1928 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1929
1930 // Rewrite the PHI's incoming values to ARC.
1931 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
1932 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1933 MachineOperand &MO = PHI.getOperand(K);
1934 Register Reg = MO.getReg();
1935
1936 MachineBasicBlock::iterator InsertPt;
1937 MachineBasicBlock *InsertMBB = nullptr;
1938
1939 // Look at the def of Reg, ignoring all copies.
1940 unsigned CopyOpc = AMDGPU::COPY;
1941 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
1942
1943 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
1944 // the copy was single-use, it will be removed by DCE later.
1945 if (Def->isCopy()) {
1946 Register AGPRSrc;
1947 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1948 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
1949 MO.setReg(AGPRSrc);
1950 MO.setSubReg(AGPRSubReg);
1951 continue;
1952 }
1953
1954 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
1954 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
1956 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
1957 // is unlikely to be profitable.
1958 //
1959 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
1960 MachineOperand &CopyIn = Def->getOperand(1);
1961 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
1962 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
1963 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1964 }
1965
1966 InsertMBB = Def->getParent();
1967 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
1968 } else {
1969 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
1970 InsertPt = InsertMBB->getFirstTerminator();
1971 }
1972
1973 Register NewReg = MRI->createVirtualRegister(ARC);
1974 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
1975 TII->get(CopyOpc), NewReg)
1976 .addReg(Reg);
1977 MO.setReg(NewReg);
1978
1979 (void)MI;
1980 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
1981 }
1982
1983 // Replace the PHI's result with a new register.
1984 Register NewReg = MRI->createVirtualRegister(ARC);
1985 PHI.getOperand(0).setReg(NewReg);
1986
1987 // COPY that new register back to the original PhiOut register. This COPY will
1988 // usually be folded out later.
1989 MachineBasicBlock *MBB = PHI.getParent();
1990 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
1991 TII->get(AMDGPU::COPY), PhiOut)
1992 .addReg(NewReg);
1993
1994 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
1995 return true;
1996}
1997
1998// Attempt to convert VGPR load to an AGPR load.
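// Roughly (illustrative): if every user of the loaded value is (transitively) a
// copy or reg_sequence that ends up in an AGPR, e.g.
//   %0:vreg = GLOBAL_LOAD_DWORD ...
//   %1:areg = COPY %0:vreg
// then %0 (and any intermediate virtual registers) are reclassified to the
// equivalent AGPR register class, provided the load can legally define an AGPR.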
1999bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
2000 assert(MI.mayLoad());
2001 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2002 return false;
2003
2004 MachineOperand &Def = MI.getOperand(0);
2005 if (!Def.isDef())
2006 return false;
2007
2008 Register DefReg = Def.getReg();
2009
2010 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2011 return false;
2012
2013 SmallVector<const MachineInstr *, 8> Users;
2014 SmallVector<Register, 8> MoveRegs;
2015 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
2016 Users.push_back(&I);
2017
2018 if (Users.empty())
2019 return false;
2020
2021 // Check that all uses are copies to an agpr or reg_sequences producing an agpr.
2022 while (!Users.empty()) {
2023 const MachineInstr *I = Users.pop_back_val();
2024 if (!I->isCopy() && !I->isRegSequence())
2025 return false;
2026 Register DstReg = I->getOperand(0).getReg();
2027 // Physical registers may have more than one defining instruction.
2028 if (DstReg.isPhysical())
2029 return false;
2030 if (TRI->isAGPR(*MRI, DstReg))
2031 continue;
2032 MoveRegs.push_back(DstReg);
2033 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2034 Users.push_back(&U);
2035 }
2036
2037 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2038 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2039 if (!TII->isOperandLegal(MI, 0, &Def)) {
2040 MRI->setRegClass(DefReg, RC);
2041 return false;
2042 }
2043
2044 while (!MoveRegs.empty()) {
2045 Register Reg = MoveRegs.pop_back_val();
2046 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2047 }
2048
2049 LLVM_DEBUG(dbgs() << "Folded " << MI);
2050
2051 return true;
2052}
2053
2054// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2055// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2056// there are cases where it can create many more AGPR-AGPR copies, which are
2057// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2058//
2059// This function looks at all AGPR PHIs in a basic block and collects their
2060// operands. Then, it checks for registers that are used more than once across
2061// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2062// having to create one VGPR temporary per use, which can get very messy if
2063// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2064// element).
2065//
2066// Example
2067// a:
2068// %in:agpr_256 = COPY %foo:vgpr_256
2069// c:
2070// %x:agpr_32 = ..
2071// b:
2072// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2073// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2074// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2075// =>
2076// a:
2077// %in:agpr_256 = COPY %foo:vgpr_256
2078// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2079// %tmp_agpr:agpr_32 = COPY %tmp
2080// c:
2081// %x:agpr_32 = ..
2082// b:
2083// %0:areg = PHI %tmp_agpr, %a, %x, %c
2084// %1:areg = PHI %tmp_agpr, %a, %y, %c
2085// %2:areg = PHI %tmp_agpr, %a, %z, %c
2086bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2087 // This is only really needed on GFX908 where AGPR-AGPR copies are
2088 // unreasonably difficult.
2089 if (ST->hasGFX90AInsts())
2090 return false;
2091
2092 // Look at all AGPR Phis and collect the register + subregister used.
2093 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2094 RegToMO;
2095
2096 for (auto &MI : MBB) {
2097 if (!MI.isPHI())
2098 break;
2099
2100 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2101 continue;
2102
2103 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2104 MachineOperand &PhiMO = MI.getOperand(K);
2105 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2106 }
2107 }
2108
2109 // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2110 // a VGPR.
2111 bool Changed = false;
2112 for (const auto &[Entry, MOs] : RegToMO) {
2113 if (MOs.size() == 1)
2114 continue;
2115
2116 const auto [Reg, SubReg] = Entry;
2117 MachineInstr *Def = MRI->getVRegDef(Reg);
2118 MachineBasicBlock *DefMBB = Def->getParent();
2119
2120 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2121 // out.
2122 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2123 Register TempVGPR =
2124 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2125 MachineInstr *VGPRCopy =
2126 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2127 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2128 .addReg(Reg, /* flags */ 0, SubReg);
2129
2130 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2131 Register TempAGPR = MRI->createVirtualRegister(ARC);
2132 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2133 TII->get(AMDGPU::COPY), TempAGPR)
2134 .addReg(TempVGPR);
2135
2136 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2137 for (MachineOperand *MO : MOs) {
2138 MO->setReg(TempAGPR);
2139 MO->setSubReg(AMDGPU::NoSubRegister);
2140 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2141 }
2142
2143 Changed = true;
2144 }
2145
2146 return Changed;
2147}
2148
2149bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
2150 if (skipFunction(MF.getFunction()))
2151 return false;
2152
2153 MRI = &MF.getRegInfo();
2154 ST = &MF.getSubtarget<GCNSubtarget>();
2155 TII = ST->getInstrInfo();
2156 TRI = &TII->getRegisterInfo();
2157 MFI = MF.getInfo<SIMachineFunctionInfo>();
2158
2159 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2160 // correctly handle signed zeros.
2161 //
2162 // FIXME: Also need to check strictfp
2163 bool IsIEEEMode = MFI->getMode().IEEE;
2164 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2165
2166 bool Changed = false;
2167 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2168 MachineOperand *CurrentKnownM0Val = nullptr;
2169 for (auto &MI : make_early_inc_range(*MBB)) {
2170 Changed |= tryFoldCndMask(MI);
2171
2172 if (tryFoldZeroHighBits(MI)) {
2173 Changed = true;
2174 continue;
2175 }
2176
2177 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2178 Changed = true;
2179 continue;
2180 }
2181
2182 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2183 Changed = true;
2184 continue;
2185 }
2186
2187 if (MI.mayLoad() && tryFoldLoad(MI)) {
2188 Changed = true;
2189 continue;
2190 }
2191
2192 if (TII->isFoldableCopy(MI)) {
2193 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2194 continue;
2195 }
2196
2197 // Saw an unknown clobber of m0, so we no longer know what it is.
2198 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2199 CurrentKnownM0Val = nullptr;
2200
2201 // TODO: Omod might be OK if there is NSZ only on the source
2202 // instruction, and not the omod multiply.
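      // If the omod fold is skipped (IEEE mode, or no nsz on either the
      // function or the instruction) or does not apply, still try the clamp
      // fold on this instruction.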
2203 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2204 !tryFoldOMod(MI))
2205 Changed |= tryFoldClamp(MI);
2206 }
2207
2208 Changed |= tryOptimizeAGPRPhis(*MBB);
2209 }
2210
2211 return Changed;
2212}