LLVM 20.0.0git
SIFoldOperands.cpp
Go to the documentation of this file.
1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "AMDGPU.h"
12#include "GCNSubtarget.h"
18
19#define DEBUG_TYPE "si-fold-operands"
20using namespace llvm;
21
22namespace {
23
// Describes one pending fold: the value to substitute and the operand slot of
// the using instruction that will receive it.
// NOTE(review): the constructor initializes members named UseMI and Kind, but
// their declarations do not appear in this listing -- confirm against the
// full file.
24struct FoldCandidate {
 // Payload of the fold. The constructor fills ImmToFold for immediates,
 // FrameIndexToFold for frame indexes, and OpToFold otherwise.
26 union {
27 MachineOperand *OpToFold;
28 uint64_t ImmToFold;
29 int FrameIndexToFold;
30 };
 // Opcode the use should be shrunk to, or -1 when no shrinking is requested
 // (see needsShrink()).
31 int ShrinkOpcode;
 // Index of the operand in the using instruction that is to be replaced.
32 unsigned UseOpNo;
 // Set by callers that commuted the using instruction before recording the
 // candidate.
34 bool Commuted;
35
 // Captures the kind of FoldOp and copies out the payload matching that kind.
 // Only immediates, frame indexes, registers and globals are accepted; the
 // latter two are kept by pointer, so FoldOp must outlive the candidate.
36 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
37 bool Commuted_ = false,
38 int ShrinkOp = -1) :
39 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
40 Kind(FoldOp->getType()),
41 Commuted(Commuted_) {
42 if (FoldOp->isImm()) {
43 ImmToFold = FoldOp->getImm();
44 } else if (FoldOp->isFI()) {
45 FrameIndexToFold = FoldOp->getIndex();
46 } else {
47 assert(FoldOp->isReg() || FoldOp->isGlobal());
48 OpToFold = FoldOp;
49 }
50 }
51
 // The predicates below classify the candidate by the operand kind recorded
 // at construction time.
52 bool isFI() const {
53 return Kind == MachineOperand::MO_FrameIndex;
54 }
55
56 bool isImm() const {
57 return Kind == MachineOperand::MO_Immediate;
58 }
59
60 bool isReg() const {
61 return Kind == MachineOperand::MO_Register;
62 }
63
64 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
65
 // True when a shrink opcode was supplied (ShrinkOpcode is not the -1
 // sentinel).
66 bool needsShrink() const { return ShrinkOpcode != -1; }
67};
68
// Machine function pass that reports itself as "SI Fold Operands". The member
// functions declared here implement the individual folding strategies; the
// pass entry point is runOnMachineFunction().
// NOTE(review): several lines of the class are missing from this listing
// (e.g. the methods use a member named MRI whose declaration is not visible,
// and the foldOperand() parameter list is incomplete) -- consult the full
// file before editing.
69class SIFoldOperands : public MachineFunctionPass {
70public:
 // Pass identification; its address is exported as llvm::SIFoldOperandsID.
71 static char ID;
 // Target hooks and per-function info consulted by the folding helpers.
73 const SIInstrInfo *TII;
74 const SIRegisterInfo *TRI;
75 const GCNSubtarget *ST;
76 const SIMachineFunctionInfo *MFI;
77
 // True if frame-index operand OpToFold may be placed in operand OpNo of
 // UseMI (MUBUF vaddr, or FLAT scratch saddr/vaddr).
78 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
79 const MachineOperand &OpToFold) const;
80
 // Applies a recorded candidate to its use instruction; returns false if the
 // fold could not be performed.
81 bool updateOperand(FoldCandidate &Fold) const;
82
 // Whether the use is a packed instruction whose operand may take an
 // immediate together with op_sel adjustments.
83 bool canUseImmWithOpSel(FoldCandidate &Fold) const;
84
 // Tries to encode the immediate as an inline constant, adjusting op_sel bits
 // (and possibly the opcode) as needed.
85 bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
86
 // Records a fold of OpToFold into operand OpNo of MI if it can be made
 // legal; may rewrite MI while doing so.
87 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
88 MachineInstr *MI, unsigned OpNo,
89 MachineOperand *OpToFold) const;
 // False for SDWA instructions, true otherwise.
90 bool isUseSafeToFold(const MachineInstr &MI,
91 const MachineOperand &UseMO) const;
 // Collects the per-subregister initializers of the REG_SEQUENCE defining
 // UseReg; returns false if UseReg is not defined by a REG_SEQUENCE.
92 bool
93 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
94 Register UseReg, uint8_t OpTy) const;
 // Tries to fold an inline constant (directly, through a foldable copy, or as
 // a splat REG_SEQUENCE) into operand UseOpIdx of UseMI.
95 bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
96 unsigned UseOpIdx,
97 SmallVectorImpl<FoldCandidate> &FoldList) const;
98 void foldOperand(MachineOperand &OpToFold,
100 int UseOpIdx,
102 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
103
104 MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
105 bool tryConstantFoldOp(MachineInstr *MI) const;
106 bool tryFoldCndMask(MachineInstr &MI) const;
107 bool tryFoldZeroHighBits(MachineInstr &MI) const;
108 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
109 bool tryFoldFoldableCopy(MachineInstr &MI,
110 MachineOperand *&CurrentKnownM0Val) const;
111
112 const MachineOperand *isClamp(const MachineInstr &MI) const;
113 bool tryFoldClamp(MachineInstr &MI);
114
115 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
116 bool tryFoldOMod(MachineInstr &MI);
117 bool tryFoldRegSequence(MachineInstr &MI);
118 bool tryFoldPhiAGPR(MachineInstr &MI);
119 bool tryFoldLoad(MachineInstr &MI);
120
121 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
122
123public:
124 SIFoldOperands() : MachineFunctionPass(ID) {
126 }
127
128 bool runOnMachineFunction(MachineFunction &MF) override;
129
130 StringRef getPassName() const override { return "SI Fold Operands"; }
131
 // The pass declares that it leaves the CFG intact.
132 void getAnalysisUsage(AnalysisUsage &AU) const override {
133 AU.setPreservesCFG();
135 }
136};
137
138} // End anonymous namespace.
139
// Register the pass with the pass registry under the DEBUG_TYPE string
// ("si-fold-operands") and the display name "SI Fold Operands".
140INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
141 "SI Fold Operands", false, false)
142
// Storage for the pass identifier; the pass constructor passes it to the
// MachineFunctionPass base.
143char SIFoldOperands::ID = 0;
144
// Exported alias of the identifier above so that code outside this file can
// refer to the pass.
145char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
146
149 const MachineOperand &MO) {
150 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
151 if (const TargetRegisterClass *SubRC =
152 TRI.getSubRegisterClass(RC, MO.getSubReg()))
153 RC = SubRC;
154 return RC;
155}
156
157// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
158static unsigned macToMad(unsigned Opc) {
159 switch (Opc) {
160 case AMDGPU::V_MAC_F32_e64:
161 return AMDGPU::V_MAD_F32_e64;
162 case AMDGPU::V_MAC_F16_e64:
163 return AMDGPU::V_MAD_F16_e64;
164 case AMDGPU::V_FMAC_F32_e64:
165 return AMDGPU::V_FMA_F32_e64;
166 case AMDGPU::V_FMAC_F16_e64:
167 return AMDGPU::V_FMA_F16_gfx9_e64;
168 case AMDGPU::V_FMAC_F16_t16_e64:
169 return AMDGPU::V_FMA_F16_gfx9_e64;
170 case AMDGPU::V_FMAC_LEGACY_F32_e64:
171 return AMDGPU::V_FMA_LEGACY_F32_e64;
172 case AMDGPU::V_FMAC_F64_e64:
173 return AMDGPU::V_FMA_F64_e64;
174 }
175 return AMDGPU::INSTRUCTION_LIST_END;
176}
177
178// TODO: Add heuristic that the frame index might not fit in the addressing mode
179// immediate offset to avoid materializing in loops.
180bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
181 const MachineOperand &OpToFold) const {
182 if (!OpToFold.isFI())
183 return false;
184
185 const unsigned Opc = UseMI.getOpcode();
186 if (TII->isMUBUF(UseMI))
187 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
188 if (!TII->isFLATScratch(UseMI))
189 return false;
190
191 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
192 if (OpNo == SIdx)
193 return true;
194
195 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
196 return OpNo == VIdx && SIdx == -1;
197}
198
200 return new SIFoldOperands();
201}
202
// Reports whether the use operand described by Fold may take an immediate in
// combination with op_sel handling. Expects the current operand to be a
// register and the candidate to be an immediate.
// NOTE(review): the case labels of the switch below are missing from this
// listing (only `default` and a trailing `break` are visible), so the set of
// accepted operand types cannot be read off here -- consult the full file.
203bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
204 MachineInstr *MI = Fold.UseMI;
205 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
206 const uint64_t TSFlags = MI->getDesc().TSFlags;
207
208 assert(Old.isReg() && Fold.isImm());
209
 // Only packed instructions qualify; MAI, WMMA and SWMMAC instructions are
 // excluded, as are DOT instructions on subtargets reporting the DOT op_sel
 // hazard.
210 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
211 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
212 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
213 return false;
214
 // Dispatch on the declared operand type of the slot being folded into.
215 unsigned Opcode = MI->getOpcode();
216 int OpNo = MI->getOperandNo(&Old);
217 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
218 switch (OpType) {
219 default:
220 return false;
227 break;
228 }
229
230 return true;
231}
232
// Tries to replace the use operand of Fold with an inline constant, rewriting
// the operand's op_sel modifier bits (and, for V_PK_ADD_U16 / V_PK_SUB_U16,
// possibly the opcode) so that the instruction still reads the same 16-bit
// halves. Returns true if the operand was changed to an immediate, false if no
// inline encoding was found.
// NOTE(review): two lines of the helper lambda are missing from this listing
// (after the first `Mod.setImm(NewModVal);` and in the integer-only `if`
// condition) -- consult the full file before editing that region.
233bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
234 MachineInstr *MI = Fold.UseMI;
235 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
236 unsigned Opcode = MI->getOpcode();
237 int OpNo = MI->getOperandNo(&Old);
238 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
239
240 // If the literal can be inlined as-is, apply it and short-circuit the
241 // tests below. The main motivation for this is to avoid unintuitive
242 // uses of opsel.
243 if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
244 Old.ChangeToImmediate(Fold.ImmToFold);
245 return true;
246 }
247
248 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
249 // op_sel in a way that allows an inline constant.
 // Work out which source slot (src0/src1/src2) is being folded so that the
 // matching *_modifiers operand can be located.
250 int ModIdx = -1;
251 unsigned SrcIdx = ~0;
252 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
253 ModIdx = AMDGPU::OpName::src0_modifiers;
254 SrcIdx = 0;
255 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
256 ModIdx = AMDGPU::OpName::src1_modifiers;
257 SrcIdx = 1;
258 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
259 ModIdx = AMDGPU::OpName::src2_modifiers;
260 SrcIdx = 2;
261 }
262 assert(ModIdx != -1);
 // ModIdx held an operand *name* up to here; convert it to an operand index.
263 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
264 MachineOperand &Mod = MI->getOperand(ModIdx);
265 unsigned ModVal = Mod.getImm();
266
 // Resolve the current op_sel bits: each result half takes the upper 16 bits
 // of the immediate when its OP_SEL bit is set and the lower 16 bits
 // otherwise. Imm is the value as selected with both OP_SEL bits stripped
 // (NewModVal).
267 uint16_t ImmLo = static_cast<uint16_t>(
268 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
269 uint16_t ImmHi = static_cast<uint16_t>(
270 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
271 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
272 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
273
274 // Helper function that attempts to inline the given value with a newly
275 // chosen opsel pattern.
276 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
277 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
278 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
279 Old.ChangeToImmediate(Imm);
280 return true;
281 }
282
283 // Try to shuffle the halves around and leverage opsel to get an inline
284 // constant.
285 uint16_t Lo = static_cast<uint16_t>(Imm);
286 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
287 if (Lo == Hi) {
 // Both halves are equal: try the 16-bit value alone with both OP_SEL bits
 // cleared.
288 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
289 Mod.setImm(NewModVal);
291 return true;
292 }
293
 // Negative 16-bit value: retry with its sign-extended 32-bit form.
294 if (static_cast<int16_t>(Lo) < 0) {
295 int32_t SExt = static_cast<int16_t>(Lo);
296 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
297 Mod.setImm(NewModVal);
298 Old.ChangeToImmediate(SExt);
299 return true;
300 }
301 }
302
303 // This check is only useful for integer instructions
304 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
306 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
307 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
308 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
309 return true;
310 }
311 }
312 } else {
 // Halves differ: try the value with its halves exchanged, setting OP_SEL_0
 // only.
313 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
314 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
315 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
316 Old.ChangeToImmediate(Swapped);
317 return true;
318 }
319 }
320
321 return false;
322 };
323
324 if (tryFoldToInline(Imm))
325 return true;
326
327 // Replace integer addition by subtraction and vice versa if it allows
328 // folding the immediate to an inline constant.
329 //
330 // We should only ever get here for SrcIdx == 1 due to canonicalization
331 // earlier in the pipeline, but we double-check here to be safe / fully
332 // general.
333 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
334 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
335 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
336 unsigned ClampIdx =
337 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
338 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
339
 // The add<->sub swap is only attempted when the clamp bit is clear.
340 if (!Clamp) {
 // Negate each 16-bit half independently (modulo 2^16).
341 uint16_t NegLo = -static_cast<uint16_t>(Imm);
342 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
343 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
344
345 if (tryFoldToInline(NegImm)) {
346 unsigned NegOpcode =
347 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
348 MI->setDesc(TII->get(NegOpcode));
349 return true;
350 }
351 }
352 }
353
354 return false;
355}
356
// Applies a recorded fold candidate to its use instruction. Depending on the
// candidate this rewrites the operand to an immediate, global address, frame
// index or another virtual register, and may replace the instruction with a
// shrunk form. Returns true if the instruction was updated, false if the fold
// had to be abandoned.
// NOTE(review): the declaration of `New` used in the op_sel branch is missing
// from this listing (one line is absent there) -- consult the full file.
357bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
358 MachineInstr *MI = Fold.UseMI;
359 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
360 assert(Old.isReg());
361
362 if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
363 if (tryFoldImmWithOpSel(Fold))
364 return true;
365
366 // We can't represent the candidate as an inline constant. Try as a literal
367 // with the original opsel, checking constant bus limitations.
369 int OpNo = MI->getOperandNo(&Old);
370 if (!TII->isOperandLegal(*MI, OpNo, &New))
371 return false;
372 Old.ChangeToImmediate(Fold.ImmToFold);
373 return true;
374 }
375
 // Shrinking path: the use is replaced by the instruction built by
 // buildShrunkInst() for the candidate's shrink opcode. It is only taken
 // when VCC is reported dead at MI, since the carry result is copied out of
 // VCC below.
376 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
377 MachineBasicBlock *MBB = MI->getParent();
378 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
379 if (Liveness != MachineBasicBlock::LQR_Dead) {
 // NOTE(review): MI is a pointer here, so this streams the pointer value
 // rather than the instruction; `*MI` was probably intended -- confirm.
380 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
381 return false;
382 }
383
384 int Op32 = Fold.ShrinkOpcode;
385 MachineOperand &Dst0 = MI->getOperand(0);
386 MachineOperand &Dst1 = MI->getOperand(1);
387 assert(Dst0.isDef() && Dst1.isDef());
388
 // Sample the carry-out's uses before MI is rewritten below.
389 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
390
391 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
392 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
393
394 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
395
 // If the carry-out has non-debug users, define the old carry register as a
 // copy of VCC.
396 if (HaveNonDbgCarryUse) {
397 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
398 Dst1.getReg())
399 .addReg(AMDGPU::VCC, RegState::Kill);
400 }
401
402 // Keep the old instruction around to avoid breaking iterators, but
403 // replace it with a dummy instruction to remove uses.
404 //
405 // FIXME: We should not invert how this pass looks at operands to avoid
406 // this. Should track set of foldable movs instead of looking for uses
407 // when looking at a use.
408 Dst0.setReg(NewReg0);
409 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
410 MI->removeOperand(I);
411 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
412
 // Candidates recorded as commuted get the shrunk instruction commuted too.
413 if (Fold.Commuted)
414 TII->commuteInstruction(*Inst32, false);
415 return true;
416 }
417
418 assert(!Fold.needsShrink() && "not handled");
419
420 if (Fold.isImm()) {
 // A tied operand cannot simply become an immediate: switch to the opcode
 // returned by getMFMAEarlyClobberOp() and untie operand 0, or give up if
 // there is no such opcode.
421 if (Old.isTied()) {
422 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
423 if (NewMFMAOpc == -1)
424 return false;
425 MI->setDesc(TII->get(NewMFMAOpc));
426 MI->untieRegOperand(0);
427 }
428 Old.ChangeToImmediate(Fold.ImmToFold);
429 return true;
430 }
431
432 if (Fold.isGlobal()) {
433 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
434 Fold.OpToFold->getTargetFlags());
435 return true;
436 }
437
438 if (Fold.isFI()) {
439 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
440 return true;
441 }
442
 // Remaining case: register fold. Substitute the source register/subregister
 // and carry over its undef flag.
443 MachineOperand *New = Fold.OpToFold;
444 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
445 Old.setIsUndef(New->isUndef());
446 return true;
447}
448
450 const MachineInstr *MI) {
451 return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
452}
453
455 MachineInstr *MI, unsigned OpNo,
456 MachineOperand *FoldOp, bool Commuted = false,
457 int ShrinkOp = -1) {
458 // Skip additional folding on the same operand.
459 for (FoldCandidate &Fold : FoldList)
460 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
461 return;
462 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
463 << " operand " << OpNo << "\n " << *MI);
464 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
465}
466
// Tries to record a fold of OpToFold into operand OpNo of MI in FoldList.
// If the fold is not legal as-is, several rewrites of MI are attempted in
// order (mac->mad/fma, s_fmac->fmaak/fmamk, s_setreg->s_setreg_imm32,
// commuting the instruction); a rewrite is kept when it makes the fold
// possible. Returns true when the fold was accepted, false otherwise.
// Note that MI itself may be modified (opcode and/or operand order) by this
// function, and that the function calls itself recursively after some of the
// opcode rewrites.
467bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
468 MachineInstr *MI, unsigned OpNo,
469 MachineOperand *OpToFold) const {
470 const unsigned Opc = MI->getOpcode();
471
 // Helper: temporarily turn MI into S_FMAAK_F32 (when folding into operand 3)
 // or S_FMAMK_F32 (otherwise) and retry the fold on the literal slot of that
 // form. On failure the original opcode is restored.
472 auto tryToFoldAsFMAAKorMK = [&]() {
473 if (!OpToFold->isImm())
474 return false;
475
476 const bool TryAK = OpNo == 3;
477 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
478 MI->setDesc(TII->get(NewOpc));
479
480 // We have to fold into operand which would be Imm not into OpNo.
481 bool FoldAsFMAAKorMK =
482 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
483 if (FoldAsFMAAKorMK) {
484 // Untie Src2 of fmac.
485 MI->untieRegOperand(3);
486 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
487 if (OpNo == 1) {
488 MachineOperand &Op1 = MI->getOperand(1);
489 MachineOperand &Op2 = MI->getOperand(2);
490 Register OldReg = Op1.getReg();
491 // Operand 2 might be an inlinable constant
492 if (Op2.isImm()) {
493 Op1.ChangeToImmediate(Op2.getImm());
494 Op2.ChangeToRegister(OldReg, false);
495 } else {
496 Op1.setReg(Op2.getReg());
497 Op2.setReg(OldReg);
498 }
499 }
500 return true;
501 }
502 MI->setDesc(TII->get(Opc));
503 return false;
504 };
505
 // An immediate that is not directly legal may still be usable on packed
 // instructions via op_sel handling.
506 bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
507 if (!IsLegal && OpToFold->isImm()) {
508 FoldCandidate Fold(MI, OpNo, OpToFold);
509 IsLegal = canUseImmWithOpSel(Fold);
510 }
511
512 if (!IsLegal) {
513 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
514 unsigned NewOpc = macToMad(Opc);
515 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
516 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
517 // to fold the operand.
518 MI->setDesc(TII->get(NewOpc));
 // The replacement opcode may have an op_sel operand the original lacks;
 // append a zero op_sel for the trial and remove it again on failure.
519 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
520 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
521 if (AddOpSel)
522 MI->addOperand(MachineOperand::CreateImm(0));
523 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
524 if (FoldAsMAD) {
525 MI->untieRegOperand(OpNo);
526 return true;
527 }
528 if (AddOpSel)
529 MI->removeOperand(MI->getNumExplicitOperands() - 1);
530 MI->setDesc(TII->get(Opc));
531 }
532
533 // Special case for s_fmac_f32 if we are trying to fold into Src2.
534 // By transforming into fmaak we can untie Src2 and make folding legal.
535 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
536 if (tryToFoldAsFMAAKorMK())
537 return true;
538 }
539
540 // Special case for s_setreg_b32
541 if (OpToFold->isImm()) {
542 unsigned ImmOpc = 0;
543 if (Opc == AMDGPU::S_SETREG_B32)
544 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
545 else if (Opc == AMDGPU::S_SETREG_B32_mode)
546 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
547 if (ImmOpc) {
548 MI->setDesc(TII->get(ImmOpc));
549 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
550 return true;
551 }
552 }
553
554 // If we are already folding into another operand of MI, then
555 // we can't commute the instruction, otherwise we risk making the
556 // other fold illegal.
557 if (isUseMIInFoldList(FoldList, MI))
558 return false;
559
560 // Operand is not legal, so try to commute the instruction to
561 // see if this makes it possible to fold.
562 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
563 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
564 if (!CanCommute)
565 return false;
566
567 // One of operands might be an Imm operand, and OpNo may refer to it after
568 // the call of commuteInstruction() below. Such situations are avoided
569 // here explicitly as OpNo must be a register operand to be a candidate
570 // for memory folding.
571 if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
572 return false;
573
574 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
575 return false;
576
 // From here on MI is in commuted form. Op32 stays -1 unless the fold is
 // only possible by shrinking to the 32-bit encoding.
577 int Op32 = -1;
578 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
 // Still illegal after commuting: only carry-out add/sub with an
 // immediate-like operand can proceed (via shrinking); everything else is
 // commuted back and rejected.
579 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
580 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
581 (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
582 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
583 return false;
584 }
585
586 // Verify the other operand is a VGPR, otherwise we would violate the
587 // constant bus restriction.
 // NOTE(review): unlike the rejection just above, this early return does
 // not commute MI back, so MI is left in commuted form although the fold
 // is rejected -- confirm whether that is intended.
588 MachineOperand &OtherOp = MI->getOperand(OpNo);
589 if (!OtherOp.isReg() ||
590 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
591 return false;
592
593 assert(MI->getOperand(1).isDef());
594
595 // Make sure to get the 32-bit version of the commuted opcode.
596 unsigned MaybeCommutedOpc = MI->getOpcode();
597 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
598 }
599
600 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
601 return true;
602 }
603
604 // Inlineable constant might have been folded into Imm operand of fmaak or
605 // fmamk and we are trying to fold a non-inlinable constant.
606 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
607 !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
608 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
609 MachineOperand &OpImm = MI->getOperand(ImmIdx);
610 if (!OpImm.isReg() &&
611 TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
612 return tryToFoldAsFMAAKorMK();
613 }
614
615 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
616 // By changing into fmamk we can untie Src2.
617 // If folding for Src0 happens first and it is identical operand to Src1 we
618 // should avoid transforming into fmamk which requires commuting as it would
619 // cause folding into Src1 to fail later on due to wrong OpNo used.
620 if (Opc == AMDGPU::S_FMAC_F32 &&
621 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
622 if (tryToFoldAsFMAAKorMK())
623 return true;
624 }
625
626 // Check the case where we might introduce a second constant operand to a
627 // scalar instruction
628 if (TII->isSALU(MI->getOpcode())) {
629 const MCInstrDesc &InstDesc = MI->getDesc();
630 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
631
632 // Fine if the operand can be encoded as an inline constant
633 if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
634 // Otherwise check for another constant
 // Reject the fold if any other operand is already a non-register value
 // that is not an inline constant.
635 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
636 auto &Op = MI->getOperand(i);
637 if (OpNo != i && !Op.isReg() &&
638 !TII->isInlineConstant(Op, InstDesc.operands()[i]))
639 return false;
640 }
641 }
642 }
643
644 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
645 return true;
646}
647
648bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
649 const MachineOperand &UseMO) const {
650 // Operands of SDWA instructions must be registers.
651 return !TII->isSDWA(MI);
652}
653
654// Find a def of the UseReg, check if it is a reg_sequence and find initializers
655// for each subreg, tracking it to foldable inline immediate if possible.
656// Returns true on success.
657bool SIFoldOperands::getRegSeqInit(
658 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
659 Register UseReg, uint8_t OpTy) const {
660 MachineInstr *Def = MRI->getVRegDef(UseReg);
661 if (!Def || !Def->isRegSequence())
662 return false;
663
664 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
665 MachineOperand *Sub = &Def->getOperand(I);
666 assert(Sub->isReg());
667
668 for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
669 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
670 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
671 SubDef = MRI->getVRegDef(Sub->getReg())) {
672 MachineOperand *Op = &SubDef->getOperand(1);
673 if (Op->isImm()) {
674 if (TII->isInlineConstant(*Op, OpTy))
675 Sub = Op;
676 break;
677 }
678 if (!Op->isReg() || Op->getReg().isPhysical())
679 break;
680 Sub = Op;
681 }
682
683 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
684 }
685
686 return true;
687}
688
// Tries to fold an inline constant into operand UseOpIdx of UseMI. Three
// sources are considered in turn: OpToFold itself being an immediate, OpToFold
// being a virtual register defined by a foldable copy of an immediate, and
// OpToFold being defined by a REG_SEQUENCE whose inputs all resolve to the
// same immediate. The first two rewrite the use operand directly; the last one
// appends a candidate to FoldList. Returns true if a fold was made/recorded.
// NOTE(review): two lines are missing from this listing (the condition guarding
// the second `return false;` and the declaration of `Defs`) -- consult the
// full file.
689bool SIFoldOperands::tryToFoldACImm(
690 const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
691 SmallVectorImpl<FoldCandidate> &FoldList) const {
692 const MCInstrDesc &Desc = UseMI->getDesc();
 // Operands beyond the described ones have no operand-type info to consult.
693 if (UseOpIdx >= Desc.getNumOperands())
694 return false;
695
697 return false;
698
699 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
700 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
701 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
702 UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
703 return true;
704 }
705
706 if (!OpToFold.isReg())
707 return false;
708
709 Register UseReg = OpToFold.getReg();
710 if (!UseReg.isVirtual())
711 return false;
712
 // Do not touch an instruction that already has a pending fold recorded.
713 if (isUseMIInFoldList(FoldList, UseMI))
714 return false;
715
716 // Maybe it is just a COPY of an immediate itself.
717 MachineInstr *Def = MRI->getVRegDef(UseReg);
718 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
719 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
720 MachineOperand &DefOp = Def->getOperand(1);
721 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
722 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
723 UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
724 return true;
725 }
726 }
727
729 if (!getRegSeqInit(Defs, UseReg, OpTy))
730 return false;
731
 // Every REG_SEQUENCE input must be an immediate equal to the first one; the
 // first one must additionally be a legal inline constant for the use.
 // NOTE(review): Imm is int32_t while getImm() is 64-bit, so the comparison
 // below is against the truncated first value -- confirm this is intended.
732 int32_t Imm;
733 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
734 const MachineOperand *Op = Defs[I].first;
735 if (!Op->isImm())
736 return false;
737
738 auto SubImm = Op->getImm();
739 if (!I) {
740 Imm = SubImm;
741 if (!TII->isInlineConstant(*Op, OpTy) ||
742 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
743 return false;
744
745 continue;
746 }
747 if (Imm != SubImm)
748 return false; // Can only fold splat constants
749 }
750
751 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
752 return true;
753}
754
755void SIFoldOperands::foldOperand(
756 MachineOperand &OpToFold,
758 int UseOpIdx,
760 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
761 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
762
763 if (!isUseSafeToFold(*UseMI, *UseOp))
764 return;
765
766 // FIXME: Fold operands with subregs.
767 if (UseOp->isReg() && OpToFold.isReg() &&
768 (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
769 return;
770
771 // Special case for REG_SEQUENCE: We can't fold literals into
772 // REG_SEQUENCE instructions, so we have to fold them into the
773 // uses of REG_SEQUENCE.
774 if (UseMI->isRegSequence()) {
775 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
776 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
777
778 // Grab the use operands first
780 for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg))
781 UsesToProcess.push_back(&Use);
782 for (auto *RSUse : UsesToProcess) {
783 MachineInstr *RSUseMI = RSUse->getParent();
784
785 if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
786 RSUseMI->getOperandNo(RSUse), FoldList))
787 continue;
788
789 if (RSUse->getSubReg() != RegSeqDstSubReg)
790 continue;
791
792 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
793 CopiesToReplace);
794 }
795 return;
796 }
797
798 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
799 return;
800
801 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
802 // Verify that this is a stack access.
803 // FIXME: Should probably use stack pseudos before frame lowering.
804
805 if (TII->isMUBUF(*UseMI)) {
806 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
807 MFI->getScratchRSrcReg())
808 return;
809
810 // Ensure this is either relative to the current frame or the current
811 // wave.
812 MachineOperand &SOff =
813 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
814 if (!SOff.isImm() || SOff.getImm() != 0)
815 return;
816 }
817
818 // A frame index will resolve to a positive constant, so it should always be
819 // safe to fold the addressing mode, even pre-GFX9.
820 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
821
822 const unsigned Opc = UseMI->getOpcode();
823 if (TII->isFLATScratch(*UseMI) &&
824 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
825 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
826 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
827 UseMI->setDesc(TII->get(NewOpc));
828 }
829
830 return;
831 }
832
833 bool FoldingImmLike =
834 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
835
836 if (FoldingImmLike && UseMI->isCopy()) {
837 Register DestReg = UseMI->getOperand(0).getReg();
838 Register SrcReg = UseMI->getOperand(1).getReg();
839 assert(SrcReg.isVirtual());
840
841 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
842
843 // Don't fold into a copy to a physical register with the same class. Doing
844 // so would interfere with the register coalescer's logic which would avoid
845 // redundant initializations.
846 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
847 return;
848
849 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
850 if (!DestReg.isPhysical()) {
851 if (DestRC == &AMDGPU::AGPR_32RegClass &&
852 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
853 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
855 CopiesToReplace.push_back(UseMI);
856 return;
857 }
858 }
859
860 // In order to fold immediates into copies, we need to change the
861 // copy to a MOV.
862
863 unsigned MovOp = TII->getMovOpcode(DestRC);
864 if (MovOp == AMDGPU::COPY)
865 return;
866
869 while (ImpOpI != ImpOpE) {
870 MachineInstr::mop_iterator Tmp = ImpOpI;
871 ImpOpI++;
873 }
874 UseMI->setDesc(TII->get(MovOp));
875
876 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
877 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
878 MachineOperand NewSrcOp(SrcOp);
881 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
882 UseMI->addOperand(NewSrcOp); // src0
883 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
884 UseOpIdx = 2;
885 UseOp = &UseMI->getOperand(UseOpIdx);
886 }
887 CopiesToReplace.push_back(UseMI);
888 } else {
889 if (UseMI->isCopy() && OpToFold.isReg() &&
891 !UseMI->getOperand(1).getSubReg()) {
892 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
893 unsigned Size = TII->getOpSize(*UseMI, 1);
894 Register UseReg = OpToFold.getReg();
896 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
897 UseMI->getOperand(1).setIsKill(false);
898 CopiesToReplace.push_back(UseMI);
899 OpToFold.setIsKill(false);
900
901 // Remove kill flags as kills may now be out of order with uses.
902 MRI->clearKillFlags(OpToFold.getReg());
903
904 // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
905 // can only accept VGPR or inline immediate. Recreate a reg_sequence with
906 // its initializers right here, so we will rematerialize immediates and
907 // avoid copies via different reg classes.
909 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
910 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
911 const DebugLoc &DL = UseMI->getDebugLoc();
913
914 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
915 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
917
921 for (unsigned I = 0; I < Size / 4; ++I) {
922 MachineOperand *Def = Defs[I].first;
924 if (Def->isImm() &&
925 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
926 int64_t Imm = Def->getImm();
927
928 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
930 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
931 B.addReg(Tmp);
932 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
933 auto Src = getRegSubRegPair(*Def);
934 Def->setIsKill(false);
935 if (!SeenAGPRs.insert(Src)) {
936 // We cannot build a reg_sequence out of the same registers, they
937 // must be copied. Better do it here before copyPhysReg() created
938 // several reads to do the AGPR->VGPR->AGPR copy.
939 CopyToVGPR = Src;
940 } else {
941 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
942 Src.SubReg);
943 }
944 } else {
945 assert(Def->isReg());
946 Def->setIsKill(false);
947 auto Src = getRegSubRegPair(*Def);
948
949 // Direct copy from SGPR to AGPR is not possible. To avoid creation
950 // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
951 // create a copy here and track if we already have such a copy.
952 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
953 CopyToVGPR = Src;
954 } else {
955 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
956 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
957 B.addReg(Tmp);
958 }
959 }
960
961 if (CopyToVGPR.Reg) {
962 Register Vgpr;
963 if (VGPRCopies.count(CopyToVGPR)) {
964 Vgpr = VGPRCopies[CopyToVGPR];
965 } else {
966 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
967 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
968 VGPRCopies[CopyToVGPR] = Vgpr;
969 }
970 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
972 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
973 B.addReg(Tmp);
974 }
975
976 B.addImm(Defs[I].second);
977 }
978 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
979 return;
980 }
981
982 if (Size != 4)
983 return;
984
985 Register Reg0 = UseMI->getOperand(0).getReg();
986 Register Reg1 = UseMI->getOperand(1).getReg();
987 if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
988 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
989 else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
990 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
991 else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
992 TRI->isAGPR(*MRI, Reg1))
993 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
994 return;
995 }
996
997 unsigned UseOpc = UseMI->getOpcode();
998 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
999 (UseOpc == AMDGPU::V_READLANE_B32 &&
1000 (int)UseOpIdx ==
1001 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1002 // %vgpr = V_MOV_B32 imm
1003 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1004 // =>
1005 // %sgpr = S_MOV_B32 imm
1006 if (FoldingImmLike) {
1008 UseMI->getOperand(UseOpIdx).getReg(),
1009 *OpToFold.getParent(),
1010 *UseMI))
1011 return;
1012
1013 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1014
1015 if (OpToFold.isImm())
1016 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1017 else
1019 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1020 return;
1021 }
1022
1023 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1025 UseMI->getOperand(UseOpIdx).getReg(),
1026 *OpToFold.getParent(),
1027 *UseMI))
1028 return;
1029
1030 // %vgpr = COPY %sgpr0
1031 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1032 // =>
1033 // %sgpr1 = COPY %sgpr0
1034 UseMI->setDesc(TII->get(AMDGPU::COPY));
1035 UseMI->getOperand(1).setReg(OpToFold.getReg());
1036 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1037 UseMI->getOperand(1).setIsKill(false);
1038 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1039 return;
1040 }
1041 }
1042
1043 const MCInstrDesc &UseDesc = UseMI->getDesc();
1044
1045 // Don't fold into target independent nodes. Target independent opcodes
1046 // don't have defined register classes.
1047 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1048 UseDesc.operands()[UseOpIdx].RegClass == -1)
1049 return;
1050 }
1051
1052 if (!FoldingImmLike) {
1053 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
1054 // Don't fold if OpToFold doesn't hold an aligned register.
1055 const TargetRegisterClass *RC =
1056 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
1057 assert(RC);
1058 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
1059 unsigned SubReg = OpToFold.getSubReg();
1060 if (const TargetRegisterClass *SubRC =
1061 TRI->getSubRegisterClass(RC, SubReg))
1062 RC = SubRC;
1063 }
1064
1065 if (!RC || !TRI->isProperlyAlignedRC(*RC))
1066 return;
1067 }
1068
1069 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1070
1071 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1072 // to enable more folding opportunities. The shrink operands pass
1073 // already does this.
1074 return;
1075 }
1076
1077
1078 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
1079 const TargetRegisterClass *FoldRC =
1080 TRI->getRegClass(FoldDesc.operands()[0].RegClass);
1081
1082 // Split 64-bit constants into 32-bits for folding.
1083 if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
1084 Register UseReg = UseOp->getReg();
1085 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
1086 if (AMDGPU::getRegBitWidth(*UseRC) != 64)
1087 return;
1088
1089 APInt Imm(64, OpToFold.getImm());
1090 if (UseOp->getSubReg() == AMDGPU::sub0) {
1091 Imm = Imm.getLoBits(32);
1092 } else {
1093 assert(UseOp->getSubReg() == AMDGPU::sub1);
1094 Imm = Imm.getHiBits(32);
1095 }
1096
1097 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
1098 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
1099 return;
1100 }
1101
1102 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1103}
1104
1105static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1106 uint32_t LHS, uint32_t RHS) {
1107 switch (Opcode) {
1108 case AMDGPU::V_AND_B32_e64:
1109 case AMDGPU::V_AND_B32_e32:
1110 case AMDGPU::S_AND_B32:
1111 Result = LHS & RHS;
1112 return true;
1113 case AMDGPU::V_OR_B32_e64:
1114 case AMDGPU::V_OR_B32_e32:
1115 case AMDGPU::S_OR_B32:
1116 Result = LHS | RHS;
1117 return true;
1118 case AMDGPU::V_XOR_B32_e64:
1119 case AMDGPU::V_XOR_B32_e32:
1120 case AMDGPU::S_XOR_B32:
1121 Result = LHS ^ RHS;
1122 return true;
1123 case AMDGPU::S_XNOR_B32:
1124 Result = ~(LHS ^ RHS);
1125 return true;
1126 case AMDGPU::S_NAND_B32:
1127 Result = ~(LHS & RHS);
1128 return true;
1129 case AMDGPU::S_NOR_B32:
1130 Result = ~(LHS | RHS);
1131 return true;
1132 case AMDGPU::S_ANDN2_B32:
1133 Result = LHS & ~RHS;
1134 return true;
1135 case AMDGPU::S_ORN2_B32:
1136 Result = LHS | ~RHS;
1137 return true;
1138 case AMDGPU::V_LSHL_B32_e64:
1139 case AMDGPU::V_LSHL_B32_e32:
1140 case AMDGPU::S_LSHL_B32:
1141 // The instruction ignores the high bits for out of bounds shifts.
1142 Result = LHS << (RHS & 31);
1143 return true;
1144 case AMDGPU::V_LSHLREV_B32_e64:
1145 case AMDGPU::V_LSHLREV_B32_e32:
1146 Result = RHS << (LHS & 31);
1147 return true;
1148 case AMDGPU::V_LSHR_B32_e64:
1149 case AMDGPU::V_LSHR_B32_e32:
1150 case AMDGPU::S_LSHR_B32:
1151 Result = LHS >> (RHS & 31);
1152 return true;
1153 case AMDGPU::V_LSHRREV_B32_e64:
1154 case AMDGPU::V_LSHRREV_B32_e32:
1155 Result = RHS >> (LHS & 31);
1156 return true;
1157 case AMDGPU::V_ASHR_I32_e64:
1158 case AMDGPU::V_ASHR_I32_e32:
1159 case AMDGPU::S_ASHR_I32:
1160 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1161 return true;
1162 case AMDGPU::V_ASHRREV_I32_e64:
1163 case AMDGPU::V_ASHRREV_I32_e32:
1164 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1165 return true;
1166 default:
1167 return false;
1168 }
1169}
1170
1171static unsigned getMovOpc(bool IsScalar) {
1172 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1173}
1174
1175static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1176 MI.setDesc(NewDesc);
1177
1178 // Remove any leftover implicit operands from mutating the instruction. e.g.
1179 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1180 // anymore.
1181 const MCInstrDesc &Desc = MI.getDesc();
1182 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1183 Desc.implicit_defs().size();
1184
1185 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1186 MI.removeOperand(I);
1187}
1188
1190SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
1191 // If this has a subregister, it obviously is a register source.
1192 if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
1193 !Op.getReg().isVirtual())
1194 return &Op;
1195
1196 MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1197 if (Def && Def->isMoveImmediate()) {
1198 MachineOperand &ImmSrc = Def->getOperand(1);
1199 if (ImmSrc.isImm())
1200 return &ImmSrc;
1201 }
1202
1203 return &Op;
1204}
1205
1206// Try to simplify operations with a constant that may appear after instruction
1207// selection.
1208// TODO: See if a frame index with a fixed offset can fold.
1209bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
1210 if (!MI->allImplicitDefsAreDead())
1211 return false;
1212
1213 unsigned Opc = MI->getOpcode();
1214
1215 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1216 if (Src0Idx == -1)
1217 return false;
1218 MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));
1219
1220 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1221 Opc == AMDGPU::S_NOT_B32) &&
1222 Src0->isImm()) {
1223 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1224 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1225 return true;
1226 }
1227
1228 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1229 if (Src1Idx == -1)
1230 return false;
1231 MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));
1232
1233 if (!Src0->isImm() && !Src1->isImm())
1234 return false;
1235
1236 // and k0, k1 -> v_mov_b32 (k0 & k1)
1237 // or k0, k1 -> v_mov_b32 (k0 | k1)
1238 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1239 if (Src0->isImm() && Src1->isImm()) {
1240 int32_t NewImm;
1241 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1242 return false;
1243
1244 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1245
1246 // Be careful to change the right operand, src0 may belong to a different
1247 // instruction.
1248 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1249 MI->removeOperand(Src1Idx);
1250 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1251 return true;
1252 }
1253
1254 if (!MI->isCommutable())
1255 return false;
1256
1257 if (Src0->isImm() && !Src1->isImm()) {
1258 std::swap(Src0, Src1);
1259 std::swap(Src0Idx, Src1Idx);
1260 }
1261
1262 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1263 if (Opc == AMDGPU::V_OR_B32_e64 ||
1264 Opc == AMDGPU::V_OR_B32_e32 ||
1265 Opc == AMDGPU::S_OR_B32) {
1266 if (Src1Val == 0) {
1267 // y = or x, 0 => y = copy x
1268 MI->removeOperand(Src1Idx);
1269 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1270 } else if (Src1Val == -1) {
1271 // y = or x, -1 => y = v_mov_b32 -1
1272 MI->removeOperand(Src1Idx);
1273 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1274 } else
1275 return false;
1276
1277 return true;
1278 }
1279
1280 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1281 Opc == AMDGPU::S_AND_B32) {
1282 if (Src1Val == 0) {
1283 // y = and x, 0 => y = v_mov_b32 0
1284 MI->removeOperand(Src0Idx);
1285 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1286 } else if (Src1Val == -1) {
1287 // y = and x, -1 => y = copy x
1288 MI->removeOperand(Src1Idx);
1289 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1290 } else
1291 return false;
1292
1293 return true;
1294 }
1295
1296 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1297 Opc == AMDGPU::S_XOR_B32) {
1298 if (Src1Val == 0) {
1299 // y = xor x, 0 => y = copy x
1300 MI->removeOperand(Src1Idx);
1301 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1302 return true;
1303 }
1304 }
1305
1306 return false;
1307}
1308
1309// Try to fold an instruction into a simpler one
1310bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
1311 unsigned Opc = MI.getOpcode();
1312 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1313 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1314 return false;
1315
1316 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1317 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1318 if (!Src1->isIdenticalTo(*Src0)) {
1319 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1320 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1321 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1322 return false;
1323 }
1324
1325 int Src1ModIdx =
1326 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1327 int Src0ModIdx =
1328 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1329 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1330 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1331 return false;
1332
1333 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1334 auto &NewDesc =
1335 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1336 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1337 if (Src2Idx != -1)
1338 MI.removeOperand(Src2Idx);
1339 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1340 if (Src1ModIdx != -1)
1341 MI.removeOperand(Src1ModIdx);
1342 if (Src0ModIdx != -1)
1343 MI.removeOperand(Src0ModIdx);
1344 mutateCopyOp(MI, NewDesc);
1345 LLVM_DEBUG(dbgs() << MI);
1346 return true;
1347}
1348
1349bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
1350 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1351 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1352 return false;
1353
1354 MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
1355 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1356 return false;
1357
1358 Register Src1 = MI.getOperand(2).getReg();
1359 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1360 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1361 return false;
1362
1363 Register Dst = MI.getOperand(0).getReg();
1364 MRI->replaceRegWith(Dst, Src1);
1365 if (!MI.getOperand(2).isKill())
1366 MRI->clearKillFlags(Src1);
1367 MI.eraseFromParent();
1368 return true;
1369}
1370
1371bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
1372 MachineOperand &OpToFold) const {
1373 // We need mutate the operands of new mov instructions to add implicit
1374 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1375 // this.
1376 SmallVector<MachineInstr *, 4> CopiesToReplace;
1378 MachineOperand &Dst = MI.getOperand(0);
1379 bool Changed = false;
1380
1381 if (OpToFold.isImm()) {
1382 for (auto &UseMI :
1383 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1384 // Folding the immediate may reveal operations that can be constant
1385 // folded or replaced with a copy. This can happen for example after
1386 // frame indices are lowered to constants or from splitting 64-bit
1387 // constants.
1388 //
1389 // We may also encounter cases where one or both operands are
1390 // immediates materialized into a register, which would ordinarily not
1391 // be folded due to multiple uses or operand constraints.
1392 if (tryConstantFoldOp(&UseMI)) {
1393 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1394 Changed = true;
1395 }
1396 }
1397 }
1398
1400 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1401 UsesToProcess.push_back(&Use);
1402 for (auto *U : UsesToProcess) {
1403 MachineInstr *UseMI = U->getParent();
1404 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1405 CopiesToReplace);
1406 }
1407
1408 if (CopiesToReplace.empty() && FoldList.empty())
1409 return Changed;
1410
1411 MachineFunction *MF = MI.getParent()->getParent();
1412 // Make sure we add EXEC uses to any new v_mov instructions created.
1413 for (MachineInstr *Copy : CopiesToReplace)
1414 Copy->addImplicitDefUseOperands(*MF);
1415
1416 for (FoldCandidate &Fold : FoldList) {
1417 assert(!Fold.isReg() || Fold.OpToFold);
1418 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1419 Register Reg = Fold.OpToFold->getReg();
1420 MachineInstr *DefMI = Fold.OpToFold->getParent();
1421 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1422 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1423 continue;
1424 }
1425 if (updateOperand(Fold)) {
1426 // Clear kill flags.
1427 if (Fold.isReg()) {
1428 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1429 // FIXME: Probably shouldn't bother trying to fold if not an
1430 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1431 // copies.
1432 MRI->clearKillFlags(Fold.OpToFold->getReg());
1433 }
1434 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1435 << static_cast<int>(Fold.UseOpNo) << " of "
1436 << *Fold.UseMI);
1437 } else if (Fold.Commuted) {
1438 // Restoring instruction's original operand order if fold has failed.
1439 TII->commuteInstruction(*Fold.UseMI, false);
1440 }
1441 }
1442 return true;
1443}
1444
1445bool SIFoldOperands::tryFoldFoldableCopy(
1446 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1447 // Specially track simple redefs of m0 to the same value in a block, so we
1448 // can erase the later ones.
1449 if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1450 MachineOperand &NewM0Val = MI.getOperand(1);
1451 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1452 MI.eraseFromParent();
1453 return true;
1454 }
1455
1456 // We aren't tracking other physical registers
1457 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1458 ? nullptr
1459 : &NewM0Val;
1460 return false;
1461 }
1462
1463 MachineOperand *OpToFoldPtr;
1464 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
1465 // Folding when any src_modifiers are non-zero is unsupported
1466 if (TII->hasAnyModifiersSet(MI))
1467 return false;
1468 OpToFoldPtr = &MI.getOperand(2);
1469 } else
1470 OpToFoldPtr = &MI.getOperand(1);
1471 MachineOperand &OpToFold = *OpToFoldPtr;
1472 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1473
1474 // FIXME: We could also be folding things like TargetIndexes.
1475 if (!FoldingImm && !OpToFold.isReg())
1476 return false;
1477
1478 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1479 return false;
1480
1481 // Prevent folding operands backwards in the function. For example,
1482 // the COPY opcode must not be replaced by 1 in this example:
1483 //
1484 // %3 = COPY %vgpr0; VGPR_32:%3
1485 // ...
1486 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1487 if (!MI.getOperand(0).getReg().isVirtual())
1488 return false;
1489
1490 bool Changed = foldInstOperand(MI, OpToFold);
1491
1492 // If we managed to fold all uses of this copy then we might as well
1493 // delete it now.
1494 // The only reason we need to follow chains of copies here is that
1495 // tryFoldRegSequence looks forward through copies before folding a
1496 // REG_SEQUENCE into its eventual users.
1497 auto *InstToErase = &MI;
1498 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1499 auto &SrcOp = InstToErase->getOperand(1);
1500 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1501 InstToErase->eraseFromParent();
1502 Changed = true;
1503 InstToErase = nullptr;
1504 if (!SrcReg || SrcReg.isPhysical())
1505 break;
1506 InstToErase = MRI->getVRegDef(SrcReg);
1507 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1508 break;
1509 }
1510
1511 if (InstToErase && InstToErase->isRegSequence() &&
1512 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1513 InstToErase->eraseFromParent();
1514 Changed = true;
1515 }
1516
1517 return Changed;
1518}
1519
1520// Clamp patterns are canonically selected to v_max_* instructions, so only
1521// handle them.
1522const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1523 unsigned Op = MI.getOpcode();
1524 switch (Op) {
1525 case AMDGPU::V_MAX_F32_e64:
1526 case AMDGPU::V_MAX_F16_e64:
1527 case AMDGPU::V_MAX_F16_t16_e64:
1528 case AMDGPU::V_MAX_F16_fake16_e64:
1529 case AMDGPU::V_MAX_F64_e64:
1530 case AMDGPU::V_MAX_NUM_F64_e64:
1531 case AMDGPU::V_PK_MAX_F16: {
1532 if (MI.mayRaiseFPException())
1533 return nullptr;
1534
1535 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1536 return nullptr;
1537
1538 // Make sure sources are identical.
1539 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1540 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1541 if (!Src0->isReg() || !Src1->isReg() ||
1542 Src0->getReg() != Src1->getReg() ||
1543 Src0->getSubReg() != Src1->getSubReg() ||
1544 Src0->getSubReg() != AMDGPU::NoSubRegister)
1545 return nullptr;
1546
1547 // Can't fold up if we have modifiers.
1548 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1549 return nullptr;
1550
1551 unsigned Src0Mods
1552 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1553 unsigned Src1Mods
1554 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1555
1556 // Having a 0 op_sel_hi would require swizzling the output in the source
1557 // instruction, which we can't do.
1558 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1559 : 0u;
1560 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1561 return nullptr;
1562 return Src0;
1563 }
1564 default:
1565 return nullptr;
1566 }
1567}
1568
1569// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1570bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1571 const MachineOperand *ClampSrc = isClamp(MI);
1572 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1573 return false;
1574
1575 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1576
1577 // The type of clamp must be compatible.
1578 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1579 return false;
1580
1581 if (Def->mayRaiseFPException())
1582 return false;
1583
1584 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1585 if (!DefClamp)
1586 return false;
1587
1588 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1589
1590 // Clamp is applied after omod, so it is OK if omod is set.
1591 DefClamp->setImm(1);
1592
1593 Register DefReg = Def->getOperand(0).getReg();
1594 Register MIDstReg = MI.getOperand(0).getReg();
1595 if (TRI->isSGPRReg(*MRI, DefReg)) {
1596 // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
1597 // instruction with a VGPR dst.
1598 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
1599 MIDstReg)
1600 .addReg(DefReg);
1601 } else {
1602 MRI->replaceRegWith(MIDstReg, DefReg);
1603 }
1604 MI.eraseFromParent();
1605
1606 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1607 // instruction, so we might as well convert it to the more flexible VOP3-only
1608 // mad/fma form.
1609 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1610 Def->eraseFromParent();
1611
1612 return true;
1613}
1614
1615static int getOModValue(unsigned Opc, int64_t Val) {
1616 switch (Opc) {
1617 case AMDGPU::V_MUL_F64_e64:
1618 case AMDGPU::V_MUL_F64_pseudo_e64: {
1619 switch (Val) {
1620 case 0x3fe0000000000000: // 0.5
1621 return SIOutMods::DIV2;
1622 case 0x4000000000000000: // 2.0
1623 return SIOutMods::MUL2;
1624 case 0x4010000000000000: // 4.0
1625 return SIOutMods::MUL4;
1626 default:
1627 return SIOutMods::NONE;
1628 }
1629 }
1630 case AMDGPU::V_MUL_F32_e64: {
1631 switch (static_cast<uint32_t>(Val)) {
1632 case 0x3f000000: // 0.5
1633 return SIOutMods::DIV2;
1634 case 0x40000000: // 2.0
1635 return SIOutMods::MUL2;
1636 case 0x40800000: // 4.0
1637 return SIOutMods::MUL4;
1638 default:
1639 return SIOutMods::NONE;
1640 }
1641 }
1642 case AMDGPU::V_MUL_F16_e64:
1643 case AMDGPU::V_MUL_F16_t16_e64:
1644 case AMDGPU::V_MUL_F16_fake16_e64: {
1645 switch (static_cast<uint16_t>(Val)) {
1646 case 0x3800: // 0.5
1647 return SIOutMods::DIV2;
1648 case 0x4000: // 2.0
1649 return SIOutMods::MUL2;
1650 case 0x4400: // 4.0
1651 return SIOutMods::MUL4;
1652 default:
1653 return SIOutMods::NONE;
1654 }
1655 }
1656 default:
1657 llvm_unreachable("invalid mul opcode");
1658 }
1659}
1660
1661// FIXME: Does this really not support denormals with f16?
1662// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1663// handled, so will anything other than that break?
1664std::pair<const MachineOperand *, int>
1665SIFoldOperands::isOMod(const MachineInstr &MI) const {
1666 unsigned Op = MI.getOpcode();
1667 switch (Op) {
1668 case AMDGPU::V_MUL_F64_e64:
1669 case AMDGPU::V_MUL_F64_pseudo_e64:
1670 case AMDGPU::V_MUL_F32_e64:
1671 case AMDGPU::V_MUL_F16_t16_e64:
1672 case AMDGPU::V_MUL_F16_fake16_e64:
1673 case AMDGPU::V_MUL_F16_e64: {
1674 // If output denormals are enabled, omod is ignored.
1675 if ((Op == AMDGPU::V_MUL_F32_e64 &&
1676 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1677 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1678 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
1679 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1680 MFI->getMode().FP64FP16Denormals.Output !=
1682 MI.mayRaiseFPException())
1683 return std::pair(nullptr, SIOutMods::NONE);
1684
1685 const MachineOperand *RegOp = nullptr;
1686 const MachineOperand *ImmOp = nullptr;
1687 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1688 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1689 if (Src0->isImm()) {
1690 ImmOp = Src0;
1691 RegOp = Src1;
1692 } else if (Src1->isImm()) {
1693 ImmOp = Src1;
1694 RegOp = Src0;
1695 } else
1696 return std::pair(nullptr, SIOutMods::NONE);
1697
1698 int OMod = getOModValue(Op, ImmOp->getImm());
1699 if (OMod == SIOutMods::NONE ||
1700 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1701 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1702 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1703 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1704 return std::pair(nullptr, SIOutMods::NONE);
1705
1706 return std::pair(RegOp, OMod);
1707 }
1708 case AMDGPU::V_ADD_F64_e64:
1709 case AMDGPU::V_ADD_F64_pseudo_e64:
1710 case AMDGPU::V_ADD_F32_e64:
1711 case AMDGPU::V_ADD_F16_e64:
1712 case AMDGPU::V_ADD_F16_t16_e64:
1713 case AMDGPU::V_ADD_F16_fake16_e64: {
1714 // If output denormals are enabled, omod is ignored.
1715 if ((Op == AMDGPU::V_ADD_F32_e64 &&
1716 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1717 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1718 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
1719 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1720 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1721 return std::pair(nullptr, SIOutMods::NONE);
1722
1723 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1724 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1725 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1726
1727 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1728 Src0->getSubReg() == Src1->getSubReg() &&
1729 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1730 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1731 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1732 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1733 return std::pair(Src0, SIOutMods::MUL2);
1734
1735 return std::pair(nullptr, SIOutMods::NONE);
1736 }
1737 default:
1738 return std::pair(nullptr, SIOutMods::NONE);
1739 }
1740}
1741
1742// FIXME: Does this need to check IEEE bit on function?
1743bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1744 const MachineOperand *RegOp;
1745 int OMod;
1746 std::tie(RegOp, OMod) = isOMod(MI);
1747 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1748 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1749 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1750 return false;
1751
1752 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1753 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1754 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1755 return false;
1756
1757 if (Def->mayRaiseFPException())
1758 return false;
1759
1760 // Clamp is applied after omod. If the source already has clamp set, don't
1761 // fold it.
1762 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1763 return false;
1764
1765 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1766
1767 DefOMod->setImm(OMod);
1768 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1769 MI.eraseFromParent();
1770
1771 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1772 // instruction, so we might as well convert it to the more flexible VOP3-only
1773 // mad/fma form.
1774 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1775 Def->eraseFromParent();
1776
1777 return true;
1778}
1779
1780// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1781// instruction which can take an agpr. So far that means a store.
1782bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1783 assert(MI.isRegSequence());
1784 auto Reg = MI.getOperand(0).getReg();
1785
1786 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1787 !MRI->hasOneNonDBGUse(Reg))
1788 return false;
1789
1791 if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1792 return false;
1793
1794 for (auto &[Op, SubIdx] : Defs) {
1795 if (!Op->isReg())
1796 return false;
1797 if (TRI->isAGPR(*MRI, Op->getReg()))
1798 continue;
1799 // Maybe this is a COPY from AREG
1800 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1801 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1802 return false;
1803 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1804 return false;
1805 }
1806
1807 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1808 MachineInstr *UseMI = Op->getParent();
1809 while (UseMI->isCopy() && !Op->getSubReg()) {
1810 Reg = UseMI->getOperand(0).getReg();
1811 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1812 return false;
1813 Op = &*MRI->use_nodbg_begin(Reg);
1814 UseMI = Op->getParent();
1815 }
1816
1817 if (Op->getSubReg())
1818 return false;
1819
1820 unsigned OpIdx = Op - &UseMI->getOperand(0);
1821 const MCInstrDesc &InstDesc = UseMI->getDesc();
1822 const TargetRegisterClass *OpRC =
1823 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1824 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1825 return false;
1826
1827 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1828 auto Dst = MRI->createVirtualRegister(NewDstRC);
1829 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1830 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1831
1832 for (auto &[Def, SubIdx] : Defs) {
1833 Def->setIsKill(false);
1834 if (TRI->isAGPR(*MRI, Def->getReg())) {
1835 RS.add(*Def);
1836 } else { // This is a copy
1837 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1838 SubDef->getOperand(1).setIsKill(false);
1839 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1840 }
1841 RS.addImm(SubIdx);
1842 }
1843
1844 Op->setReg(Dst);
1845 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1846 Op->setReg(Reg);
1847 RS->eraseFromParent();
1848 return false;
1849 }
1850
1851 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1852
1853 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1854 // in which case we can erase them all later in runOnMachineFunction.
1855 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1856 MI.eraseFromParent();
1857 return true;
1858}
1859
1860/// Checks whether \p Copy is a AGPR -> VGPR copy. Returns `true` on success and
1861/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
1862static bool isAGPRCopy(const SIRegisterInfo &TRI,
1863 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1864 Register &OutReg, unsigned &OutSubReg) {
1865 assert(Copy.isCopy());
1866
1867 const MachineOperand &CopySrc = Copy.getOperand(1);
1868 Register CopySrcReg = CopySrc.getReg();
1869 if (!CopySrcReg.isVirtual())
1870 return false;
1871
1872 // Common case: copy from AGPR directly, e.g.
1873 // %1:vgpr_32 = COPY %0:agpr_32
1874 if (TRI.isAGPR(MRI, CopySrcReg)) {
1875 OutReg = CopySrcReg;
1876 OutSubReg = CopySrc.getSubReg();
1877 return true;
1878 }
1879
1880 // Sometimes it can also involve two copies, e.g.
1881 // %1:vgpr_256 = COPY %0:agpr_256
1882 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
1883 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
1884 if (!CopySrcDef || !CopySrcDef->isCopy())
1885 return false;
1886
1887 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
1888 Register OtherCopySrcReg = OtherCopySrc.getReg();
1889 if (!OtherCopySrcReg.isVirtual() ||
1890 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
1891 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
1892 !TRI.isAGPR(MRI, OtherCopySrcReg))
1893 return false;
1894
1895 OutReg = OtherCopySrcReg;
1896 OutSubReg = CopySrc.getSubReg();
1897 return true;
1898}
1899
1900// Try to hoist an AGPR to VGPR copy across a PHI.
1901// This should allow folding of an AGPR into a consumer which may support it.
1902//
1903// Example 1: LCSSA PHI
1904// loop:
1905// %1:vreg = COPY %0:areg
1906// exit:
1907// %2:vreg = PHI %1:vreg, %loop
1908// =>
1909// loop:
1910// exit:
1911// %1:areg = PHI %0:areg, %loop
1912// %2:vreg = COPY %1:areg
1913//
1914// Example 2: PHI with multiple incoming values:
1915// entry:
1916// %1:vreg = GLOBAL_LOAD(..)
1917// loop:
1918// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
1919// %3:areg = COPY %2:vreg
1920// %4:areg = (instr using %3:areg)
1921// %5:vreg = COPY %4:areg
1922// =>
1923// entry:
1924// %1:vreg = GLOBAL_LOAD(..)
1925// %2:areg = COPY %1:vreg
1926// loop:
// %3:areg = PHI %2:areg, %entry, %X:areg, %loop
1928// %4:areg = (instr using %3:areg)
1929bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
1930 assert(PHI.isPHI());
1931
1932 Register PhiOut = PHI.getOperand(0).getReg();
1933 if (!TRI->isVGPR(*MRI, PhiOut))
1934 return false;
1935
1936 // Iterate once over all incoming values of the PHI to check if this PHI is
1937 // eligible, and determine the exact AGPR RC we'll target.
1938 const TargetRegisterClass *ARC = nullptr;
1939 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1940 MachineOperand &MO = PHI.getOperand(K);
1941 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
1942 if (!Copy || !Copy->isCopy())
1943 continue;
1944
1945 Register AGPRSrc;
1946 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1947 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
1948 continue;
1949
1950 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
1951 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1952 CopyInRC = SubRC;
1953
1954 if (ARC && !ARC->hasSubClassEq(CopyInRC))
1955 return false;
1956 ARC = CopyInRC;
1957 }
1958
1959 if (!ARC)
1960 return false;
1961
1962 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1963
1964 // Rewrite the PHI's incoming values to ARC.
1965 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
1966 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1967 MachineOperand &MO = PHI.getOperand(K);
1968 Register Reg = MO.getReg();
1969
1971 MachineBasicBlock *InsertMBB = nullptr;
1972
1973 // Look at the def of Reg, ignoring all copies.
1974 unsigned CopyOpc = AMDGPU::COPY;
1975 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
1976
1977 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
1978 // the copy was single-use, it will be removed by DCE later.
1979 if (Def->isCopy()) {
1980 Register AGPRSrc;
1981 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1982 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
1983 MO.setReg(AGPRSrc);
1984 MO.setSubReg(AGPRSubReg);
1985 continue;
1986 }
1987
1988 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
1989 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
1990 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
1991 // is unlikely to be profitable.
1992 //
1993 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
1994 MachineOperand &CopyIn = Def->getOperand(1);
1995 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
1996 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
1997 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1998 }
1999
2000 InsertMBB = Def->getParent();
2001 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2002 } else {
2003 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2004 InsertPt = InsertMBB->getFirstTerminator();
2005 }
2006
2007 Register NewReg = MRI->createVirtualRegister(ARC);
2008 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2009 TII->get(CopyOpc), NewReg)
2010 .addReg(Reg);
2011 MO.setReg(NewReg);
2012
2013 (void)MI;
2014 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2015 }
2016
2017 // Replace the PHI's result with a new register.
2018 Register NewReg = MRI->createVirtualRegister(ARC);
2019 PHI.getOperand(0).setReg(NewReg);
2020
2021 // COPY that new register back to the original PhiOut register. This COPY will
2022 // usually be folded out later.
2023 MachineBasicBlock *MBB = PHI.getParent();
2024 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2025 TII->get(AMDGPU::COPY), PhiOut)
2026 .addReg(NewReg);
2027
2028 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2029 return true;
2030}
2031
2032// Attempt to convert VGPR load to an AGPR load.
2033bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
2034 assert(MI.mayLoad());
2035 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2036 return false;
2037
2038 MachineOperand &Def = MI.getOperand(0);
2039 if (!Def.isDef())
2040 return false;
2041
2042 Register DefReg = Def.getReg();
2043
2044 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2045 return false;
2046
2048 SmallVector<Register, 8> MoveRegs;
2049 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
2050 Users.push_back(&I);
2051
2052 if (Users.empty())
2053 return false;
2054
2055 // Check that all uses a copy to an agpr or a reg_sequence producing an agpr.
2056 while (!Users.empty()) {
2057 const MachineInstr *I = Users.pop_back_val();
2058 if (!I->isCopy() && !I->isRegSequence())
2059 return false;
2060 Register DstReg = I->getOperand(0).getReg();
2061 // Physical registers may have more than one instruction definitions
2062 if (DstReg.isPhysical())
2063 return false;
2064 if (TRI->isAGPR(*MRI, DstReg))
2065 continue;
2066 MoveRegs.push_back(DstReg);
2067 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2068 Users.push_back(&U);
2069 }
2070
2071 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2072 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2073 if (!TII->isOperandLegal(MI, 0, &Def)) {
2074 MRI->setRegClass(DefReg, RC);
2075 return false;
2076 }
2077
2078 while (!MoveRegs.empty()) {
2079 Register Reg = MoveRegs.pop_back_val();
2080 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2081 }
2082
2083 LLVM_DEBUG(dbgs() << "Folded " << MI);
2084
2085 return true;
2086}
2087
// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
// For GFX90A and later, this is pretty much always a good thing, but for GFX908
// there are cases where it can create a lot more AGPR-AGPR copies, which are
// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
//
// This function looks at all AGPR PHIs in a basic block and collects their
// operands. Then, it checks for registers that are used more than once across
// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
// having to create one VGPR temporary per use, which can get very messy if
// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
// element).
2099//
2100// Example
2101// a:
2102// %in:agpr_256 = COPY %foo:vgpr_256
2103// c:
2104// %x:agpr_32 = ..
2105// b:
2106// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2107// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2108// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2109// =>
2110// a:
2111// %in:agpr_256 = COPY %foo:vgpr_256
2112// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2113// %tmp_agpr:agpr_32 = COPY %tmp
2114// c:
2115// %x:agpr_32 = ..
2116// b:
2117// %0:areg = PHI %tmp_agpr, %a, %x, %c
2118// %1:areg = PHI %tmp_agpr, %a, %y, %c
2119// %2:areg = PHI %tmp_agpr, %a, %z, %c
2120bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2121 // This is only really needed on GFX908 where AGPR-AGPR copies are
2122 // unreasonably difficult.
2123 if (ST->hasGFX90AInsts())
2124 return false;
2125
2126 // Look at all AGPR Phis and collect the register + subregister used.
2127 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2128 RegToMO;
2129
2130 for (auto &MI : MBB) {
2131 if (!MI.isPHI())
2132 break;
2133
2134 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2135 continue;
2136
2137 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2138 MachineOperand &PhiMO = MI.getOperand(K);
2139 if (!PhiMO.getSubReg())
2140 continue;
2141 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2142 }
2143 }
2144
2145 // For all (Reg, SubReg) pair that are used more than once, cache the value in
2146 // a VGPR.
2147 bool Changed = false;
2148 for (const auto &[Entry, MOs] : RegToMO) {
2149 if (MOs.size() == 1)
2150 continue;
2151
2152 const auto [Reg, SubReg] = Entry;
2153 MachineInstr *Def = MRI->getVRegDef(Reg);
2154 MachineBasicBlock *DefMBB = Def->getParent();
2155
2156 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2157 // out.
2158 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2159 Register TempVGPR =
2160 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2161 MachineInstr *VGPRCopy =
2162 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2163 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2164 .addReg(Reg, /* flags */ 0, SubReg);
2165
2166 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2167 Register TempAGPR = MRI->createVirtualRegister(ARC);
2168 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2169 TII->get(AMDGPU::COPY), TempAGPR)
2170 .addReg(TempVGPR);
2171
2172 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2173 for (MachineOperand *MO : MOs) {
2174 MO->setReg(TempAGPR);
2175 MO->setSubReg(AMDGPU::NoSubRegister);
2176 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2177 }
2178
2179 Changed = true;
2180 }
2181
2182 return Changed;
2183}
2184
2185bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
2186 if (skipFunction(MF.getFunction()))
2187 return false;
2188
2189 MRI = &MF.getRegInfo();
2190 ST = &MF.getSubtarget<GCNSubtarget>();
2191 TII = ST->getInstrInfo();
2192 TRI = &TII->getRegisterInfo();
2193 MFI = MF.getInfo<SIMachineFunctionInfo>();
2194
2195 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2196 // correctly handle signed zeros.
2197 //
2198 // FIXME: Also need to check strictfp
2199 bool IsIEEEMode = MFI->getMode().IEEE;
2200 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2201
2202 bool Changed = false;
2203 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2204 MachineOperand *CurrentKnownM0Val = nullptr;
2205 for (auto &MI : make_early_inc_range(*MBB)) {
2206 Changed |= tryFoldCndMask(MI);
2207
2208 if (tryFoldZeroHighBits(MI)) {
2209 Changed = true;
2210 continue;
2211 }
2212
2213 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2214 Changed = true;
2215 continue;
2216 }
2217
2218 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2219 Changed = true;
2220 continue;
2221 }
2222
2223 if (MI.mayLoad() && tryFoldLoad(MI)) {
2224 Changed = true;
2225 continue;
2226 }
2227
2228 if (TII->isFoldableCopy(MI)) {
2229 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2230 continue;
2231 }
2232
2233 // Saw an unknown clobber of m0, so we no longer know what it is.
2234 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2235 CurrentKnownM0Val = nullptr;
2236
2237 // TODO: Omod might be OK if there is NSZ only on the source
2238 // instruction, and not the omod multiply.
2239 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2240 !tryFoldOMod(MI))
2241 Changed |= tryFoldClamp(MI);
2242 }
2243
2244 Changed |= tryOptimizeAGPRPhis(*MBB);
2245 }
2246
2247 return Changed;
2248}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat)
Updates the operand at Idx in instruction Inst with the result of instruction Mat.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static bool isReg(const MCInst &MI, unsigned OpNo)
Module * Mod
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
static unsigned macToMad(unsigned Opc)
static bool isAGPRCopy(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, const MachineInstr &Copy, Register &OutReg, unsigned &OutSubReg)
Checks whether Copy is a AGPR -> VGPR copy.
static const TargetRegisterClass * getRegOpRC(const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const MachineOperand &MO)
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, uint32_t LHS, uint32_t RHS)
static int getOModValue(unsigned Opc, int64_t Val)
static bool isUseMIInFoldList(ArrayRef< FoldCandidate > FoldList, const MachineInstr *MI)
static unsigned getMovOpc(bool IsScalar)
static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc)
#define DEBUG_TYPE
static void appendFoldCandidate(SmallVectorImpl< FoldCandidate > &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp, bool Commuted=false, int ShrinkOp=-1)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
Value * RHS
Value * LHS
support::ulittle16_t & Lo
Definition: aarch32.cpp:206
support::ulittle16_t & Hi
Definition: aarch32.cpp:205
Class for arbitrary precision integers.
Definition: APInt.h:78
Represent the analysis usage information of a pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:256
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool isVariadic() const
Return true if this instruction can have a variable number of operands.
Definition: MCInstrDesc.h:261
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg=Register(), bool SkipPseudoOp=true)
Return the first instruction in MBB after I that is not a PHI, label or debug.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:572
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
Definition: MachineInstr.h:775
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:566
bool isRegSequence() const
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:498
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:699
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &)
substVirtReg - Substitute the current register with the virtual subregister Reg:SubReg.
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_GlobalAddress
Address of a global value.
@ MO_FrameIndex
Abstract Stack Frame Index.
@ MO_Register
Register operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:95
size_t size() const
Definition: SmallVector.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:587
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:951
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
static const unsigned CommuteAnyOperandIndex
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo)
Does this operand support only inlinable literals?
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY int getFlatScratchInstSSfromSV(uint16_t Opcode)
@ Entry
Definition: COFF.h:826
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1454
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
char & SIFoldOperandsID
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
FunctionPass * createSIFoldOperandsPass()
DWARFExpression::Operation Op
void initializeSIFoldOperandsPass(PassRegistry &)
iterator_range< df_iterator< T > > depth_first(const T &G)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
Description of the encoding of one expression Op.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
A pair composed of a register and a sub-register index.