1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
14#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15#include "SIInstrInfo.h"
16#include "SIMachineFunctionInfo.h"
17#include "SIRegisterInfo.h"
22
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28/// Track a value we may want to fold into downstream users, applying
29/// subregister extracts along the way.
30struct FoldableDef {
31 union {
32 MachineOperand *OpToFold = nullptr;
33 uint64_t ImmToFold;
34 int FrameIndexToFold;
35 };
36
37 /// Register class of the originally defined value.
38 const TargetRegisterClass *DefRC = nullptr;
39
40 /// Track the original defining instruction for the value.
41 const MachineInstr *DefMI = nullptr;
42
43 /// Subregister to apply to the value at the use point.
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46 /// Kind of value stored in the union.
47 MachineOperand::MachineOperandType Kind;
48
49 FoldableDef() = delete;
50 FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
59 assert(FoldOp.isReg() || FoldOp.isGlobal());
60 OpToFold = &FoldOp;
61 }
62
63 DefMI = FoldOp.getParent();
64 }
65
66 FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
69 Kind(MachineOperand::MO_Immediate) {}
70
71 /// Copy the current def and apply \p SubReg to the value.
72 FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
78 bool isReg() const { return Kind == MachineOperand::MO_Register; }
79
80 Register getReg() const {
81 assert(isReg());
82 return OpToFold->getReg();
83 }
84
85 unsigned getSubReg() const {
86 assert(isReg());
87 return OpToFold->getSubReg();
88 }
89
90 bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
91
92 bool isFI() const {
93 return Kind == MachineOperand::MO_FrameIndex;
94 }
95
96 int getFI() const {
97 assert(isFI());
98 return FrameIndexToFold;
99 }
100
101 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
102
103 /// Return the effective immediate value defined by this instruction, after
104 /// application of any subregister extracts which may exist between the use
105 /// and def instruction.
106 std::optional<int64_t> getEffectiveImmVal() const {
107 assert(isImm());
108 return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
109 }
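 // For example, with ImmToFold = 0x0000000100000002, applying sub0 gives an
 // effective value of 0x2, sub1 gives 0x1, and NoSubRegister returns the full
 // 64-bit value unchanged.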
110
111 /// Check if it is legal to fold this effective value into \p MI's \p OpNo
112 /// operand.
113 bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
114 unsigned OpIdx) const {
115 switch (Kind) {
116 case MachineOperand::MO_Immediate: {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121 // TODO: Should verify the subregister index is supported by the class
122 // TODO: Avoid the temporary MachineOperand
123 MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
126 case MachineOperand::MO_FrameIndex: {
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
129 MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133 // TODO: Try to apply DefSubReg, for global address we can extract
134 // low/high.
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
140 llvm_unreachable("covered MachineOperand kind switch");
141 }
142};
143
144struct FoldCandidate {
145 MachineInstr *UseMI;
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
159 assert(isFI());
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
167 Register getReg() const { return Def.getReg(); }
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
174class SIFoldOperandsImpl {
175public:
176 MachineFunction *MF;
177 MachineRegisterInfo *MRI;
178 const SIInstrInfo *TII;
179 const SIRegisterInfo *TRI;
180 const GCNSubtarget *ST;
181 const SIMachineFunctionInfo *MFI;
182
183 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
184 const FoldableDef &OpToFold) const;
185
186 // TODO: Just use TII::getVALUOp
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
190 if (ST->hasAddNoCarryInsts())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
206 MachineInstr &MI) const;
207
208 bool updateOperand(FoldCandidate &Fold) const;
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213 /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
217 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
218 MachineInstr *MI, unsigned OpNo,
219 const FoldableDef &OpToFold) const;
220 bool isUseSafeToFold(const MachineInstr &MI,
221 const MachineOperand &UseMO) const;
222
223 const TargetRegisterClass *getRegSeqInit(
224 MachineInstr &RegSeq,
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
227 const TargetRegisterClass *
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
229 Register UseReg) const;
230
231 std::pair<int64_t, const TargetRegisterClass *>
232 isRegSeqSplat(MachineInstr &RegSeg) const;
233
234 bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
235 int64_t SplatVal,
236 const TargetRegisterClass *SplatRC) const;
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
240 SmallVectorImpl<FoldCandidate> &FoldList) const;
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
242 SmallVectorImpl<FoldCandidate> &FoldList,
243 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
244
245 bool tryConstantFoldOp(MachineInstr *MI) const;
246 bool tryFoldCndMask(MachineInstr &MI) const;
247 bool tryFoldZeroHighBits(MachineInstr &MI) const;
248 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
249
250 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
251 bool tryFoldFoldableCopy(MachineInstr &MI,
252 MachineOperand *&CurrentKnownM0Val) const;
253
254 const MachineOperand *isClamp(const MachineInstr &MI) const;
255 bool tryFoldClamp(MachineInstr &MI);
256
257 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
258 bool tryFoldOMod(MachineInstr &MI);
259 bool tryFoldRegSequence(MachineInstr &MI);
260 bool tryFoldPhiAGPR(MachineInstr &MI);
261 bool tryFoldLoad(MachineInstr &MI);
262
263 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
264
265public:
266 SIFoldOperandsImpl() = default;
267
268 bool run(MachineFunction &MF);
269};
270
271class SIFoldOperandsLegacy : public MachineFunctionPass {
272public:
273 static char ID;
274
275 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
276
277 bool runOnMachineFunction(MachineFunction &MF) override {
278 if (skipFunction(MF.getFunction()))
279 return false;
280 return SIFoldOperandsImpl().run(MF);
281 }
282
283 StringRef getPassName() const override { return "SI Fold Operands"; }
284
285 void getAnalysisUsage(AnalysisUsage &AU) const override {
286 AU.setPreservesCFG();
287 MachineFunctionPass::getAnalysisUsage(AU);
288 }
289
290 MachineFunctionProperties getRequiredProperties() const override {
291 return MachineFunctionProperties().setIsSSA();
292 }
293};
294
295} // End anonymous namespace.
296
297INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
298 false)
299
300char SIFoldOperandsLegacy::ID = 0;
301
302char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
303
304static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
305 const TargetRegisterInfo &TRI,
306 const MachineOperand &MO) {
307 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
308 if (const TargetRegisterClass *SubRC =
309 TRI.getSubRegisterClass(RC, MO.getSubReg()))
310 RC = SubRC;
311 return RC;
312}
313
314// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
315static unsigned macToMad(unsigned Opc) {
316 switch (Opc) {
317 case AMDGPU::V_MAC_F32_e64:
318 return AMDGPU::V_MAD_F32_e64;
319 case AMDGPU::V_MAC_F16_e64:
320 return AMDGPU::V_MAD_F16_e64;
321 case AMDGPU::V_FMAC_F32_e64:
322 return AMDGPU::V_FMA_F32_e64;
323 case AMDGPU::V_FMAC_F16_e64:
324 return AMDGPU::V_FMA_F16_gfx9_e64;
325 case AMDGPU::V_FMAC_F16_t16_e64:
326 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
327 case AMDGPU::V_FMAC_F16_fake16_e64:
328 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
329 case AMDGPU::V_FMAC_LEGACY_F32_e64:
330 return AMDGPU::V_FMA_LEGACY_F32_e64;
331 case AMDGPU::V_FMAC_F64_e64:
332 return AMDGPU::V_FMA_F64_e64;
333 }
334 return AMDGPU::INSTRUCTION_LIST_END;
335}
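// Schematic example of why this mapping helps (operand lists abbreviated):
// the MAC/FMAC forms tie src2 to the destination, so an immediate cannot be
// folded into it:
//   %d = V_FMAC_F32_e64 %a, %b, %d(tied)
// whereas the untied FMA/MAD form accepts the fold:
//   %d = V_FMA_F32_e64 %a, %b, 1.0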
336
337// TODO: Add heuristic that the frame index might not fit in the addressing mode
338// immediate offset to avoid materializing in loops.
339bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
340 const FoldableDef &OpToFold) const {
341 if (!OpToFold.isFI())
342 return false;
343
344 const unsigned Opc = UseMI.getOpcode();
345 switch (Opc) {
346 case AMDGPU::S_ADD_I32:
347 case AMDGPU::S_ADD_U32:
348 case AMDGPU::V_ADD_U32_e32:
349 case AMDGPU::V_ADD_CO_U32_e32:
350 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
351 // to insert the wave size shift at every point we use the index.
352 // TODO: Fix depending on visit order to fold immediates into the operand
353 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
354 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
355 case AMDGPU::V_ADD_U32_e64:
356 case AMDGPU::V_ADD_CO_U32_e64:
357 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
358 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
359 default:
360 break;
361 }
362
363 if (TII->isMUBUF(UseMI))
364 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
365 if (!TII->isFLATScratch(UseMI))
366 return false;
367
368 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
369 if (OpNo == SIdx)
370 return true;
371
372 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
373 return OpNo == VIdx && SIdx == -1;
374}
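// For example, "%x = S_ADD_I32 %y, 16" qualifies when %y is the register
// being replaced by the frame index and this add is its only non-debug use;
// for memory accesses, only the MUBUF vaddr and flat-scratch saddr/vaddr
// operands are candidates.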
375
376/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
377///
378/// => %vgpr = V_ADD_U32 x, frameindex
379bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
380 Register DstReg, Register SrcReg, MachineInstr &MI) const {
381 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
382 MRI->hasOneNonDBGUse(SrcReg)) {
383 MachineInstr *Def = MRI->getVRegDef(SrcReg);
384 if (!Def || Def->getNumOperands() != 4)
385 return false;
386
387 MachineOperand *Src0 = &Def->getOperand(1);
388 MachineOperand *Src1 = &Def->getOperand(2);
389
390 // TODO: This is profitable with more operand types, and for more
391 // opcodes. But ultimately this is working around poor / nonexistent
392 // regbankselect.
393 if (!Src0->isFI() && !Src1->isFI())
394 return false;
395
396 if (Src0->isFI())
397 std::swap(Src0, Src1);
398
399 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
400 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
401 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
402 !Def->getOperand(3).isDead()) // Check if scc is dead
403 return false;
404
405 MachineBasicBlock *MBB = Def->getParent();
406 const DebugLoc &DL = Def->getDebugLoc();
407 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
408 MachineInstrBuilder Add =
409 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
410
411 if (Add->getDesc().getNumDefs() == 2) {
412 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
413 Add.addDef(CarryOutReg, RegState::Dead);
414 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
415 }
416
417 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
418 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
419 Add.addImm(0);
420
421 Def->eraseFromParent();
422 MI.eraseFromParent();
423 return true;
424 }
425
426 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
427
428 MachineBasicBlock::LivenessQueryResult Liveness =
429 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
430 if (Liveness == MachineBasicBlock::LQR_Dead) {
431 // TODO: If src1 satisfies operand constraints, use vop3 version.
432 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
433 .add(*Src0)
434 .add(*Src1)
435 .setOperandDead(3) // implicit-def $vcc
436 .setMIFlags(Def->getFlags());
437 Def->eraseFromParent();
438 MI.eraseFromParent();
439 return true;
440 }
441 }
442
443 return false;
444}
445
446FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
447 return new SIFoldOperandsLegacy();
448}
449
450bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
451 unsigned UseOpNo,
452 int64_t ImmVal) const {
453 const uint64_t TSFlags = MI->getDesc().TSFlags;
454
455 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
456 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
457 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
458 return false;
459
460 const MachineOperand &Old = MI->getOperand(UseOpNo);
461 int OpNo = MI->getOperandNo(&Old);
462
463 unsigned Opcode = MI->getOpcode();
464 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
465 switch (OpType) {
466 default:
467 return false;
475 // VOP3 packed instructions ignore op_sel source modifiers, so we cannot
476 // encode two different constants.
477 if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
478 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
479 return false;
480 break;
481 }
482
483 return true;
484}
485
486bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
487 int64_t ImmVal) const {
488 MachineOperand &Old = MI->getOperand(UseOpNo);
489 unsigned Opcode = MI->getOpcode();
490 int OpNo = MI->getOperandNo(&Old);
491 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
492
493 // If the literal can be inlined as-is, apply it and short-circuit the
494 // tests below. The main motivation for this is to avoid unintuitive
495 // uses of opsel.
496 if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
497 Old.ChangeToImmediate(ImmVal);
498 return true;
499 }
500
501 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
502 // op_sel in a way that allows an inline constant.
503 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
504 unsigned SrcIdx = ~0;
505 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
506 ModName = AMDGPU::OpName::src0_modifiers;
507 SrcIdx = 0;
508 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
509 ModName = AMDGPU::OpName::src1_modifiers;
510 SrcIdx = 1;
511 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
512 ModName = AMDGPU::OpName::src2_modifiers;
513 SrcIdx = 2;
514 }
515 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
516 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
517 MachineOperand &Mod = MI->getOperand(ModIdx);
518 unsigned ModVal = Mod.getImm();
519
520 uint16_t ImmLo =
521 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
522 uint16_t ImmHi =
523 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
524 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
525 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
526
527 // Helper function that attempts to inline the given value with a newly
528 // chosen opsel pattern.
529 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
530 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
531 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
532 Old.ChangeToImmediate(Imm);
533 return true;
534 }
535
536 // Try to shuffle the halves around and leverage opsel to get an inline
537 // constant.
538 uint16_t Lo = static_cast<uint16_t>(Imm);
539 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
540 if (Lo == Hi) {
541 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
542 Mod.setImm(NewModVal);
543 Old.ChangeToImmediate(Lo);
544 return true;
545 }
546
547 if (static_cast<int16_t>(Lo) < 0) {
548 int32_t SExt = static_cast<int16_t>(Lo);
549 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
550 Mod.setImm(NewModVal);
551 Old.ChangeToImmediate(SExt);
552 return true;
553 }
554 }
555
556 // This check is only useful for integer instructions
557 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
558 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
559 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
560 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
561 return true;
562 }
563 }
564 } else {
565 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
566 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
567 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
568 Old.ChangeToImmediate(Swapped);
569 return true;
570 }
571 }
572
573 return false;
574 };
575
576 if (tryFoldToInline(Imm))
577 return true;
578
579 // Replace integer addition by subtraction and vice versa if it allows
580 // folding the immediate to an inline constant.
581 //
582 // We should only ever get here for SrcIdx == 1 due to canonicalization
583 // earlier in the pipeline, but we double-check here to be safe / fully
584 // general.
585 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
586 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
587 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
588 unsigned ClampIdx =
589 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
590 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
591
592 if (!Clamp) {
593 uint16_t NegLo = -static_cast<uint16_t>(Imm);
594 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
595 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
596
597 if (tryFoldToInline(NegImm)) {
598 unsigned NegOpcode =
599 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
600 MI->setDesc(TII->get(NegOpcode));
601 return true;
602 }
603 }
604 }
605
606 return false;
607}
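// Worked example for the add/sub rewrite above, assuming a V2INT16 operand
// and no clamp: folding -32 into both halves of a v_pk_add_u16 source fails
// (neither 0xffe0ffe0 nor any opsel shuffle of it is an inline constant),
// but the negated per-half value 32 is inline, so the instruction becomes a
// v_pk_sub_u16 whose source is the inline constant 32 with both op_sel bits
// cleared so each half reads it.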
608
609bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
610 MachineInstr *MI = Fold.UseMI;
611 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
612 assert(Old.isReg());
613
614 std::optional<int64_t> ImmVal;
615 if (Fold.isImm())
616 ImmVal = Fold.Def.getEffectiveImmVal();
617
618 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
619 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
620 return true;
621
622 // We can't represent the candidate as an inline constant. Try as a literal
623 // with the original opsel, checking constant bus limitations.
624 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
625 int OpNo = MI->getOperandNo(&Old);
626 if (!TII->isOperandLegal(*MI, OpNo, &New))
627 return false;
628 Old.ChangeToImmediate(*ImmVal);
629 return true;
630 }
631
632 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
633 MachineBasicBlock *MBB = MI->getParent();
634 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
635 if (Liveness != MachineBasicBlock::LQR_Dead) {
636 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
637 return false;
638 }
639
640 int Op32 = Fold.ShrinkOpcode;
641 MachineOperand &Dst0 = MI->getOperand(0);
642 MachineOperand &Dst1 = MI->getOperand(1);
643 assert(Dst0.isDef() && Dst1.isDef());
644
645 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
646
647 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
648 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
649
650 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
651
652 if (HaveNonDbgCarryUse) {
653 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
654 Dst1.getReg())
655 .addReg(AMDGPU::VCC, RegState::Kill);
656 }
657
658 // Keep the old instruction around to avoid breaking iterators, but
659 // replace it with a dummy instruction to remove uses.
660 //
661 // FIXME: We should not invert how this pass looks at operands to avoid
662 // this. Should track set of foldable movs instead of looking for uses
663 // when looking at a use.
664 Dst0.setReg(NewReg0);
665 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
666 MI->removeOperand(I);
667 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
668
669 if (Fold.Commuted)
670 TII->commuteInstruction(*Inst32, false);
671 return true;
672 }
673
674 assert(!Fold.needsShrink() && "not handled");
675
676 if (ImmVal) {
677 if (Old.isTied()) {
678 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
679 if (NewMFMAOpc == -1)
680 return false;
681 MI->setDesc(TII->get(NewMFMAOpc));
682 MI->untieRegOperand(0);
683 const MCInstrDesc &MCID = MI->getDesc();
684 for (unsigned I = 0; I < MI->getNumDefs(); ++I)
685 if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1)
686 MI->getOperand(I).setIsEarlyClobber(true);
687 }
688
689 // TODO: Should we try to avoid adding this to the candidate list?
690 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
691 int OpNo = MI->getOperandNo(&Old);
692 if (!TII->isOperandLegal(*MI, OpNo, &New))
693 return false;
694
695 Old.ChangeToImmediate(*ImmVal);
696 return true;
697 }
698
699 if (Fold.isGlobal()) {
700 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
701 Fold.Def.OpToFold->getOffset(),
702 Fold.Def.OpToFold->getTargetFlags());
703 return true;
704 }
705
706 if (Fold.isFI()) {
707 Old.ChangeToFrameIndex(Fold.getFI());
708 return true;
709 }
710
711 MachineOperand *New = Fold.Def.OpToFold;
712
713 // Verify the register is compatible with the operand.
714 if (const TargetRegisterClass *OpRC =
715 TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) {
716 const TargetRegisterClass *NewRC =
717 TRI->getRegClassForReg(*MRI, New->getReg());
718
719 const TargetRegisterClass *ConstrainRC = OpRC;
720 if (New->getSubReg()) {
721 ConstrainRC =
722 TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
723
724 if (!ConstrainRC)
725 return false;
726 }
727
728 if (New->getReg().isVirtual() &&
729 !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
730 LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
731 << TRI->getRegClassName(ConstrainRC) << '\n');
732 return false;
733 }
734 }
735
736 // Rework once the VS_16 register class is updated to include proper
737 // 16-bit SGPRs instead of 32-bit ones.
738 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
739 Old.setSubReg(AMDGPU::NoSubRegister);
740 if (New->getReg().isPhysical()) {
741 Old.substPhysReg(New->getReg(), *TRI);
742 } else {
743 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
744 Old.setIsUndef(New->isUndef());
745 }
746 return true;
747}
748
749static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
750 FoldCandidate &&Entry) {
751 // Skip additional folding on the same operand.
752 for (FoldCandidate &Fold : FoldList)
753 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
754 return;
755 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
756 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
757 FoldList.push_back(Entry);
758}
759
760static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
761 MachineInstr *MI, unsigned OpNo,
762 const FoldableDef &FoldOp,
763 bool Commuted = false, int ShrinkOp = -1) {
764 appendFoldCandidate(FoldList,
765 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
766}
767
768// Returns true if the instruction is a packed F32 instruction and the
769// corresponding scalar operand reads 32 bits and replicates the bits to both
770// channels.
772 const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) {
773 if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())
774 return false;
775 const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo];
777}
778
779// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
780// literal) and replicate the bits to both channels. Therefore, if the hi and
781// lo halves are not the same, we can't fold it.
783 const FoldableDef &OpToFold) {
784 assert(OpToFold.isImm() && "Expected immediate operand");
785 uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
786 uint32_t Lo = Lo_32(ImmVal);
787 uint32_t Hi = Hi_32(ImmVal);
788 return Lo == Hi;
789}
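// For example, a 64-bit splat of 2.0f (0x4000000040000000) has identical
// halves and can be folded, while the double constant 1.0
// (0x3ff0000000000000) does not and cannot.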
790
791bool SIFoldOperandsImpl::tryAddToFoldList(
792 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
793 const FoldableDef &OpToFold) const {
794 const unsigned Opc = MI->getOpcode();
795
796 auto tryToFoldAsFMAAKorMK = [&]() {
797 if (!OpToFold.isImm())
798 return false;
799
800 const bool TryAK = OpNo == 3;
801 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
802 MI->setDesc(TII->get(NewOpc));
803
804 // We have to fold into operand which would be Imm not into OpNo.
805 bool FoldAsFMAAKorMK =
806 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
807 if (FoldAsFMAAKorMK) {
808 // Untie Src2 of fmac.
809 MI->untieRegOperand(3);
810 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
811 if (OpNo == 1) {
812 MachineOperand &Op1 = MI->getOperand(1);
813 MachineOperand &Op2 = MI->getOperand(2);
814 Register OldReg = Op1.getReg();
815 // Operand 2 might be an inlinable constant
816 if (Op2.isImm()) {
817 Op1.ChangeToImmediate(Op2.getImm());
818 Op2.ChangeToRegister(OldReg, false);
819 } else {
820 Op1.setReg(Op2.getReg());
821 Op2.setReg(OldReg);
822 }
823 }
824 return true;
825 }
826 MI->setDesc(TII->get(Opc));
827 return false;
828 };
829
830 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
831 if (!IsLegal && OpToFold.isImm()) {
832 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
833 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
834 }
835
836 if (!IsLegal) {
837 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
838 unsigned NewOpc = macToMad(Opc);
839 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
840 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
841 // to fold the operand.
842 MI->setDesc(TII->get(NewOpc));
843 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
844 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
845 if (AddOpSel)
846 MI->addOperand(MachineOperand::CreateImm(0));
847 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
848 if (FoldAsMAD) {
849 MI->untieRegOperand(OpNo);
850 return true;
851 }
852 if (AddOpSel)
853 MI->removeOperand(MI->getNumExplicitOperands() - 1);
854 MI->setDesc(TII->get(Opc));
855 }
856
857 // Special case for s_fmac_f32 if we are trying to fold into Src2.
858 // By transforming into fmaak we can untie Src2 and make folding legal.
859 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
860 if (tryToFoldAsFMAAKorMK())
861 return true;
862 }
863
864 // Special case for s_setreg_b32
865 if (OpToFold.isImm()) {
866 unsigned ImmOpc = 0;
867 if (Opc == AMDGPU::S_SETREG_B32)
868 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
869 else if (Opc == AMDGPU::S_SETREG_B32_mode)
870 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
871 if (ImmOpc) {
872 MI->setDesc(TII->get(ImmOpc));
873 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
874 return true;
875 }
876 }
877
878 // Operand is not legal, so try to commute the instruction to
879 // see if this makes it possible to fold.
880 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
881 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
882 if (!CanCommute)
883 return false;
884
885 MachineOperand &Op = MI->getOperand(OpNo);
886 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
887
888 // One of operands might be an Imm operand, and OpNo may refer to it after
889 // the call of commuteInstruction() below. Such situations are avoided
890 // here explicitly as OpNo must be a register operand to be a candidate
891 // for memory folding.
892 if (!Op.isReg() || !CommutedOp.isReg())
893 return false;
894
895 // The same situation with an immediate could reproduce if both inputs are
896 // the same register.
897 if (Op.isReg() && CommutedOp.isReg() &&
898 (Op.getReg() == CommutedOp.getReg() &&
899 Op.getSubReg() == CommutedOp.getSubReg()))
900 return false;
901
902 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
903 return false;
904
905 int Op32 = -1;
906 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
907 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
908 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
909 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
910 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
911 return false;
912 }
913
914 // Verify the other operand is a VGPR, otherwise we would violate the
915 // constant bus restriction.
916 MachineOperand &OtherOp = MI->getOperand(OpNo);
917 if (!OtherOp.isReg() ||
918 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
919 return false;
920
921 assert(MI->getOperand(1).isDef());
922
923 // Make sure to get the 32-bit version of the commuted opcode.
924 unsigned MaybeCommutedOpc = MI->getOpcode();
925 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
926 }
927
928 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
929 Op32);
930 return true;
931 }
932
933 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
934 // By changing into fmamk we can untie Src2.
935 // If folding for Src0 happens first and it is identical operand to Src1 we
936 // should avoid transforming into fmamk which requires commuting as it would
937 // cause folding into Src1 to fail later on due to wrong OpNo used.
938 if (Opc == AMDGPU::S_FMAC_F32 &&
939 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
940 if (tryToFoldAsFMAAKorMK())
941 return true;
942 }
943
944 // Special case for PK_F32 instructions if we are trying to fold an imm to
945 // src0 or src1.
946 if (OpToFold.isImm() &&
949 return false;
950
951 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
952 return true;
953}
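// Example of the S_FMAC_F32 rewrites above: an immediate destined for src2 is
// folded as the trailing literal of S_FMAAK_F32 (d = s0 * s1 + K); an
// immediate for src0/src1 becomes the K operand of S_FMAMK_F32
// (d = s0 * K + s1), with the tied accumulator untied in both cases.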
954
955bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
956 const MachineOperand &UseMO) const {
957 // Operands of SDWA instructions must be registers.
958 return !TII->isSDWA(MI);
959}
960
961static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
962 const MachineRegisterInfo &MRI,
963 Register SrcReg) {
964 MachineOperand *Sub = nullptr;
965 for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
966 SubDef && TII.isFoldableCopy(*SubDef);
967 SubDef = MRI.getVRegDef(Sub->getReg())) {
968 unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
969 MachineOperand &SrcOp = SubDef->getOperand(SrcIdx);
970
971 if (SrcOp.isImm())
972 return &SrcOp;
973 if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
974 break;
975 Sub = &SrcOp;
976 // TODO: Support compose
977 if (SrcOp.getSubReg())
978 break;
979 }
980
981 return Sub;
982}
983
984const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
985 MachineInstr &RegSeq,
986 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
987
988 assert(RegSeq.isRegSequence());
989
990 const TargetRegisterClass *RC = nullptr;
991
992 for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
993 MachineOperand &SrcOp = RegSeq.getOperand(I);
994 unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
995
996 // Only accept reg_sequence with uniform reg class inputs for simplicity.
997 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
998 if (!RC)
999 RC = OpRC;
1000 else if (!TRI->getCommonSubClass(RC, OpRC))
1001 return nullptr;
1002
1003 if (SrcOp.getSubReg()) {
1004 // TODO: Handle subregister compose
1005 Defs.emplace_back(&SrcOp, SubRegIdx);
1006 continue;
1007 }
1008
1009 MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
1010 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
1011 Defs.emplace_back(DefSrc, SubRegIdx);
1012 continue;
1013 }
1014
1015 Defs.emplace_back(&SrcOp, SubRegIdx);
1016 }
1017
1018 return RC;
1019}
1020
1021// Find a def of the UseReg, check if it is a reg_sequence and find initializers
1022// for each subreg, tracking it to an immediate if possible. Returns the
1023// register class of the inputs on success.
1024const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
1025 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
1026 Register UseReg) const {
1027 MachineInstr *Def = MRI->getVRegDef(UseReg);
1028 if (!Def || !Def->isRegSequence())
1029 return nullptr;
1030
1031 return getRegSeqInit(*Def, Defs);
1032}
1033
1034std::pair<int64_t, const TargetRegisterClass *>
1035SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
1036 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
1037 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
1038 if (!SrcRC)
1039 return {};
1040
1041 bool TryToMatchSplat64 = false;
1042
1043 int64_t Imm;
1044 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
1045 const MachineOperand *Op = Defs[I].first;
1046 if (!Op->isImm())
1047 return {};
1048
1049 int64_t SubImm = Op->getImm();
1050 if (!I) {
1051 Imm = SubImm;
1052 continue;
1053 }
1054
1055 if (Imm != SubImm) {
1056 if (I == 1 && (E & 1) == 0) {
1057 // If we have an even number of inputs, there's a chance this is a
1058 // 64-bit element splat broken into 32-bit pieces.
1059 TryToMatchSplat64 = true;
1060 break;
1061 }
1062
1063 return {}; // Can only fold splat constants
1064 }
1065 }
1066
1067 if (!TryToMatchSplat64)
1068 return {Defs[0].first->getImm(), SrcRC};
1069
1070 // Fall back to recognizing 64-bit splats broken into 32-bit pieces
1071 // (i.e. recognize every other element is 0 for 64-bit immediates)
1072 int64_t SplatVal64;
1073 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
1074 const MachineOperand *Op0 = Defs[I].first;
1075 const MachineOperand *Op1 = Defs[I + 1].first;
1076
1077 if (!Op0->isImm() || !Op1->isImm())
1078 return {};
1079
1080 unsigned SubReg0 = Defs[I].second;
1081 unsigned SubReg1 = Defs[I + 1].second;
1082
1083 // Assume we're going to generally encounter reg_sequences with sorted
1084 // subreg indexes, so reject any that aren't consecutive.
1085 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
1086 TRI->getChannelFromSubReg(SubReg1))
1087 return {};
1088
1089 int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
1090 if (I == 0)
1091 SplatVal64 = MergedVal;
1092 else if (SplatVal64 != MergedVal)
1093 return {};
1094 }
1095
1096 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
1097 MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);
1098
1099 return {SplatVal64, RC64};
1100}
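// Example of the 64-bit fallback above: a reg_sequence whose inputs are the
// immediates 0x0 (sub0), 0x1 (sub1), 0x0 (sub2), 0x1 (sub3) is recognized as
// a splat of the 64-bit value 0x0000000100000000 with a 64-bit register
// class.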
1101
1102bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1103 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
1104 const TargetRegisterClass *SplatRC) const {
1105 const MCInstrDesc &Desc = UseMI->getDesc();
1106 if (UseOpIdx >= Desc.getNumOperands())
1107 return false;
1108
1109 // Filter out unhandled pseudos.
1110 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1111 return false;
1112
1113 int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);
1114 if (RCID == -1)
1115 return false;
1116
1117 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
1118
1119 // Special case 0/-1, since when interpreted as a 64-bit element both halves
1120 // have the same bits. These are the only cases where a splat has the same
1121 // interpretation for 32-bit and 64-bit splats.
1122 if (SplatVal != 0 && SplatVal != -1) {
1123 // We need to figure out the scalar type read by the operand. e.g. the MFMA
1124 // operand will be AReg_128, and we want to check if it's compatible with an
1125 // AReg_32 constant.
1126 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
1127 switch (OpTy) {
1132 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1133 break;
1137 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1138 break;
1139 default:
1140 return false;
1141 }
1142
1143 if (!TRI->getCommonSubClass(OpRC, SplatRC))
1144 return false;
1145 }
1146
1147 MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
1148 if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
1149 return false;
1150
1151 return true;
1152}
1153
1154bool SIFoldOperandsImpl::tryToFoldACImm(
1155 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
1156 SmallVectorImpl<FoldCandidate> &FoldList) const {
1157 const MCInstrDesc &Desc = UseMI->getDesc();
1158 if (UseOpIdx >= Desc.getNumOperands())
1159 return false;
1160
1161 // Filter out unhandled pseudos.
1162 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1163 return false;
1164
1165 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
1168 return false;
1169 appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
1170 return true;
1171 }
1172
1173 return false;
1174}
1175
1176void SIFoldOperandsImpl::foldOperand(
1177 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
1178 SmallVectorImpl<FoldCandidate> &FoldList,
1179 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
1180 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
1181
1182 if (!isUseSafeToFold(*UseMI, *UseOp))
1183 return;
1184
1185 // FIXME: Fold operands with subregs.
1186 if (UseOp->isReg() && OpToFold.isReg()) {
1187 if (UseOp->isImplicit())
1188 return;
1189 // Allow folding from SGPRs to 16-bit VGPRs.
1190 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
1191 (UseOp->getSubReg() != AMDGPU::lo16 ||
1192 !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1193 return;
1194 }
1195
1196 // Special case for REG_SEQUENCE: We can't fold literals into
1197 // REG_SEQUENCE instructions, so we have to fold them into the
1198 // uses of REG_SEQUENCE.
1199 if (UseMI->isRegSequence()) {
1200 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
1201 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
1202
1203 int64_t SplatVal;
1204 const TargetRegisterClass *SplatRC;
1205 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
1206
1207 // Grab the use operands first
1208 SmallVector<MachineOperand *> UsesToProcess(
1209 llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
1210 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1211 MachineOperand *RSUse = UsesToProcess[I];
1212 MachineInstr *RSUseMI = RSUse->getParent();
1213 unsigned OpNo = RSUseMI->getOperandNo(RSUse);
1214
1215 if (SplatRC) {
1216 if (RSUseMI->isCopy()) {
1217 Register DstReg = RSUseMI->getOperand(0).getReg();
1218 append_range(UsesToProcess,
1219 make_pointer_range(MRI->use_nodbg_operands(DstReg)));
1220 continue;
1221 }
1222 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1223 FoldableDef SplatDef(SplatVal, SplatRC);
1224 appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
1225 continue;
1226 }
1227 }
1228
1229 // TODO: Handle general compose
1230 if (RSUse->getSubReg() != RegSeqDstSubReg)
1231 continue;
1232
1233 // FIXME: We should avoid recursing here. There should be a cleaner split
1234 // between the in-place mutations and adding to the fold list.
1235 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1236 CopiesToReplace);
1237 }
1238
1239 return;
1240 }
1241
1242 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
1243 return;
1244
1245 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
1246 // Verify that this is a stack access.
1247 // FIXME: Should probably use stack pseudos before frame lowering.
1248
1249 if (TII->isMUBUF(*UseMI)) {
1250 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1251 MFI->getScratchRSrcReg())
1252 return;
1253
1254 // Ensure this is either relative to the current frame or the current
1255 // wave.
1256 MachineOperand &SOff =
1257 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
1258 if (!SOff.isImm() || SOff.getImm() != 0)
1259 return;
1260 }
1261
1262 const unsigned Opc = UseMI->getOpcode();
1263 if (TII->isFLATScratch(*UseMI) &&
1264 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
1265 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
1266 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
1267 unsigned CPol =
1268 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
1269 if ((CPol & AMDGPU::CPol::SCAL) &&
1271 return;
1272
1273 UseMI->setDesc(TII->get(NewOpc));
1274 }
1275
1276 // A frame index will resolve to a positive constant, so it should always be
1277 // safe to fold the addressing mode, even pre-GFX9.
1278 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
1279
1280 return;
1281 }
1282
1283 bool FoldingImmLike =
1284 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1285
1286 if (FoldingImmLike && UseMI->isCopy()) {
1287 Register DestReg = UseMI->getOperand(0).getReg();
1288 Register SrcReg = UseMI->getOperand(1).getReg();
1289 unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
1290 assert(SrcReg.isVirtual());
1291
1292 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
1293
1294 // Don't fold into a copy to a physical register with the same class. Doing
1295 // so would interfere with the register coalescer's logic which would avoid
1296 // redundant initializations.
1297 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
1298 return;
1299
1300 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1301 // In order to fold immediates into copies, we need to change the copy to a
1302 // MOV. Find a compatible mov instruction with the value.
1303 for (unsigned MovOp :
1304 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1305 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1306 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1307 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1308 const MCInstrDesc &MovDesc = TII->get(MovOp);
1309 const TargetRegisterClass *MovDstRC =
1310 TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0]));
1311
1312 // Fold if the destination register class of the MOV instruction (ResRC)
1313 // is a superclass of (or equal to) the destination register class of the
1314 // COPY (DestRC). If this condition fails, folding would be illegal.
1315 if (!DestRC->hasSuperClassEq(MovDstRC))
1316 continue;
1317
1318 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1319
1320 int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]);
1321 if (RegClassID != -1) {
1322 const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID);
1323
1324 if (UseSubReg)
1325 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1326
1327 // FIXME: We should be able to directly check immediate operand legality
1328 // for all cases, but gfx908 hacks break.
1329 if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
1330 (!OpToFold.isImm() ||
1331 !TII->isImmOperandLegal(MovDesc, SrcIdx,
1332 *OpToFold.getEffectiveImmVal())))
1333 break;
1334
1335 if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1336 break;
1337
1338 // FIXME: This is mutating the instruction only and deferring the actual
1339 // fold of the immediate
1340 } else {
1341 // For the _IMM_PSEUDO cases, there can be value restrictions on the
1342 // immediate to verify. Technically we should always verify this, but it
1343 // only matters for these concrete cases.
1344 // TODO: Handle non-imm case if it's useful.
1345 if (!OpToFold.isImm() ||
1346 !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1347 break;
1348 }
1349
1350 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
1351 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
1352 while (ImpOpI != ImpOpE) {
1353 MachineInstr::mop_iterator Tmp = ImpOpI;
1354 ImpOpI++;
1355 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
1356 }
1357 UseMI->setDesc(MovDesc);
1358
1359 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
1360 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1361 MachineOperand NewSrcOp(SrcOp);
1362 MachineFunction *MF = UseMI->getMF();
1363 UseMI->removeOperand(1);
1364 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1365 UseMI->addOperand(NewSrcOp); // src0
1366 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1367 UseOpIdx = SrcIdx;
1368 UseOp = &UseMI->getOperand(UseOpIdx);
1369 }
1370 CopiesToReplace.push_back(UseMI);
1371 break;
1372 }
1373
1374 // We failed to replace the copy, so give up.
1375 if (UseMI->getOpcode() == AMDGPU::COPY)
1376 return;
1377
1378 } else {
1379 if (UseMI->isCopy() && OpToFold.isReg() &&
1380 UseMI->getOperand(0).getReg().isVirtual() &&
1381 !UseMI->getOperand(1).getSubReg() &&
1382 OpToFold.DefMI->implicit_operands().empty()) {
1383 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
1384 << *UseMI);
1385 unsigned Size = TII->getOpSize(*UseMI, 1);
1386 Register UseReg = OpToFold.getReg();
1387 UseMI->getOperand(1).setReg(UseReg);
1388 unsigned SubRegIdx = OpToFold.getSubReg();
1389 // Hack to allow 32-bit SGPRs to be folded into True16 instructions
1390 // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1391 // VS_16RegClass
1392 //
1393 // Excerpt from AMDGPUGenRegisterInfoEnums.inc
1394 // NoSubRegister, //0
1395 // hi16, // 1
1396 // lo16, // 2
1397 // sub0, // 3
1398 // ...
1399 // sub1, // 11
1400 // sub1_hi16, // 12
1401 // sub1_lo16, // 13
1402 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1403 if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1404 TRI->isSGPRReg(*MRI, UseReg)) {
1405 // Produce the 32 bit subregister index to which the 16-bit subregister
1406 // is aligned.
1407 if (SubRegIdx > AMDGPU::sub1) {
1408 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1409 M |= M.getLane(M.getHighestLane() - 1);
1410 SmallVector<unsigned, 4> Indexes;
1411 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1412 Indexes);
1413 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1414 SubRegIdx = Indexes[0];
1415 // 32-bit registers do not have a sub0 index
1416 } else if (TII->getOpSize(*UseMI, 1) == 4)
1417 SubRegIdx = 0;
1418 else
1419 SubRegIdx = AMDGPU::sub0;
1420 }
1421 UseMI->getOperand(1).setSubReg(SubRegIdx);
1422 UseMI->getOperand(1).setIsKill(false);
1423 CopiesToReplace.push_back(UseMI);
1424 OpToFold.OpToFold->setIsKill(false);
1425
1426 // Remove kill flags as kills may now be out of order with uses.
1427 MRI->clearKillFlags(UseReg);
1428 if (foldCopyToAGPRRegSequence(UseMI))
1429 return;
1430 }
1431
1432 unsigned UseOpc = UseMI->getOpcode();
1433 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1434 (UseOpc == AMDGPU::V_READLANE_B32 &&
1435 (int)UseOpIdx ==
1436 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1437 // %vgpr = V_MOV_B32 imm
1438 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1439 // =>
1440 // %sgpr = S_MOV_B32 imm
1441 if (FoldingImmLike) {
1442 if (execMayBeModifiedBeforeUse(*MRI,
1443 UseMI->getOperand(UseOpIdx).getReg(),
1444 *OpToFold.DefMI, *UseMI))
1445 return;
1446
1447 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1448
1449 if (OpToFold.isImm()) {
1450 UseMI->getOperand(1).ChangeToImmediate(
1451 *OpToFold.getEffectiveImmVal());
1452 } else if (OpToFold.isFI())
1453 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
1454 else {
1455 assert(OpToFold.isGlobal());
1456 UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
1457 OpToFold.OpToFold->getOffset(),
1458 OpToFold.OpToFold->getTargetFlags());
1459 }
1460 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1461 return;
1462 }
1463
1464 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1465 if (execMayBeModifiedBeforeUse(*MRI,
1466 UseMI->getOperand(UseOpIdx).getReg(),
1467 *OpToFold.DefMI, *UseMI))
1468 return;
1469
1470 // %vgpr = COPY %sgpr0
1471 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1472 // =>
1473 // %sgpr1 = COPY %sgpr0
1474 UseMI->setDesc(TII->get(AMDGPU::COPY));
1475 UseMI->getOperand(1).setReg(OpToFold.getReg());
1476 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1477 UseMI->getOperand(1).setIsKill(false);
1478 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1479 return;
1480 }
1481 }
1482
1483 const MCInstrDesc &UseDesc = UseMI->getDesc();
1484
1485 // Don't fold into target independent nodes. Target independent opcodes
1486 // don't have defined register classes.
1487 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1488 UseDesc.operands()[UseOpIdx].RegClass == -1)
1489 return;
1490 }
1491
1492 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1493 // to enable more folding opportunities. The shrink operands pass
1494 // already does this.
1495
1496 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
1497}
1498
1499static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1500 uint32_t LHS, uint32_t RHS) {
1501 switch (Opcode) {
1502 case AMDGPU::V_AND_B32_e64:
1503 case AMDGPU::V_AND_B32_e32:
1504 case AMDGPU::S_AND_B32:
1505 Result = LHS & RHS;
1506 return true;
1507 case AMDGPU::V_OR_B32_e64:
1508 case AMDGPU::V_OR_B32_e32:
1509 case AMDGPU::S_OR_B32:
1510 Result = LHS | RHS;
1511 return true;
1512 case AMDGPU::V_XOR_B32_e64:
1513 case AMDGPU::V_XOR_B32_e32:
1514 case AMDGPU::S_XOR_B32:
1515 Result = LHS ^ RHS;
1516 return true;
1517 case AMDGPU::S_XNOR_B32:
1518 Result = ~(LHS ^ RHS);
1519 return true;
1520 case AMDGPU::S_NAND_B32:
1521 Result = ~(LHS & RHS);
1522 return true;
1523 case AMDGPU::S_NOR_B32:
1524 Result = ~(LHS | RHS);
1525 return true;
1526 case AMDGPU::S_ANDN2_B32:
1527 Result = LHS & ~RHS;
1528 return true;
1529 case AMDGPU::S_ORN2_B32:
1530 Result = LHS | ~RHS;
1531 return true;
1532 case AMDGPU::V_LSHL_B32_e64:
1533 case AMDGPU::V_LSHL_B32_e32:
1534 case AMDGPU::S_LSHL_B32:
1535 // The instruction ignores the high bits for out of bounds shifts.
1536 Result = LHS << (RHS & 31);
1537 return true;
1538 case AMDGPU::V_LSHLREV_B32_e64:
1539 case AMDGPU::V_LSHLREV_B32_e32:
1540 Result = RHS << (LHS & 31);
1541 return true;
1542 case AMDGPU::V_LSHR_B32_e64:
1543 case AMDGPU::V_LSHR_B32_e32:
1544 case AMDGPU::S_LSHR_B32:
1545 Result = LHS >> (RHS & 31);
1546 return true;
1547 case AMDGPU::V_LSHRREV_B32_e64:
1548 case AMDGPU::V_LSHRREV_B32_e32:
1549 Result = RHS >> (LHS & 31);
1550 return true;
1551 case AMDGPU::V_ASHR_I32_e64:
1552 case AMDGPU::V_ASHR_I32_e32:
1553 case AMDGPU::S_ASHR_I32:
1554 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1555 return true;
1556 case AMDGPU::V_ASHRREV_I32_e64:
1557 case AMDGPU::V_ASHRREV_I32_e32:
1558 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1559 return true;
1560 default:
1561 return false;
1562 }
1563}
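// tryConstantFoldOp below uses this to rewrite fully-constant operations,
// e.g. "%c = V_LSHLREV_B32_e64 4, 3" becomes "%c = V_MOV_B32_e32 48" (for the
// REV forms the shift amount is the first source).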
1564
1565static unsigned getMovOpc(bool IsScalar) {
1566 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1567}
1568
1569// Try to simplify operations with a constant that may appear after instruction
1570// selection.
1571// TODO: See if a frame index with a fixed offset can fold.
1572bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1573 if (!MI->allImplicitDefsAreDead())
1574 return false;
1575
1576 unsigned Opc = MI->getOpcode();
1577
1578 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1579 if (Src0Idx == -1)
1580 return false;
1581
1582 MachineOperand *Src0 = &MI->getOperand(Src0Idx);
1583 std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
1584
1585 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1586 Opc == AMDGPU::S_NOT_B32) &&
1587 Src0Imm) {
1588 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1589 TII->mutateAndCleanupImplicit(
1590 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1591 return true;
1592 }
1593
1594 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1595 if (Src1Idx == -1)
1596 return false;
1597
1598 MachineOperand *Src1 = &MI->getOperand(Src1Idx);
1599 std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
1600
1601 if (!Src0Imm && !Src1Imm)
1602 return false;
1603
1604 // and k0, k1 -> v_mov_b32 (k0 & k1)
1605 // or k0, k1 -> v_mov_b32 (k0 | k1)
1606 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1607 if (Src0Imm && Src1Imm) {
1608 int32_t NewImm;
1609 if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
1610 return false;
1611
1612 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1613
1614 // Be careful to change the right operand, src0 may belong to a different
1615 // instruction.
1616 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1617 MI->removeOperand(Src1Idx);
1618 TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR)));
1619 return true;
1620 }
1621
1622 if (!MI->isCommutable())
1623 return false;
1624
1625 if (Src0Imm && !Src1Imm) {
1626 std::swap(Src0, Src1);
1627 std::swap(Src0Idx, Src1Idx);
1628 std::swap(Src0Imm, Src1Imm);
1629 }
1630
1631 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
1632 if (Opc == AMDGPU::V_OR_B32_e64 ||
1633 Opc == AMDGPU::V_OR_B32_e32 ||
1634 Opc == AMDGPU::S_OR_B32) {
1635 if (Src1Val == 0) {
1636 // y = or x, 0 => y = copy x
1637 MI->removeOperand(Src1Idx);
1638 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1639 } else if (Src1Val == -1) {
1640 // y = or x, -1 => y = v_mov_b32 -1
1641 MI->removeOperand(Src1Idx);
1642 TII->mutateAndCleanupImplicit(
1643 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1644 } else
1645 return false;
1646
1647 return true;
1648 }
1649
1650 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1651 Opc == AMDGPU::S_AND_B32) {
1652 if (Src1Val == 0) {
1653 // y = and x, 0 => y = v_mov_b32 0
1654 MI->removeOperand(Src0Idx);
1655 TII->mutateAndCleanupImplicit(
1656 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1657 } else if (Src1Val == -1) {
1658 // y = and x, -1 => y = copy x
1659 MI->removeOperand(Src1Idx);
1660 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1661 } else
1662 return false;
1663
1664 return true;
1665 }
1666
1667 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1668 Opc == AMDGPU::S_XOR_B32) {
1669 if (Src1Val == 0) {
1670 // y = xor x, 0 => y = copy x
1671 MI->removeOperand(Src1Idx);
1672 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1673 return true;
1674 }
1675 }
1676
1677 return false;
1678}
1679
1680// Try to fold an instruction into a simpler one
1681bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1682 unsigned Opc = MI.getOpcode();
1683 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1684 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1685 return false;
1686
1687 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1688 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1689 if (!Src1->isIdenticalTo(*Src0)) {
1690 std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
1691 if (!Src1Imm)
1692 return false;
1693
1694 std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
1695 if (!Src0Imm || *Src0Imm != *Src1Imm)
1696 return false;
1697 }
1698
1699 int Src1ModIdx =
1700 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1701 int Src0ModIdx =
1702 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1703 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1704 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1705 return false;
1706
1707 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1708 auto &NewDesc =
1709 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1710 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1711 if (Src2Idx != -1)
1712 MI.removeOperand(Src2Idx);
1713 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1714 if (Src1ModIdx != -1)
1715 MI.removeOperand(Src1ModIdx);
1716 if (Src0ModIdx != -1)
1717 MI.removeOperand(Src0ModIdx);
1718 TII->mutateAndCleanupImplicit(MI, NewDesc);
1719 LLVM_DEBUG(dbgs() << MI);
1720 return true;
1721}
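// Example of the fold above: a select whose two value inputs are identical
// (and carry no source modifiers) does not depend on the condition, so
//   %r = V_CNDMASK_B32_e64 0, %x, 0, %x, %cc
// is replaced by a plain COPY of %x.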
1722
1723bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1724 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1725 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1726 return false;
1727
1728 std::optional<int64_t> Src0Imm =
1729 TII->getImmOrMaterializedImm(MI.getOperand(1));
1730 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1731 return false;
1732
1733 Register Src1 = MI.getOperand(2).getReg();
1734 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1735 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1736 return false;
1737
1738 Register Dst = MI.getOperand(0).getReg();
1739 MRI->replaceRegWith(Dst, Src1);
1740 if (!MI.getOperand(2).isKill())
1741 MRI->clearKillFlags(Src1);
1742 MI.eraseFromParent();
1743 return true;
1744}
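// Illustrative sketch (assumed MIR): if the other operand is defined by an
// opcode known to zero bits 31:16 of its result, masking with 0xffff is a
// no-op and all uses of the AND result are rewritten to use the source:
//   %x = <op that zeroes the high 16 bits> ...
//   %d = V_AND_B32_e64 0xffff, %x            ->  uses of %d become uses of %x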
1745
1746bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1747 const FoldableDef &OpToFold) const {
1748 // We need to mutate the operands of new mov instructions to add implicit
1749 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1750 // this.
1751 SmallVector<MachineInstr *, 4> CopiesToReplace;
1752 SmallVector<FoldCandidate, 4> FoldList;
1753 MachineOperand &Dst = MI.getOperand(0);
1754 bool Changed = false;
1755
1756 if (OpToFold.isImm()) {
1757 for (auto &UseMI :
1758 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1759 // Folding the immediate may reveal operations that can be constant
1760 // folded or replaced with a copy. This can happen for example after
1761 // frame indices are lowered to constants or from splitting 64-bit
1762 // constants.
1763 //
1764 // We may also encounter cases where one or both operands are
1765 // immediates materialized into a register, which would ordinarily not
1766 // be folded due to multiple uses or operand constraints.
1767 if (tryConstantFoldOp(&UseMI)) {
1768 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1769 Changed = true;
1770 }
1771 }
1772 }
1773
1774 SmallVector<MachineOperand *, 4> UsesToProcess(
1775 llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
1776 for (auto *U : UsesToProcess) {
1777 MachineInstr *UseMI = U->getParent();
1778
1779 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1780 foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1781 CopiesToReplace);
1782 }
1783
1784 if (CopiesToReplace.empty() && FoldList.empty())
1785 return Changed;
1786
1787 MachineFunction *MF = MI.getMF();
1788 // Make sure we add EXEC uses to any new v_mov instructions created.
1789 for (MachineInstr *Copy : CopiesToReplace)
1790 Copy->addImplicitDefUseOperands(*MF);
1791
1792 SetVector<MachineInstr *> ConstantFoldCandidates;
1793 for (FoldCandidate &Fold : FoldList) {
1794 assert(!Fold.isReg() || Fold.Def.OpToFold);
1795 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1796 Register Reg = Fold.getReg();
1797 const MachineInstr *DefMI = Fold.Def.DefMI;
1798 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1799 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1800 continue;
1801 }
1802 if (updateOperand(Fold)) {
1803 // Clear kill flags.
1804 if (Fold.isReg()) {
1805 assert(Fold.Def.OpToFold && Fold.isReg());
1806 // FIXME: Probably shouldn't bother trying to fold if not an
1807 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1808 // copies.
1809 MRI->clearKillFlags(Fold.getReg());
1810 }
1811 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1812 << static_cast<int>(Fold.UseOpNo) << " of "
1813 << *Fold.UseMI);
1814
1815 if (Fold.isImm())
1816 ConstantFoldCandidates.insert(Fold.UseMI);
1817
1818 } else if (Fold.Commuted) {
1819 // Restore the instruction's original operand order if the fold failed.
1820 TII->commuteInstruction(*Fold.UseMI, false);
1821 }
1822 }
1823
1824 for (MachineInstr *MI : ConstantFoldCandidates) {
1825 if (tryConstantFoldOp(MI)) {
1826 LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
1827 Changed = true;
1828 }
1829 }
1830 return true;
1831}
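// Illustrative sketch (assumed MIR) of the overall effect: the value defined
// by MI is pushed into its users, which may then constant-fold further:
//   %a = V_MOV_B32_e32 16
//   %b = V_ADD_U32_e32 %a, %c                ->  %b = V_ADD_U32_e32 16, %c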
1832
1833/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
1834/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
1835bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1836 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
1837 // only accept a VGPR or an inline immediate. Recreate a reg_sequence with its
1838 // initializers right here, so we will rematerialize immediates and avoid
1839 // copies via different reg classes.
1840 const TargetRegisterClass *DefRC =
1841 MRI->getRegClass(CopyMI->getOperand(0).getReg());
1842 if (!TRI->isAGPRClass(DefRC))
1843 return false;
1844
1845 Register UseReg = CopyMI->getOperand(1).getReg();
1846 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1847 if (!RegSeq || !RegSeq->isRegSequence())
1848 return false;
1849
1850 const DebugLoc &DL = CopyMI->getDebugLoc();
1851 MachineBasicBlock &MBB = *CopyMI->getParent();
1852
1853 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1854 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1855
1856 const TargetRegisterClass *UseRC =
1857 MRI->getRegClass(CopyMI->getOperand(1).getReg());
1858
1859 // Value, subregister index pairs for the new REG_SEQUENCE
1860 SmallVector<std::pair<MachineOperand *, unsigned>, 32> NewDefs;
1861
1862 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1863 unsigned NumFoldable = 0;
1864
1865 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1866 MachineOperand &RegOp = RegSeq->getOperand(I);
1867 unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
1868
1869 if (RegOp.getSubReg()) {
1870 // TODO: Handle subregister compose
1871 NewDefs.emplace_back(&RegOp, SubRegIdx);
1872 continue;
1873 }
1874
1875 MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
1876 if (!Lookup)
1877 Lookup = &RegOp;
1878
1879 if (Lookup->isImm()) {
1880 // Check if this is an agpr_32 subregister.
1881 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1882 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1883 if (DestSuperRC &&
1884 TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1885 ++NumFoldable;
1886 NewDefs.emplace_back(Lookup, SubRegIdx);
1887 continue;
1888 }
1889 }
1890
1891 const TargetRegisterClass *InputRC =
1892 Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
1893 : MRI->getRegClass(RegOp.getReg());
1894
1895 // TODO: Account for Lookup->getSubReg()
1896
1897 // If we can't find a matching super class, this is an SGPR->AGPR or
1898 // VGPR->AGPR subreg copy (or something constant-like we have to materialize
1899 // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
1900 // want to rewrite it to copy through an intermediate VGPR class.
1901 const TargetRegisterClass *MatchRC =
1902 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1903 if (!MatchRC) {
1904 ++NumFoldable;
1905 NewDefs.emplace_back(&RegOp, SubRegIdx);
1906 continue;
1907 }
1908
1909 NewDefs.emplace_back(&RegOp, SubRegIdx);
1910 }
1911
1912 // Do not clone a reg_sequence and merely change the result register class.
1913 if (NumFoldable == 0)
1914 return false;
1915
1916 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1917 for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
1918 CopyMI->removeOperand(I);
1919
1920 for (auto [Def, DestSubIdx] : NewDefs) {
1921 if (!Def->isReg()) {
1922 // TODO: Should we use single write for each repeated value like in
1923 // register case?
1924 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1925 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1926 .add(*Def);
1927 B.addReg(Tmp);
1928 } else {
1929 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1930 Def->setIsKill(false);
1931
1932 Register &VGPRCopy = VGPRCopies[Src];
1933 if (!VGPRCopy) {
1934 const TargetRegisterClass *VGPRUseSubRC =
1935 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1936
1937 // We cannot build a reg_sequence out of the same registers; they
1938 // must be copied. Better to do it here before copyPhysReg() creates
1939 // several reads to do the AGPR->VGPR->AGPR copy.
1940
1941 // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
1942 // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
1943 // later, create a copy here and track if we already have such a copy.
1944 const TargetRegisterClass *SubRC =
1945 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1946 if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
1947 // TODO: Try to reconstrain class
1948 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1949 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
1950 B.addReg(VGPRCopy);
1951 } else {
1952 // If it is already a VGPR, do not copy the register.
1953 B.add(*Def);
1954 }
1955 } else {
1956 B.addReg(VGPRCopy);
1957 }
1958 }
1959
1960 B.addImm(DestSubIdx);
1961 }
1962
1963 LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
1964 return true;
1965}
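// Illustrative sketch (assumed MIR) of the rewrite above, for an inline
// immediate input:
//   %v:vgpr_32  = V_MOV_B32_e32 0
//   %rs:vreg_64 = REG_SEQUENCE %v, %subreg.sub0, %w, %subreg.sub1
//   %a:areg_64  = COPY %rs
// becomes
//   %t:agpr_32  = V_ACCVGPR_WRITE_B32_e64 0
//   %a:areg_64  = REG_SEQUENCE %t, %subreg.sub0, <copy of %w>, %subreg.sub1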
1966
1967bool SIFoldOperandsImpl::tryFoldFoldableCopy(
1968 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1969 Register DstReg = MI.getOperand(0).getReg();
1970 // Specially track simple redefs of m0 to the same value in a block, so we
1971 // can erase the later ones.
1972 if (DstReg == AMDGPU::M0) {
1973 MachineOperand &NewM0Val = MI.getOperand(1);
1974 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1975 MI.eraseFromParent();
1976 return true;
1977 }
1978
1979 // We aren't tracking other physical registers
1980 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1981 ? nullptr
1982 : &NewM0Val;
1983 return false;
1984 }
1985
1986 MachineOperand *OpToFoldPtr;
1987 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
1988 // Folding when any src_modifiers are non-zero is unsupported
1989 if (TII->hasAnyModifiersSet(MI))
1990 return false;
1991 OpToFoldPtr = &MI.getOperand(2);
1992 } else
1993 OpToFoldPtr = &MI.getOperand(1);
1994 MachineOperand &OpToFold = *OpToFoldPtr;
1995 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1996
1997 // FIXME: We could also be folding things like TargetIndexes.
1998 if (!FoldingImm && !OpToFold.isReg())
1999 return false;
2000
2001 // Fold virtual registers and constant physical registers.
2002 if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
2003 !TRI->isConstantPhysReg(OpToFold.getReg()))
2004 return false;
2005
2006 // Prevent folding operands backwards in the function. For example,
2007 // the COPY below must not be rewritten into a mov of 1:
2008 //
2009 // %3 = COPY %vgpr0; VGPR_32:%3
2010 // ...
2011 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
2012 if (!DstReg.isVirtual())
2013 return false;
2014
2015 const TargetRegisterClass *DstRC =
2016 MRI->getRegClass(MI.getOperand(0).getReg());
2017
2018 // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
2019 // Can remove this code if proper 16-bit SGPRs are implemented
2020 // Example: Pre-peephole-opt
2021 // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
2022 // %32:sreg_32 = COPY %29:sgpr_lo16
2023 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2024 // Post-peephole-opt and DCE
2025 // %32:sreg_32 = COPY %16.lo16:sreg_32
2026 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2027 // After this transform
2028 // %32:sreg_32 = COPY %16:sreg_32
2029 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2030 // After the fold operands pass
2031 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
2032 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2033 OpToFold.getSubReg()) {
2034 if (DstRC == &AMDGPU::SReg_32RegClass &&
2035 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2036 assert(OpToFold.getSubReg() == AMDGPU::lo16);
2037 OpToFold.setSubReg(0);
2038 }
2039 }
2040
2041 // Fold copy to AGPR through reg_sequence
2042 // TODO: Handle with subregister extract
2043 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2044 if (foldCopyToAGPRRegSequence(&MI))
2045 return true;
2046 }
2047
2048 FoldableDef Def(OpToFold, DstRC);
2049 bool Changed = foldInstOperand(MI, Def);
2050
2051 // If we managed to fold all uses of this copy then we might as well
2052 // delete it now.
2053 // The only reason we need to follow chains of copies here is that
2054 // tryFoldRegSequence looks forward through copies before folding a
2055 // REG_SEQUENCE into its eventual users.
2056 auto *InstToErase = &MI;
2057 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2058 auto &SrcOp = InstToErase->getOperand(1);
2059 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
2060 InstToErase->eraseFromParent();
2061 Changed = true;
2062 InstToErase = nullptr;
2063 if (!SrcReg || SrcReg.isPhysical())
2064 break;
2065 InstToErase = MRI->getVRegDef(SrcReg);
2066 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2067 break;
2068 }
2069
2070 if (InstToErase && InstToErase->isRegSequence() &&
2071 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2072 InstToErase->eraseFromParent();
2073 Changed = true;
2074 }
2075
2076 if (Changed)
2077 return true;
2078
2079 // Run this after foldInstOperand to avoid turning scalar additions into
2080 // vector additions when the scalar result could just be folded into
2081 // the user(s).
2082 return OpToFold.isReg() &&
2083 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2084}
2085
2086// Clamp patterns are canonically selected to v_max_* instructions, so only
2087// handle them.
2088const MachineOperand *
2089SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2090 unsigned Op = MI.getOpcode();
2091 switch (Op) {
2092 case AMDGPU::V_MAX_F32_e64:
2093 case AMDGPU::V_MAX_F16_e64:
2094 case AMDGPU::V_MAX_F16_t16_e64:
2095 case AMDGPU::V_MAX_F16_fake16_e64:
2096 case AMDGPU::V_MAX_F64_e64:
2097 case AMDGPU::V_MAX_NUM_F64_e64:
2098 case AMDGPU::V_PK_MAX_F16:
2099 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2100 case AMDGPU::V_PK_MAX_NUM_BF16: {
2101 if (MI.mayRaiseFPException())
2102 return nullptr;
2103
2104 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2105 return nullptr;
2106
2107 // Make sure sources are identical.
2108 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2109 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2110 if (!Src0->isReg() || !Src1->isReg() ||
2111 Src0->getReg() != Src1->getReg() ||
2112 Src0->getSubReg() != Src1->getSubReg() ||
2113 Src0->getSubReg() != AMDGPU::NoSubRegister)
2114 return nullptr;
2115
2116 // Can't fold up if we have modifiers.
2117 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2118 return nullptr;
2119
2120 unsigned Src0Mods
2121 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2122 unsigned Src1Mods
2123 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2124
2125 // Having a 0 op_sel_hi would require swizzling the output in the source
2126 // instruction, which we can't do.
2127 unsigned UnsetMods =
2128 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2129 ? SISrcMods::OP_SEL_1
2130 : 0u;
2131 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2132 return nullptr;
2133 return Src0;
2134 }
2135 default:
2136 return nullptr;
2137 }
2138}
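// Illustrative sketch (assumed MIR) of the canonical clamp pattern matched
// above: a max of a value with itself, with the clamp bit set and no other
// modifiers:
//   %y = V_MAX_F32_e64 0, %x, 0, %x, 1 /*clamp*/, 0 /*omod*/
// isClamp() returns the common source (%x) so tryFoldClamp can try to fold
// the clamp into the instruction defining %x.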
2139
2140// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
2141bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2142 const MachineOperand *ClampSrc = isClamp(MI);
2143 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2144 return false;
2145
2146 if (!ClampSrc->getReg().isVirtual())
2147 return false;
2148
2149 // Look through COPY. COPY only observed with True16.
2150 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2151 MachineInstr *Def =
2152 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2153
2154 // The type of clamp must be compatible.
2155 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
2156 return false;
2157
2158 if (Def->mayRaiseFPException())
2159 return false;
2160
2161 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2162 if (!DefClamp)
2163 return false;
2164
2165 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2166
2167 // Clamp is applied after omod, so it is OK if omod is set.
2168 DefClamp->setImm(1);
2169
2170 Register DefReg = Def->getOperand(0).getReg();
2171 Register MIDstReg = MI.getOperand(0).getReg();
2172 if (TRI->isSGPRReg(*MRI, DefReg)) {
2173 // Pseudo scalar instructions have an SGPR dst, while the clamp is a v_max*
2174 // instruction with a VGPR dst.
2175 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2176 MIDstReg)
2177 .addReg(DefReg);
2178 } else {
2179 MRI->replaceRegWith(MIDstReg, DefReg);
2180 }
2181 MI.eraseFromParent();
2182
2183 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2184 // instruction, so we might as well convert it to the more flexible VOP3-only
2185 // mad/fma form.
2186 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2187 Def->eraseFromParent();
2188
2189 return true;
2190}
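// Illustrative result (assumed MIR): the clamp bit migrates onto the defining
// instruction and the v_max is deleted, with uses of its result rewritten:
//   %x = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
//   %y = V_MAX_F32_e64 0, %x, 0, %x, 1, 0
// becomes (uses of %y now use %x)
//   %x = V_ADD_F32_e64 0, %a, 0, %b, 1 /*clamp*/, 0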
2191
2192static int getOModValue(unsigned Opc, int64_t Val) {
2193 switch (Opc) {
2194 case AMDGPU::V_MUL_F64_e64:
2195 case AMDGPU::V_MUL_F64_pseudo_e64: {
2196 switch (Val) {
2197 case 0x3fe0000000000000: // 0.5
2198 return SIOutMods::DIV2;
2199 case 0x4000000000000000: // 2.0
2200 return SIOutMods::MUL2;
2201 case 0x4010000000000000: // 4.0
2202 return SIOutMods::MUL4;
2203 default:
2204 return SIOutMods::NONE;
2205 }
2206 }
2207 case AMDGPU::V_MUL_F32_e64: {
2208 switch (static_cast<uint32_t>(Val)) {
2209 case 0x3f000000: // 0.5
2210 return SIOutMods::DIV2;
2211 case 0x40000000: // 2.0
2212 return SIOutMods::MUL2;
2213 case 0x40800000: // 4.0
2214 return SIOutMods::MUL4;
2215 default:
2216 return SIOutMods::NONE;
2217 }
2218 }
2219 case AMDGPU::V_MUL_F16_e64:
2220 case AMDGPU::V_MUL_F16_t16_e64:
2221 case AMDGPU::V_MUL_F16_fake16_e64: {
2222 switch (static_cast<uint16_t>(Val)) {
2223 case 0x3800: // 0.5
2224 return SIOutMods::DIV2;
2225 case 0x4000: // 2.0
2226 return SIOutMods::MUL2;
2227 case 0x4400: // 4.0
2228 return SIOutMods::MUL4;
2229 default:
2230 return SIOutMods::NONE;
2231 }
2232 }
2233 default:
2234 llvm_unreachable("invalid mul opcode");
2235 }
2236}
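// Note (added for clarity, not in the original source): the case values above
// are the IEEE-754 bit patterns of 0.5, 2.0 and 4.0 for f64, f32 and f16
// respectively; e.g. 0x40000000 is 2.0f, so multiplying by it corresponds to
// the MUL2 output modifier.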
2237
2238// FIXME: Does this really not support denormals with f16?
2239// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
2240// handled, so will anything other than that break?
2241std::pair<const MachineOperand *, int>
2242SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2243 unsigned Op = MI.getOpcode();
2244 switch (Op) {
2245 case AMDGPU::V_MUL_F64_e64:
2246 case AMDGPU::V_MUL_F64_pseudo_e64:
2247 case AMDGPU::V_MUL_F32_e64:
2248 case AMDGPU::V_MUL_F16_t16_e64:
2249 case AMDGPU::V_MUL_F16_fake16_e64:
2250 case AMDGPU::V_MUL_F16_e64: {
2251 // If output denormals are enabled, omod is ignored.
2252 if ((Op == AMDGPU::V_MUL_F32_e64 &&
2253 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2254 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2255 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2256 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2257 MFI->getMode().FP64FP16Denormals.Output !=
2258 DenormalMode::PreserveSign) ||
2259 MI.mayRaiseFPException())
2260 return std::pair(nullptr, SIOutMods::NONE);
2261
2262 const MachineOperand *RegOp = nullptr;
2263 const MachineOperand *ImmOp = nullptr;
2264 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2265 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2266 if (Src0->isImm()) {
2267 ImmOp = Src0;
2268 RegOp = Src1;
2269 } else if (Src1->isImm()) {
2270 ImmOp = Src1;
2271 RegOp = Src0;
2272 } else
2273 return std::pair(nullptr, SIOutMods::NONE);
2274
2275 int OMod = getOModValue(Op, ImmOp->getImm());
2276 if (OMod == SIOutMods::NONE ||
2277 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2278 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2279 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2280 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2281 return std::pair(nullptr, SIOutMods::NONE);
2282
2283 return std::pair(RegOp, OMod);
2284 }
2285 case AMDGPU::V_ADD_F64_e64:
2286 case AMDGPU::V_ADD_F64_pseudo_e64:
2287 case AMDGPU::V_ADD_F32_e64:
2288 case AMDGPU::V_ADD_F16_e64:
2289 case AMDGPU::V_ADD_F16_t16_e64:
2290 case AMDGPU::V_ADD_F16_fake16_e64: {
2291 // If output denormals are enabled, omod is ignored.
2292 if ((Op == AMDGPU::V_ADD_F32_e64 &&
2293 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2294 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2295 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2296 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2297 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
2298 return std::pair(nullptr, SIOutMods::NONE);
2299
2300 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
2301 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2302 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2303
2304 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
2305 Src0->getSubReg() == Src1->getSubReg() &&
2306 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2307 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2308 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2309 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2310 return std::pair(Src0, SIOutMods::MUL2);
2311
2312 return std::pair(nullptr, SIOutMods::NONE);
2313 }
2314 default:
2315 return std::pair(nullptr, SIOutMods::NONE);
2316 }
2317}
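// Illustrative sketch (assumed MIR) of the two shapes recognized above:
//   %y = V_MUL_F32_e64 0, 2.0, 0, %x, 0, 0   ->  (%x, SIOutMods::MUL2)
//   %y = V_ADD_F32_e64 0, %x, 0, %x, 0, 0    ->  (%x, SIOutMods::MUL2)
// i.e. a multiply by 0.5/2.0/4.0, or an fadd of a value with itself (the
// DAGCombiner form of a multiply by 2.0).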
2318
2319// FIXME: Does this need to check IEEE bit on function?
2320bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2321 const MachineOperand *RegOp;
2322 int OMod;
2323 std::tie(RegOp, OMod) = isOMod(MI);
2324 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2325 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2326 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2327 return false;
2328
2329 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2330 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2331 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2332 return false;
2333
2334 if (Def->mayRaiseFPException())
2335 return false;
2336
2337 // Clamp is applied after omod. If the source already has clamp set, don't
2338 // fold it.
2339 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2340 return false;
2341
2342 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2343
2344 DefOMod->setImm(OMod);
2345 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2346 // Kill flags can be wrong if we replaced a def inside a loop with a def
2347 // outside the loop.
2348 MRI->clearKillFlags(Def->getOperand(0).getReg());
2349 MI.eraseFromParent();
2350
2351 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2352 // instruction, so we might as well convert it to the more flexible VOP3-only
2353 // mad/fma form.
2354 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2355 Def->eraseFromParent();
2356
2357 return true;
2358}
2359
2360// Try to fold a reg_sequence with vgpr output and agpr inputs into an
2361// instruction which can take an agpr. So far that means a store.
2362bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2363 assert(MI.isRegSequence());
2364 auto Reg = MI.getOperand(0).getReg();
2365
2366 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
2367 !MRI->hasOneNonDBGUse(Reg))
2368 return false;
2369
2370 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
2371 if (!getRegSeqInit(Defs, Reg))
2372 return false;
2373
2374 for (auto &[Op, SubIdx] : Defs) {
2375 if (!Op->isReg())
2376 return false;
2377 if (TRI->isAGPR(*MRI, Op->getReg()))
2378 continue;
2379 // Maybe this is a COPY from AREG
2380 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2381 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
2382 return false;
2383 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
2384 return false;
2385 }
2386
2387 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2388 MachineInstr *UseMI = Op->getParent();
2389 while (UseMI->isCopy() && !Op->getSubReg()) {
2390 Reg = UseMI->getOperand(0).getReg();
2391 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2392 return false;
2393 Op = &*MRI->use_nodbg_begin(Reg);
2394 UseMI = Op->getParent();
2395 }
2396
2397 if (Op->getSubReg())
2398 return false;
2399
2400 unsigned OpIdx = Op - &UseMI->getOperand(0);
2401 const MCInstrDesc &InstDesc = UseMI->getDesc();
2402 const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx);
2403 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2404 return false;
2405
2406 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2407 auto Dst = MRI->createVirtualRegister(NewDstRC);
2408 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2409 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2410
2411 for (auto &[Def, SubIdx] : Defs) {
2412 Def->setIsKill(false);
2413 if (TRI->isAGPR(*MRI, Def->getReg())) {
2414 RS.add(*Def);
2415 } else { // This is a copy
2416 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2417 SubDef->getOperand(1).setIsKill(false);
2418 RS.addReg(SubDef->getOperand(1).getReg(), {}, Def->getSubReg());
2419 }
2420 RS.addImm(SubIdx);
2421 }
2422
2423 Op->setReg(Dst);
2424 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
2425 Op->setReg(Reg);
2426 RS->eraseFromParent();
2427 return false;
2428 }
2429
2430 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
2431
2432 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
2433 // in which case we can erase them all later in runOnMachineFunction.
2434 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2435 MI.eraseFromParent();
2436 return true;
2437}
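// Illustrative sketch (assumed MIR): a VGPR reg_sequence built from AGPR
// inputs whose only user can accept an AGPR (e.g. a store) is rebuilt in an
// AGPR class:
//   %v:vreg_64 = REG_SEQUENCE %a0:agpr_32, %subreg.sub0, %a1:agpr_32, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %v, 0, 0
// becomes
//   %a:areg_64 = REG_SEQUENCE %a0, %subreg.sub0, %a1, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %a, 0, 0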
2438
2439/// Checks whether \p Copy is a AGPR -> VGPR copy. Returns `true` on success and
2440/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
2441static bool isAGPRCopy(const SIRegisterInfo &TRI,
2442 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2443 Register &OutReg, unsigned &OutSubReg) {
2444 assert(Copy.isCopy());
2445
2446 const MachineOperand &CopySrc = Copy.getOperand(1);
2447 Register CopySrcReg = CopySrc.getReg();
2448 if (!CopySrcReg.isVirtual())
2449 return false;
2450
2451 // Common case: copy from AGPR directly, e.g.
2452 // %1:vgpr_32 = COPY %0:agpr_32
2453 if (TRI.isAGPR(MRI, CopySrcReg)) {
2454 OutReg = CopySrcReg;
2455 OutSubReg = CopySrc.getSubReg();
2456 return true;
2457 }
2458
2459 // Sometimes it can also involve two copies, e.g.
2460 // %1:vgpr_256 = COPY %0:agpr_256
2461 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2462 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2463 if (!CopySrcDef || !CopySrcDef->isCopy())
2464 return false;
2465
2466 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2467 Register OtherCopySrcReg = OtherCopySrc.getReg();
2468 if (!OtherCopySrcReg.isVirtual() ||
2469 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2470 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2471 !TRI.isAGPR(MRI, OtherCopySrcReg))
2472 return false;
2473
2474 OutReg = OtherCopySrcReg;
2475 OutSubReg = CopySrc.getSubReg();
2476 return true;
2477}
2478
2479// Try to hoist an AGPR to VGPR copy across a PHI.
2480// This should allow folding of an AGPR into a consumer which may support it.
2481//
2482// Example 1: LCSSA PHI
2483// loop:
2484// %1:vreg = COPY %0:areg
2485// exit:
2486// %2:vreg = PHI %1:vreg, %loop
2487// =>
2488// loop:
2489// exit:
2490// %1:areg = PHI %0:areg, %loop
2491// %2:vreg = COPY %1:areg
2492//
2493// Example 2: PHI with multiple incoming values:
2494// entry:
2495// %1:vreg = GLOBAL_LOAD(..)
2496// loop:
2497// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2498// %3:areg = COPY %2:vreg
2499// %4:areg = (instr using %3:areg)
2500// %5:vreg = COPY %4:areg
2501// =>
2502// entry:
2503// %1:vreg = GLOBAL_LOAD(..)
2504// %2:areg = COPY %1:vreg
2505// loop:
2506// %3:areg = PHI %2:areg, %entry, %X:areg,
2507// %4:areg = (instr using %3:areg)
2508bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2509 assert(PHI.isPHI());
2510
2511 Register PhiOut = PHI.getOperand(0).getReg();
2512 if (!TRI->isVGPR(*MRI, PhiOut))
2513 return false;
2514
2515 // Iterate once over all incoming values of the PHI to check if this PHI is
2516 // eligible, and determine the exact AGPR RC we'll target.
2517 const TargetRegisterClass *ARC = nullptr;
2518 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2519 MachineOperand &MO = PHI.getOperand(K);
2520 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2521 if (!Copy || !Copy->isCopy())
2522 continue;
2523
2524 Register AGPRSrc;
2525 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2526 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2527 continue;
2528
2529 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2530 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2531 CopyInRC = SubRC;
2532
2533 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2534 return false;
2535 ARC = CopyInRC;
2536 }
2537
2538 if (!ARC)
2539 return false;
2540
2541 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2542
2543 // Rewrite the PHI's incoming values to ARC.
2544 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2545 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2546 MachineOperand &MO = PHI.getOperand(K);
2547 Register Reg = MO.getReg();
2548
2549 MachineBasicBlock::iterator InsertPt;
2550 MachineBasicBlock *InsertMBB = nullptr;
2551
2552 // Look at the def of Reg, ignoring all copies.
2553 unsigned CopyOpc = AMDGPU::COPY;
2554 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2555
2556 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2557 // the copy was single-use, it will be removed by DCE later.
2558 if (Def->isCopy()) {
2559 Register AGPRSrc;
2560 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2561 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2562 MO.setReg(AGPRSrc);
2563 MO.setSubReg(AGPRSubReg);
2564 continue;
2565 }
2566
2567 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2568 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2569 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2570 // is unlikely to be profitable.
2571 //
2572 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2573 MachineOperand &CopyIn = Def->getOperand(1);
2574 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2575 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2576 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2577 }
2578
2579 InsertMBB = Def->getParent();
2580 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2581 } else {
2582 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2583 InsertPt = InsertMBB->getFirstTerminator();
2584 }
2585
2586 Register NewReg = MRI->createVirtualRegister(ARC);
2587 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2588 TII->get(CopyOpc), NewReg)
2589 .addReg(Reg);
2590 MO.setReg(NewReg);
2591
2592 (void)MI;
2593 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2594 }
2595
2596 // Replace the PHI's result with a new register.
2597 Register NewReg = MRI->createVirtualRegister(ARC);
2598 PHI.getOperand(0).setReg(NewReg);
2599
2600 // COPY that new register back to the original PhiOut register. This COPY will
2601 // usually be folded out later.
2602 MachineBasicBlock *MBB = PHI.getParent();
2603 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2604 TII->get(AMDGPU::COPY), PhiOut)
2605 .addReg(NewReg);
2606
2607 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2608 return true;
2609}
2610
2611// Attempt to convert VGPR load to an AGPR load.
2612bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2613 assert(MI.mayLoad());
2614 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2615 return false;
2616
2617 MachineOperand &Def = MI.getOperand(0);
2618 if (!Def.isDef())
2619 return false;
2620
2621 Register DefReg = Def.getReg();
2622
2623 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2624 return false;
2625
2626 SmallVector<const MachineInstr *, 8> Users(
2627 llvm::make_pointer_range(MRI->use_nodbg_instructions(DefReg)));
2628 SmallVector<Register, 8> MoveRegs;
2629
2630 if (Users.empty())
2631 return false;
2632
2633 // Check that all uses are a copy to an AGPR or a reg_sequence producing an AGPR.
2634 while (!Users.empty()) {
2635 const MachineInstr *I = Users.pop_back_val();
2636 if (!I->isCopy() && !I->isRegSequence())
2637 return false;
2638 Register DstReg = I->getOperand(0).getReg();
2639 // Physical registers may have more than one defining instruction
2640 if (DstReg.isPhysical())
2641 return false;
2642 if (TRI->isAGPR(*MRI, DstReg))
2643 continue;
2644 MoveRegs.push_back(DstReg);
2645 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2646 Users.push_back(&U);
2647 }
2648
2649 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2650 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2651 if (!TII->isOperandLegal(MI, 0, &Def)) {
2652 MRI->setRegClass(DefReg, RC);
2653 return false;
2654 }
2655
2656 while (!MoveRegs.empty()) {
2657 Register Reg = MoveRegs.pop_back_val();
2658 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2659 }
2660
2661 LLVM_DEBUG(dbgs() << "Folded " << MI);
2662
2663 return true;
2664}
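// Illustrative sketch (assumed MIR): for
//   %v:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, 0, 0
//   %a:agpr_32 = COPY %v
// the register class of %v (and of any intermediate copies/reg_sequences) is
// switched to the equivalent AGPR class when the load's data operand allows
// it, so no VGPR->AGPR copy is needed afterwards.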
2665
2666// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2667// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2668// there's cases where it can create a lot more AGPR-AGPR copies, which are
2669// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2670//
2671// This function looks at all AGPR PHIs in a basic block and collects their
2672// operands. Then, it checks for register that are used more than once across
2673// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2674// having to create one VGPR temporary per use, which can get very messy if
2675// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2676// element).
2677//
2678// Example
2679// a:
2680// %in:agpr_256 = COPY %foo:vgpr_256
2681// c:
2682// %x:agpr_32 = ..
2683// b:
2684// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2685// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2686// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2687// =>
2688// a:
2689// %in:agpr_256 = COPY %foo:vgpr_256
2690// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2691// %tmp_agpr:agpr_32 = COPY %tmp
2692// c:
2693// %x:agpr_32 = ..
2694// b:
2695// %0:areg = PHI %tmp_agpr, %a, %x, %c
2696// %1:areg = PHI %tmp_agpr, %a, %y, %c
2697// %2:areg = PHI %tmp_agpr, %a, %z, %c
2698bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2699 // This is only really needed on GFX908 where AGPR-AGPR copies are
2700 // unreasonably difficult.
2701 if (ST->hasGFX90AInsts())
2702 return false;
2703
2704 // Look at all AGPR Phis and collect the register + subregister used.
2705 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2706 RegToMO;
2707
2708 for (auto &MI : MBB) {
2709 if (!MI.isPHI())
2710 break;
2711
2712 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2713 continue;
2714
2715 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2716 MachineOperand &PhiMO = MI.getOperand(K);
2717 if (!PhiMO.getSubReg())
2718 continue;
2719 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2720 }
2721 }
2722
2723 // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2724 // a VGPR.
2725 bool Changed = false;
2726 for (const auto &[Entry, MOs] : RegToMO) {
2727 if (MOs.size() == 1)
2728 continue;
2729
2730 const auto [Reg, SubReg] = Entry;
2731 MachineInstr *Def = MRI->getVRegDef(Reg);
2732 MachineBasicBlock *DefMBB = Def->getParent();
2733
2734 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2735 // out.
2736 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2737 Register TempVGPR =
2738 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2739 MachineInstr *VGPRCopy =
2740 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2741 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2742 .addReg(Reg, /* flags */ {}, SubReg);
2743
2744 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2745 Register TempAGPR = MRI->createVirtualRegister(ARC);
2746 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2747 TII->get(AMDGPU::COPY), TempAGPR)
2748 .addReg(TempVGPR);
2749
2750 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2751 for (MachineOperand *MO : MOs) {
2752 MO->setReg(TempAGPR);
2753 MO->setSubReg(AMDGPU::NoSubRegister);
2754 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2755 }
2756
2757 Changed = true;
2758 }
2759
2760 return Changed;
2761}
2762
2763bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2764 this->MF = &MF;
2765 MRI = &MF.getRegInfo();
2766 ST = &MF.getSubtarget<GCNSubtarget>();
2767 TII = ST->getInstrInfo();
2768 TRI = &TII->getRegisterInfo();
2769 MFI = MF.getInfo<SIMachineFunctionInfo>();
2770
2771 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2772 // correctly handle signed zeros.
2773 //
2774 // FIXME: Also need to check strictfp
2775 bool IsIEEEMode = MFI->getMode().IEEE;
2776 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2777
2778 bool Changed = false;
2779 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2780 MachineOperand *CurrentKnownM0Val = nullptr;
2781 for (auto &MI : make_early_inc_range(*MBB)) {
2782 Changed |= tryFoldCndMask(MI);
2783
2784 if (tryFoldZeroHighBits(MI)) {
2785 Changed = true;
2786 continue;
2787 }
2788
2789 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2790 Changed = true;
2791 continue;
2792 }
2793
2794 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2795 Changed = true;
2796 continue;
2797 }
2798
2799 if (MI.mayLoad() && tryFoldLoad(MI)) {
2800 Changed = true;
2801 continue;
2802 }
2803
2804 if (TII->isFoldableCopy(MI)) {
2805 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2806 continue;
2807 }
2808
2809 // Saw an unknown clobber of m0, so we no longer know what it is.
2810 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2811 CurrentKnownM0Val = nullptr;
2812
2813 // TODO: Omod might be OK if there is NSZ only on the source
2814 // instruction, and not the omod multiply.
2815 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2816 !tryFoldOMod(MI))
2817 Changed |= tryFoldClamp(MI);
2818 }
2819
2820 Changed |= tryOptimizeAGPRPhis(*MBB);
2821 }
2822
2823 return Changed;
2824}
2825
2826 PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
2827 MachineFunctionAnalysisManager &MFAM) {
2828 MFPropsModifier _(*this, MF);
2829
2830 bool Changed = SIFoldOperandsImpl().run(MF);
2831 if (!Changed) {
2832 return PreservedAnalyses::all();
2833 }
2834 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2835 PA.preserveSet<CFGAnalyses>();
2836 return PA;
2837}