LLVM 23.0.0git
SIFoldOperands.cpp
Go to the documentation of this file.
1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
15#include "SIInstrInfo.h"
17#include "SIRegisterInfo.h"
22
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28/// Track a value we may want to fold into downstream users, applying
29/// subregister extracts along the way.
30struct FoldableDef {
31 union {
32 MachineOperand *OpToFold = nullptr;
33 uint64_t ImmToFold;
34 int FrameIndexToFold;
35 };
36
37 /// Register class of the originally defined value.
38 const TargetRegisterClass *DefRC = nullptr;
39
40 /// Track the original defining instruction for the value.
41 const MachineInstr *DefMI = nullptr;
42
43 /// Subregister to apply to the value at the use point.
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46 /// Kind of value stored in the union.
48
49 FoldableDef() = delete;
50 FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
59 assert(FoldOp.isReg() || FoldOp.isGlobal());
60 OpToFold = &FoldOp;
61 }
62
63 DefMI = FoldOp.getParent();
64 }
65
66 FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
70
71 /// Copy the current def and apply \p SubReg to the value.
72 FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
78 bool isReg() const { return Kind == MachineOperand::MO_Register; }
79
80 Register getReg() const {
81 assert(isReg());
82 return OpToFold->getReg();
83 }
84
85 unsigned getSubReg() const {
86 assert(isReg());
87 return OpToFold->getSubReg();
88 }
89
90 bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
91
92 bool isFI() const {
93 return Kind == MachineOperand::MO_FrameIndex;
94 }
95
96 int getFI() const {
97 assert(isFI());
98 return FrameIndexToFold;
99 }
100
101 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
102
103 /// Return the effective immediate value defined by this instruction, after
104 /// application of any subregister extracts which may exist between the use
105 /// and def instruction.
106 std::optional<int64_t> getEffectiveImmVal() const {
107 assert(isImm());
108 return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
109 }
110
111 /// Check if it is legal to fold this effective value into \p MI's \p OpNo
112 /// operand.
113 bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
114 unsigned OpIdx) const {
115 switch (Kind) {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121 // TODO: Should verify the subregister index is supported by the class
122 // TODO: Avoid the temporary MachineOperand
123 MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
129 MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133 // TODO: Try to apply DefSubReg, for global address we can extract
134 // low/high.
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
140 llvm_unreachable("covered MachineOperand kind switch");
141 }
142};
143
144struct FoldCandidate {
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
159 assert(isFI());
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
167 Register getReg() const { return Def.getReg(); }
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
174class SIFoldOperandsImpl {
175public:
176 MachineFunction *MF;
178 const SIInstrInfo *TII;
179 const SIRegisterInfo *TRI;
180 const GCNSubtarget *ST;
181 const SIMachineFunctionInfo *MFI;
182
183 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
184 const FoldableDef &OpToFold) const;
185
186 // TODO: Just use TII::getVALUOp
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
190 if (ST->hasAddNoCarryInsts())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
206 MachineInstr &MI) const;
207
208 bool updateOperand(FoldCandidate &Fold) const;
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213 /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
217 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
218 MachineInstr *MI, unsigned OpNo,
219 const FoldableDef &OpToFold) const;
220 bool isUseSafeToFold(const MachineInstr &MI,
221 const MachineOperand &UseMO) const;
222
223 const TargetRegisterClass *getRegSeqInit(
224 MachineInstr &RegSeq,
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
227 const TargetRegisterClass *
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
229 Register UseReg) const;
230
231 std::pair<int64_t, const TargetRegisterClass *>
232 isRegSeqSplat(MachineInstr &RegSeg) const;
233
234 bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
235 int64_t SplatVal,
236 const TargetRegisterClass *SplatRC) const;
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
240 SmallVectorImpl<FoldCandidate> &FoldList) const;
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
243 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
244
245 bool tryConstantFoldOp(MachineInstr *MI) const;
246 bool tryFoldCndMask(MachineInstr &MI) const;
247 bool tryFoldZeroHighBits(MachineInstr &MI) const;
248 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
249
250 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
251 bool tryFoldFoldableCopy(MachineInstr &MI,
252 MachineOperand *&CurrentKnownM0Val) const;
253
254 const MachineOperand *isClamp(const MachineInstr &MI) const;
255 bool tryFoldClamp(MachineInstr &MI);
256
257 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
258 bool tryFoldOMod(MachineInstr &MI);
259 bool tryFoldRegSequence(MachineInstr &MI);
260 bool tryFoldPhiAGPR(MachineInstr &MI);
261 bool tryFoldLoad(MachineInstr &MI);
262
263 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
264
265public:
266 SIFoldOperandsImpl() = default;
267
268 bool run(MachineFunction &MF);
269};
270
271class SIFoldOperandsLegacy : public MachineFunctionPass {
272public:
273 static char ID;
274
275 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
276
277 bool runOnMachineFunction(MachineFunction &MF) override {
278 if (skipFunction(MF.getFunction()))
279 return false;
280 return SIFoldOperandsImpl().run(MF);
281 }
282
283 StringRef getPassName() const override { return "SI Fold Operands"; }
284
285 void getAnalysisUsage(AnalysisUsage &AU) const override {
286 AU.setPreservesCFG();
288 }
289
290 MachineFunctionProperties getRequiredProperties() const override {
291 return MachineFunctionProperties().setIsSSA();
292 }
293};
294
295} // End anonymous namespace.
296
297INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
298 false)
299
300char SIFoldOperandsLegacy::ID = 0;
301
302char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
303
306 const MachineOperand &MO) {
307 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
308 if (const TargetRegisterClass *SubRC =
309 TRI.getSubRegisterClass(RC, MO.getSubReg()))
310 RC = SubRC;
311 return RC;
312}
313
314// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
315static unsigned macToMad(unsigned Opc) {
316 switch (Opc) {
317 case AMDGPU::V_MAC_F32_e64:
318 return AMDGPU::V_MAD_F32_e64;
319 case AMDGPU::V_MAC_F16_e64:
320 return AMDGPU::V_MAD_F16_e64;
321 case AMDGPU::V_FMAC_F32_e64:
322 return AMDGPU::V_FMA_F32_e64;
323 case AMDGPU::V_FMAC_F16_e64:
324 return AMDGPU::V_FMA_F16_gfx9_e64;
325 case AMDGPU::V_FMAC_F16_t16_e64:
326 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
327 case AMDGPU::V_FMAC_F16_fake16_e64:
328 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
329 case AMDGPU::V_FMAC_LEGACY_F32_e64:
330 return AMDGPU::V_FMA_LEGACY_F32_e64;
331 case AMDGPU::V_FMAC_F64_e64:
332 return AMDGPU::V_FMA_F64_e64;
333 }
334 return AMDGPU::INSTRUCTION_LIST_END;
335}
336
337// TODO: Add heuristic that the frame index might not fit in the addressing mode
338// immediate offset to avoid materializing in loops.
339bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
340 const FoldableDef &OpToFold) const {
341 if (!OpToFold.isFI())
342 return false;
343
344 const unsigned Opc = UseMI.getOpcode();
345 switch (Opc) {
346 case AMDGPU::S_ADD_I32:
347 case AMDGPU::S_ADD_U32:
348 case AMDGPU::V_ADD_U32_e32:
349 case AMDGPU::V_ADD_CO_U32_e32:
350 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
351 // to insert the wave size shift at every point we use the index.
352 // TODO: Fix depending on visit order to fold immediates into the operand
353 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
354 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
355 case AMDGPU::V_ADD_U32_e64:
356 case AMDGPU::V_ADD_CO_U32_e64:
357 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
358 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
359 default:
360 break;
361 }
362
363 if (TII->isMUBUF(UseMI))
364 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
365 if (!TII->isFLATScratch(UseMI))
366 return false;
367
368 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
369 if (OpNo == SIdx)
370 return true;
371
372 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
373 return OpNo == VIdx && SIdx == -1;
374}
375
376/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
377///
378/// => %vgpr = V_ADD_U32 x, frameindex
379bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
380 Register DstReg, Register SrcReg, MachineInstr &MI) const {
381 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
382 MRI->hasOneNonDBGUse(SrcReg)) {
383 MachineInstr *Def = MRI->getVRegDef(SrcReg);
384 if (!Def || Def->getNumOperands() != 4)
385 return false;
386
387 MachineOperand *Src0 = &Def->getOperand(1);
388 MachineOperand *Src1 = &Def->getOperand(2);
389
390 // TODO: This is profitable with more operand types, and for more
391 // opcodes. But ultimately this is working around poor / nonexistent
392 // regbankselect.
393 if (!Src0->isFI() && !Src1->isFI())
394 return false;
395
396 if (Src0->isFI())
397 std::swap(Src0, Src1);
398
399 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
400 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
401 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
402 !Def->getOperand(3).isDead()) // Check if scc is dead
403 return false;
404
405 MachineBasicBlock *MBB = Def->getParent();
406 const DebugLoc &DL = Def->getDebugLoc();
407 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
408 MachineInstrBuilder Add =
409 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
410
411 if (Add->getDesc().getNumDefs() == 2) {
412 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
413 Add.addDef(CarryOutReg, RegState::Dead);
414 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
415 }
416
417 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
418 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
419 Add.addImm(0);
420
421 Def->eraseFromParent();
422 MI.eraseFromParent();
423 return true;
424 }
425
426 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
427
429 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
430 if (Liveness == MachineBasicBlock::LQR_Dead) {
431 // TODO: If src1 satisfies operand constraints, use vop3 version.
432 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
433 .add(*Src0)
434 .add(*Src1)
435 .setOperandDead(3) // implicit-def $vcc
436 .setMIFlags(Def->getFlags());
437 Def->eraseFromParent();
438 MI.eraseFromParent();
439 return true;
440 }
441 }
442
443 return false;
444}
445
447 return new SIFoldOperandsLegacy();
448}
449
450bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
451 unsigned UseOpNo,
452 int64_t ImmVal) const {
453 const uint64_t TSFlags = MI->getDesc().TSFlags;
454
455 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
456 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
457 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
458 return false;
459
460 const MachineOperand &Old = MI->getOperand(UseOpNo);
461 int OpNo = MI->getOperandNo(&Old);
462
463 unsigned Opcode = MI->getOpcode();
464 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
465 switch (OpType) {
466 default:
467 return false;
475 // VOP3 packed instructions ignore op_sel source modifiers, we cannot encode
476 // two different constants.
477 if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
478 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
479 return false;
480 break;
481 }
482
483 return true;
484}
485
486bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
487 int64_t ImmVal) const {
488 MachineOperand &Old = MI->getOperand(UseOpNo);
489 unsigned Opcode = MI->getOpcode();
490 int OpNo = MI->getOperandNo(&Old);
491 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
492
493 // If the literal can be inlined as-is, apply it and short-circuit the
494 // tests below. The main motivation for this is to avoid unintuitive
495 // uses of opsel.
496 if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
497 Old.ChangeToImmediate(ImmVal);
498 return true;
499 }
500
501 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
502 // op_sel in a way that allows an inline constant.
503 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
504 unsigned SrcIdx = ~0;
505 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
506 ModName = AMDGPU::OpName::src0_modifiers;
507 SrcIdx = 0;
508 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
509 ModName = AMDGPU::OpName::src1_modifiers;
510 SrcIdx = 1;
511 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
512 ModName = AMDGPU::OpName::src2_modifiers;
513 SrcIdx = 2;
514 }
515 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
516 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
517 MachineOperand &Mod = MI->getOperand(ModIdx);
518 unsigned ModVal = Mod.getImm();
519
520 uint16_t ImmLo =
521 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
522 uint16_t ImmHi =
523 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
524 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
525 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
526
527 // Helper function that attempts to inline the given value with a newly
528 // chosen opsel pattern.
529 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
530 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
531 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
532 Old.ChangeToImmediate(Imm);
533 return true;
534 }
535
536 // Try to shuffle the halves around and leverage opsel to get an inline
537 // constant.
538 uint16_t Lo = static_cast<uint16_t>(Imm);
539 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
540 if (Lo == Hi) {
541 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
542 Mod.setImm(NewModVal);
544 return true;
545 }
546
547 if (static_cast<int16_t>(Lo) < 0) {
548 int32_t SExt = static_cast<int16_t>(Lo);
549 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
550 Mod.setImm(NewModVal);
551 Old.ChangeToImmediate(SExt);
552 return true;
553 }
554 }
555
556 // This check is only useful for integer instructions
557 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
558 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
559 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
560 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
561 return true;
562 }
563 }
564 } else {
565 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
566 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
567 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
568 Old.ChangeToImmediate(Swapped);
569 return true;
570 }
571 }
572
573 return false;
574 };
575
576 if (tryFoldToInline(Imm))
577 return true;
578
579 // Replace integer addition by subtraction and vice versa if it allows
580 // folding the immediate to an inline constant.
581 //
582 // We should only ever get here for SrcIdx == 1 due to canonicalization
583 // earlier in the pipeline, but we double-check here to be safe / fully
584 // general.
585 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
586 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
587 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
588 unsigned ClampIdx =
589 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
590 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
591
592 if (!Clamp) {
593 uint16_t NegLo = -static_cast<uint16_t>(Imm);
594 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
595 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
596
597 if (tryFoldToInline(NegImm)) {
598 unsigned NegOpcode =
599 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
600 MI->setDesc(TII->get(NegOpcode));
601 return true;
602 }
603 }
604 }
605
606 return false;
607}
608
609bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
610 MachineInstr *MI = Fold.UseMI;
611 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
612 assert(Old.isReg());
613
614 std::optional<int64_t> ImmVal;
615 if (Fold.isImm())
616 ImmVal = Fold.Def.getEffectiveImmVal();
617
618 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
619 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
620 return true;
621
622 // We can't represent the candidate as an inline constant. Try as a literal
623 // with the original opsel, checking constant bus limitations.
624 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
625 int OpNo = MI->getOperandNo(&Old);
626 if (!TII->isOperandLegal(*MI, OpNo, &New))
627 return false;
628 Old.ChangeToImmediate(*ImmVal);
629 return true;
630 }
631
632 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
633 MachineBasicBlock *MBB = MI->getParent();
634 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
635 if (Liveness != MachineBasicBlock::LQR_Dead) {
636 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
637 return false;
638 }
639
640 int Op32 = Fold.ShrinkOpcode;
641 MachineOperand &Dst0 = MI->getOperand(0);
642 MachineOperand &Dst1 = MI->getOperand(1);
643 assert(Dst0.isDef() && Dst1.isDef());
644
645 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
646
647 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
648 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
649
650 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
651
652 if (HaveNonDbgCarryUse) {
653 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
654 Dst1.getReg())
655 .addReg(AMDGPU::VCC, RegState::Kill);
656 }
657
658 // Keep the old instruction around to avoid breaking iterators, but
659 // replace it with a dummy instruction to remove uses.
660 //
661 // FIXME: We should not invert how this pass looks at operands to avoid
662 // this. Should track set of foldable movs instead of looking for uses
663 // when looking at a use.
664 Dst0.setReg(NewReg0);
665 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
666 MI->removeOperand(I);
667 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
668
669 if (Fold.Commuted)
670 TII->commuteInstruction(*Inst32, false);
671 return true;
672 }
673
674 assert(!Fold.needsShrink() && "not handled");
675
676 if (ImmVal) {
677 if (Old.isTied()) {
678 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
679 if (NewMFMAOpc == -1)
680 return false;
681 MI->setDesc(TII->get(NewMFMAOpc));
682 MI->untieRegOperand(0);
683 const MCInstrDesc &MCID = MI->getDesc();
684 for (unsigned I = 0; I < MI->getNumDefs(); ++I)
686 MI->getOperand(I).setIsEarlyClobber(true);
687 }
688
689 // TODO: Should we try to avoid adding this to the candidate list?
690 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
691 int OpNo = MI->getOperandNo(&Old);
692 if (!TII->isOperandLegal(*MI, OpNo, &New))
693 return false;
694
695 Old.ChangeToImmediate(*ImmVal);
696 return true;
697 }
698
699 if (Fold.isGlobal()) {
700 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
701 Fold.Def.OpToFold->getOffset(),
702 Fold.Def.OpToFold->getTargetFlags());
703 return true;
704 }
705
706 if (Fold.isFI()) {
707 Old.ChangeToFrameIndex(Fold.getFI());
708 return true;
709 }
710
711 MachineOperand *New = Fold.Def.OpToFold;
712
713 // Verify the register is compatible with the operand.
714 if (const TargetRegisterClass *OpRC =
715 TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) {
716 const TargetRegisterClass *NewRC =
717 TRI->getRegClassForReg(*MRI, New->getReg());
718
719 const TargetRegisterClass *ConstrainRC = OpRC;
720 if (New->getSubReg()) {
721 ConstrainRC =
722 TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
723
724 if (!ConstrainRC)
725 return false;
726 }
727
728 if (New->getReg().isVirtual() &&
729 !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
730 LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
731 << TRI->getRegClassName(ConstrainRC) << '\n');
732 return false;
733 }
734 }
735
736 // Rework once the VS_16 register class is updated to include proper
737 // 16-bit SGPRs instead of 32-bit ones.
738 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
739 Old.setSubReg(AMDGPU::NoSubRegister);
740 if (New->getReg().isPhysical()) {
741 Old.substPhysReg(New->getReg(), *TRI);
742 } else {
743 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
744 Old.setIsUndef(New->isUndef());
745 }
746 return true;
747}
748
750 FoldCandidate &&Entry) {
751 // Skip additional folding on the same operand.
752 for (FoldCandidate &Fold : FoldList)
753 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
754 return;
755 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
756 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
757 FoldList.push_back(Entry);
758}
759
761 MachineInstr *MI, unsigned OpNo,
762 const FoldableDef &FoldOp,
763 bool Commuted = false, int ShrinkOp = -1) {
764 appendFoldCandidate(FoldList,
765 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
766}
767
768// Returns true if the instruction is a packed F32 instruction and the
769// corresponding scalar operand reads 32 bits and replicates the bits to both
770// channels.
772 const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) {
773 if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())
774 return false;
775 const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo];
777}
778
779// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
780// literal) and replicates the bits to both channels. Therefore, if the hi and
781// lo are not same, we can't fold it.
783 const FoldableDef &OpToFold) {
784 assert(OpToFold.isImm() && "Expected immediate operand");
785 uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
786 uint32_t Lo = Lo_32(ImmVal);
787 uint32_t Hi = Hi_32(ImmVal);
788 return Lo == Hi;
789}
790
791bool SIFoldOperandsImpl::tryAddToFoldList(
792 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
793 const FoldableDef &OpToFold) const {
794 const unsigned Opc = MI->getOpcode();
795
796 auto tryToFoldAsFMAAKorMK = [&]() {
797 if (!OpToFold.isImm())
798 return false;
799
800 const bool TryAK = OpNo == 3;
801 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
802 MI->setDesc(TII->get(NewOpc));
803
804 // We have to fold into operand which would be Imm not into OpNo.
805 bool FoldAsFMAAKorMK =
806 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
807 if (FoldAsFMAAKorMK) {
808 // Untie Src2 of fmac.
809 MI->untieRegOperand(3);
810 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
811 if (OpNo == 1) {
812 MachineOperand &Op1 = MI->getOperand(1);
813 MachineOperand &Op2 = MI->getOperand(2);
814 Register OldReg = Op1.getReg();
815 // Operand 2 might be an inlinable constant
816 if (Op2.isImm()) {
817 Op1.ChangeToImmediate(Op2.getImm());
818 Op2.ChangeToRegister(OldReg, false);
819 } else {
820 Op1.setReg(Op2.getReg());
821 Op2.setReg(OldReg);
822 }
823 }
824 return true;
825 }
826 MI->setDesc(TII->get(Opc));
827 return false;
828 };
829
830 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
831 if (!IsLegal && OpToFold.isImm()) {
832 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
833 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
834 }
835
836 if (!IsLegal) {
837 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
838 unsigned NewOpc = macToMad(Opc);
839 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
840 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
841 // to fold the operand.
842 MI->setDesc(TII->get(NewOpc));
843 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
844 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
845 if (AddOpSel)
846 MI->addOperand(MachineOperand::CreateImm(0));
847 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
848 if (FoldAsMAD) {
849 MI->untieRegOperand(OpNo);
850 return true;
851 }
852 if (AddOpSel)
853 MI->removeOperand(MI->getNumExplicitOperands() - 1);
854 MI->setDesc(TII->get(Opc));
855 }
856
857 // Special case for s_fmac_f32 if we are trying to fold into Src2.
858 // By transforming into fmaak we can untie Src2 and make folding legal.
859 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
860 if (tryToFoldAsFMAAKorMK())
861 return true;
862 }
863
864 // Special case for s_setreg_b32
865 if (OpToFold.isImm()) {
866 unsigned ImmOpc = 0;
867 if (Opc == AMDGPU::S_SETREG_B32)
868 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
869 else if (Opc == AMDGPU::S_SETREG_B32_mode)
870 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
871 if (ImmOpc) {
872 MI->setDesc(TII->get(ImmOpc));
873 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
874 return true;
875 }
876 }
877
878 // Operand is not legal, so try to commute the instruction to
879 // see if this makes it possible to fold.
880 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
881 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
882 if (!CanCommute)
883 return false;
884
885 MachineOperand &Op = MI->getOperand(OpNo);
886 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
887
888 // One of operands might be an Imm operand, and OpNo may refer to it after
889 // the call of commuteInstruction() below. Such situations are avoided
890 // here explicitly as OpNo must be a register operand to be a candidate
891 // for memory folding.
892 if (!Op.isReg() || !CommutedOp.isReg())
893 return false;
894
895 // The same situation with an immediate could reproduce if both inputs are
896 // the same register.
897 if (Op.isReg() && CommutedOp.isReg() &&
898 (Op.getReg() == CommutedOp.getReg() &&
899 Op.getSubReg() == CommutedOp.getSubReg()))
900 return false;
901
902 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
903 return false;
904
905 int Op32 = -1;
906 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
907 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
908 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
909 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
910 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
911 return false;
912 }
913
914 // Verify the other operand is a VGPR, otherwise we would violate the
915 // constant bus restriction.
916 MachineOperand &OtherOp = MI->getOperand(OpNo);
917 if (!OtherOp.isReg() ||
918 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
919 return false;
920
921 assert(MI->getOperand(1).isDef());
922
923 // Make sure to get the 32-bit version of the commuted opcode.
924 unsigned MaybeCommutedOpc = MI->getOpcode();
925 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
926 }
927
928 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
929 Op32);
930 return true;
931 }
932
933 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
934 // By changing into fmamk we can untie Src2.
935 // If folding for Src0 happens first and it is identical operand to Src1 we
936 // should avoid transforming into fmamk which requires commuting as it would
937 // cause folding into Src1 to fail later on due to wrong OpNo used.
938 if (Opc == AMDGPU::S_FMAC_F32 &&
939 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
940 if (tryToFoldAsFMAAKorMK())
941 return true;
942 }
943
944 // Special case for PK_F32 instructions if we are trying to fold an imm to
945 // src0 or src1.
946 if (OpToFold.isImm() &&
949 return false;
950
951 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
952 return true;
953}
954
955bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
956 const MachineOperand &UseMO) const {
957 // Operands of SDWA instructions must be registers.
958 return !TII->isSDWA(MI);
959}
960
962 const MachineRegisterInfo &MRI,
963 Register SrcReg) {
964 MachineOperand *Sub = nullptr;
965 for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
966 SubDef && TII.isFoldableCopy(*SubDef);
967 SubDef = MRI.getVRegDef(Sub->getReg())) {
968 unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
969 MachineOperand &SrcOp = SubDef->getOperand(SrcIdx);
970
971 if (SrcOp.isImm())
972 return &SrcOp;
973 if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
974 break;
975 Sub = &SrcOp;
976 // TODO: Support compose
977 if (SrcOp.getSubReg())
978 break;
979 }
980
981 return Sub;
982}
983
984const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
985 MachineInstr &RegSeq,
986 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
987
988 assert(RegSeq.isRegSequence());
989
990 const TargetRegisterClass *RC = nullptr;
991
992 for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
993 MachineOperand &SrcOp = RegSeq.getOperand(I);
994 unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
995
996 // Only accept reg_sequence with uniform reg class inputs for simplicity.
997 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
998 if (!RC)
999 RC = OpRC;
1000 else if (!TRI->getCommonSubClass(RC, OpRC))
1001 return nullptr;
1002
1003 if (SrcOp.getSubReg()) {
1004 // TODO: Handle subregister compose
1005 Defs.emplace_back(&SrcOp, SubRegIdx);
1006 continue;
1007 }
1008
1009 MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
1010 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
1011 Defs.emplace_back(DefSrc, SubRegIdx);
1012 continue;
1013 }
1014
1015 Defs.emplace_back(&SrcOp, SubRegIdx);
1016 }
1017
1018 return RC;
1019}
1020
1021// Find a def of the UseReg, check if it is a reg_sequence and find initializers
1022// for each subreg, tracking it to an immediate if possible. Returns the
1023// register class of the inputs on success.
1024const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
1025 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
1026 Register UseReg) const {
1027 MachineInstr *Def = MRI->getVRegDef(UseReg);
1028 if (!Def || !Def->isRegSequence())
1029 return nullptr;
1030
1031 return getRegSeqInit(*Def, Defs);
1032}
1033
1034std::pair<int64_t, const TargetRegisterClass *>
1035SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
1037 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
1038 if (!SrcRC)
1039 return {};
1040
1041 bool TryToMatchSplat64 = false;
1042
1043 int64_t Imm;
1044 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
1045 const MachineOperand *Op = Defs[I].first;
1046 if (!Op->isImm())
1047 return {};
1048
1049 int64_t SubImm = Op->getImm();
1050 if (!I) {
1051 Imm = SubImm;
1052 continue;
1053 }
1054
1055 if (Imm != SubImm) {
1056 if (I == 1 && (E & 1) == 0) {
1057 // If we have an even number of inputs, there's a chance this is a
1058 // 64-bit element splat broken into 32-bit pieces.
1059 TryToMatchSplat64 = true;
1060 break;
1061 }
1062
1063 return {}; // Can only fold splat constants
1064 }
1065 }
1066
1067 if (!TryToMatchSplat64)
1068 return {Defs[0].first->getImm(), SrcRC};
1069
1070 // Fallback to recognizing 64-bit splats broken into 32-bit pieces
1071 // (i.e. recognize every other other element is 0 for 64-bit immediates)
1072 int64_t SplatVal64;
1073 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
1074 const MachineOperand *Op0 = Defs[I].first;
1075 const MachineOperand *Op1 = Defs[I + 1].first;
1076
1077 if (!Op0->isImm() || !Op1->isImm())
1078 return {};
1079
1080 unsigned SubReg0 = Defs[I].second;
1081 unsigned SubReg1 = Defs[I + 1].second;
1082
1083 // Assume we're going to generally encounter reg_sequences with sorted
1084 // subreg indexes, so reject any that aren't consecutive.
1085 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
1086 TRI->getChannelFromSubReg(SubReg1))
1087 return {};
1088
1089 int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
1090 if (I == 0)
1091 SplatVal64 = MergedVal;
1092 else if (SplatVal64 != MergedVal)
1093 return {};
1094 }
1095
1096 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
1097 MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);
1098
1099 return {SplatVal64, RC64};
1100}
1101
/// Check whether the splat constant \p SplatVal, whose elements have register
/// class \p SplatRC, could legally replace operand \p UseOpIdx of \p UseMI.
///
/// Nothing is modified here; the function only answers the legality question.
/// \returns true if the immediate is legal for that operand.
1102bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1103 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
1104 const TargetRegisterClass *SplatRC) const {
1105 const MCInstrDesc &Desc = UseMI->getDesc();
// Operands beyond the ones described by the instruction description are
// rejected.
1106 if (UseOpIdx >= Desc.getNumOperands())
1107 return false;
1108
1109 // Filter out unhandled pseudos.
1110 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1111 return false;
1112
1113 int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);
// -1 means the operand has no register class to compare against.
1114 if (RCID == -1)
1115 return false;
1116
1117 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
1118
1119 // Special case 0/-1, since when interpreted as a 64-bit element both halves
1120 // have the same bits. These are the only cases where a splat has the same
1121 // interpretation for 32-bit and 64-bit splats.
1122 if (SplatVal != 0 && SplatVal != -1) {
1123 // We need to figure out the scalar type read by the operand. e.g. the MFMA
1124 // operand will be AReg_128, and we want to check if it's compatible with an
1125 // AReg_32 constant.
1126 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
// NOTE(review): the case labels of this switch are not present in this
// listing. The first branch narrows OpRC to its sub0 class, the second to its
// sub0_sub1 class; presumably these are the 32-bit and 64-bit operand types
// respectively -- confirm against the upstream file.
1127 switch (OpTy) {
1132 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1133 break;
1137 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1138 break;
1139 default:
1140 return false;
1141 }
1142
// The element class read by the operand must be compatible with the class of
// the splat element.
1143 if (!TRI->getCommonSubClass(OpRC, SplatRC))
1144 return false;
1145 }
1146
// Finally ask the target whether an immediate with this value is legal in the
// operand slot.
1147 MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
1148 if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
1149 return false;
1150
1151 return true;
1152}
1153
/// Try to fold the immediate \p OpToFold directly into source operand
/// \p UseOpIdx of \p UseMI.
///
/// On success a fold candidate is appended to \p FoldList and true is
/// returned. Returns false if \p OpToFold is not an immediate, the operand is
/// not a handled source operand, or the immediate is not legal there.
1154bool SIFoldOperandsImpl::tryToFoldACImm(
1155 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
1156 SmallVectorImpl<FoldCandidate> &FoldList) const {
1157 const MCInstrDesc &Desc = UseMI->getDesc();
// Operands beyond the ones described by the instruction description are
// rejected.
1158 if (UseOpIdx >= Desc.getNumOperands())
1159 return false;
1160
1161 // Filter out unhandled pseudos.
1162 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1163 return false;
1164
1165 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
// NOTE(review): the condition guarding the following early return is not
// present in this listing (two lines are missing); as shown, the append below
// would be unreachable. Confirm against the upstream file.
1168 return false;
1169 appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
1170 return true;
1171 }
1172
1173 return false;
1174}
1175
/// Try to fold \p OpToFold into operand \p UseOpIdx of \p UseMI.
///
/// Plain operand replacements are only queued: they are appended to
/// \p FoldList (through tryToFoldACImm, appendFoldCandidate or
/// tryAddToFoldList). Several cases mutate \p UseMI immediately instead:
///   - REG_SEQUENCE users are looked through and their users are processed
///     (recursively);
///   - frame indexes are written straight into the memory instruction;
///   - a COPY of an immediate-like value is turned into a matching MOV;
///   - a COPY of a register has its source rewritten;
///   - V_READFIRSTLANE_B32 / V_READLANE_B32 of a constant or SGPR becomes an
///     S_MOV_B32 or COPY.
/// Copies rewritten here are pushed to \p CopiesToReplace for the caller.
///
/// NOTE(review): several lines of this function are missing from this listing
/// (the gaps are visible in the embedded line numbers); the comments below
/// only describe what is visible.
1176void SIFoldOperandsImpl::foldOperand(
1177 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
1178 SmallVectorImpl<FoldCandidate> &FoldList,
1179 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
1180 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
1181
1182 if (!isUseSafeToFold(*UseMI, *UseOp))
1183 return;
1184
1185 // FIXME: Fold operands with subregs.
1186 if (UseOp->isReg() && OpToFold.isReg()) {
1187 if (UseOp->isImplicit())
1188 return;
1189 // Allow folding from SGPRs to 16-bit VGPRs.
1190 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
1191 (UseOp->getSubReg() != AMDGPU::lo16 ||
1192 !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1193 return;
1194 }
1195
1196 // Special case for REG_SEQUENCE: We can't fold literals into
1197 // REG_SEQUENCE instructions, so we have to fold them into the
1198 // uses of REG_SEQUENCE.
1199 if (UseMI->isRegSequence()) {
1200 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
// The subregister index of an input is the operand right after it.
1201 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
1202
1203 int64_t SplatVal;
1204 const TargetRegisterClass *SplatRC;
// SplatRC is null when the reg_sequence is not a constant splat.
1205 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
1206
1207 // Grab the use operands first
// NOTE(review): the declaration of UsesToProcess is missing from this listing.
1209 llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
// Index-based loop: the list may grow while it is being walked (see the
// append_range below).
1210 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1211 MachineOperand *RSUse = UsesToProcess[I];
1212 MachineInstr *RSUseMI = RSUse->getParent();
1213 unsigned OpNo = RSUseMI->getOperandNo(RSUse);
1214
1215 if (SplatRC) {
1216 if (RSUseMI->isCopy()) {
1217 Register DstReg = RSUseMI->getOperand(0).getReg();
// NOTE(review): the second argument of this append_range call is missing from
// this listing; presumably it queues the uses of DstReg -- confirm.
1218 append_range(UsesToProcess,
1220 continue;
1221 }
1222 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1223 FoldableDef SplatDef(SplatVal, SplatRC);
1224 appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
1225 continue;
1226 }
1227 }
1228
1229 // TODO: Handle general compose
1230 if (RSUse->getSubReg() != RegSeqDstSubReg)
1231 continue;
1232
1233 // FIXME: We should avoid recursing here. There should be a cleaner split
1234 // between the in-place mutations and adding to the fold list.
1235 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1236 CopiesToReplace);
1237 }
1238
1239 return;
1240 }
1241
1242 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
1243 return;
1244
1245 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
1246 // Verify that this is a stack access.
1247 // FIXME: Should probably use stack pseudos before frame lowering.
1248
1249 if (TII->isMUBUF(*UseMI)) {
1250 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1251 MFI->getScratchRSrcReg())
1252 return;
1253
1254 // Ensure this is either relative to the current frame or the current
1255 // wave.
1256 MachineOperand &SOff =
1257 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
1258 if (!SOff.isImm() || SOff.getImm() != 0)
1259 return;
1260 }
1261
1262 const unsigned Opc = UseMI->getOpcode();
// A flat-scratch access that has a vaddr operand but no saddr operand is
// switched to the opcode returned by getFlatScratchInstSSfromSV.
1263 if (TII->isFLATScratch(*UseMI) &&
1264 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
1265 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
1266 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
1267 unsigned CPol =
1268 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
// NOTE(review): the second half of this condition is missing from this
// listing.
1269 if ((CPol & AMDGPU::CPol::SCAL) &&
1271 return;
1272
1273 UseMI->setDesc(TII->get(NewOpc));
1274 }
1275
1276 // A frame index will resolve to a positive constant, so it should always be
1277 // safe to fold the addressing mode, even pre-GFX9.
1278 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
1279
1280 return;
1281 }
1282
1283 bool FoldingImmLike =
1284 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1285
1286 if (FoldingImmLike && UseMI->isCopy()) {
1287 Register DestReg = UseMI->getOperand(0).getReg();
1288 Register SrcReg = UseMI->getOperand(1).getReg();
1289 unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
1290 assert(SrcReg.isVirtual());
1291
1292 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
1293
1294 // Don't fold into a copy to a physical register with the same class. Doing
1295 // so would interfere with the register coalescer's logic which would avoid
1296 // redundant initializations.
1297 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
1298 return;
1299
1300 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1301 // In order to fold immediates into copies, we need to change the copy to a
1302 // MOV. Find a compatible mov instruction with the value.
// Candidates are tried in the listed order; the first one whose destination
// class fits decides the outcome (the loop body ends in a break).
1303 for (unsigned MovOp :
1304 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1305 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1306 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1307 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1308 const MCInstrDesc &MovDesc = TII->get(MovOp);
1309 const TargetRegisterClass *MovDstRC =
1310 TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0]));
1311
1312 // Fold if the destination register class of the MOV instruction (MovDstRC)
1313 // is a superclass of (or equal to) the destination register class of the
1314 // COPY (DestRC). If this condition fails, folding would be illegal.
1315 if (!DestRC->hasSuperClassEq(MovDstRC))
1316 continue;
1317
// V_MOV_B16_t16_e64 has a modifiers operand before its source, so the source
// sits at index 2 there (see the operand rebuild further down).
1318 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1319
1320 int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]);
1321 if (RegClassID != -1) {
1322 const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID);
1323
1324 if (UseSubReg)
1325 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1326
1327 // FIXME: We should be able to directly check immediate operand legality
1328 // for all cases, but gfx908 hacks break.
1329 if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
1330 (!OpToFold.isImm() ||
1331 !TII->isImmOperandLegal(MovDesc, SrcIdx,
1332 *OpToFold.getEffectiveImmVal())))
1333 break;
1334
1335 if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1336 break;
1337
1338 // FIXME: This is mutating the instruction only and deferring the actual
1339 // fold of the immediate
1340 } else {
1341 // For the _IMM_PSEUDO cases, there can be value restrictions on the
1342 // immediate to verify. Technically we should always verify this, but it
1343 // only matters for these concrete cases.
1344 // TODO: Handle non-imm case if it's useful.
1345 if (!OpToFold.isImm() ||
1346 !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1347 break;
1348 }
1349
// NOTE(review): the declarations of ImpOpI / ImpOpE and the statement inside
// this loop that uses Tmp are missing from this listing; presumably the loop
// strips the implicit operands of the COPY before it is retagged -- confirm.
1352 while (ImpOpI != ImpOpE) {
1353 MachineInstr::mop_iterator Tmp = ImpOpI;
1354 ImpOpI++;
1356 }
1357 UseMI->setDesc(MovDesc);
1358
1359 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
// Rebuild the operand list as: dst, src0_modifiers, src0, op_sel. UseOpIdx and
// UseOp are refreshed because the source operand moved.
1360 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1361 MachineOperand NewSrcOp(SrcOp);
1362 MachineFunction *MF = UseMI->getMF();
1363 UseMI->removeOperand(1);
1364 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1365 UseMI->addOperand(NewSrcOp); // src0
1366 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1367 UseOpIdx = SrcIdx;
1368 UseOp = &UseMI->getOperand(UseOpIdx);
1369 }
1370 CopiesToReplace.push_back(UseMI);
1371 break;
1372 }
1373
1374 // We failed to replace the copy, so give up.
1375 if (UseMI->getOpcode() == AMDGPU::COPY)
1376 return;
1377
1378 } else {
1379 if (UseMI->isCopy() && OpToFold.isReg() &&
1380 UseMI->getOperand(0).getReg().isVirtual() &&
1381 !UseMI->getOperand(1).getSubReg() &&
1382 OpToFold.DefMI->implicit_operands().empty()) {
1383 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
1384 << *UseMI);
1385 unsigned Size = TII->getOpSize(*UseMI, 1);
1386 Register UseReg = OpToFold.getReg();
// NOTE(review): one line is missing from this listing here.
1388 unsigned SubRegIdx = OpToFold.getSubReg();
1389 // Hack to allow 32-bit SGPRs to be folded into True16 instructions
1390 // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1391 // VS_16RegClass
1392 //
1393 // Excerpt from AMDGPUGenRegisterInfoEnums.inc
1394 // NoSubRegister, //0
1395 // hi16, // 1
1396 // lo16, // 2
1397 // sub0, // 3
1398 // ...
1399 // sub1, // 11
1400 // sub1_hi16, // 12
1401 // sub1_lo16, // 13
1402 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1403 if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1404 TRI->isSGPRReg(*MRI, UseReg)) {
1405 // Produce the 32 bit subregister index to which the 16-bit subregister
1406 // is aligned.
1407 if (SubRegIdx > AMDGPU::sub1) {
1408 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1409 M |= M.getLane(M.getHighestLane() - 1);
1410 SmallVector<unsigned, 4> Indexes;
1411 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1412 Indexes);
1413 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1414 SubRegIdx = Indexes[0];
1415 // 32-bit registers do not have a sub0 index
1416 } else if (TII->getOpSize(*UseMI, 1) == 4)
1417 SubRegIdx = 0;
1418 else
1419 SubRegIdx = AMDGPU::sub0;
1420 }
1421 UseMI->getOperand(1).setSubReg(SubRegIdx);
1422 UseMI->getOperand(1).setIsKill(false);
1423 CopiesToReplace.push_back(UseMI);
1424 OpToFold.OpToFold->setIsKill(false);
1425
1426 // Remove kill flags as kills may now be out of order with uses.
1427 MRI->clearKillFlags(UseReg);
1428 if (foldCopyToAGPRRegSequence(UseMI))
1429 return;
1430 }
1431
1432 unsigned UseOpc = UseMI->getOpcode();
1433 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1434 (UseOpc == AMDGPU::V_READLANE_B32 &&
1435 (int)UseOpIdx ==
1436 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1437 // %vgpr = V_MOV_B32 imm
1438 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1439 // =>
1440 // %sgpr = S_MOV_B32 imm
1441 if (FoldingImmLike) {
// NOTE(review): the head of this condition is missing from this listing; the
// visible arguments are the used register, the defining instruction and the
// use.
1443 UseMI->getOperand(UseOpIdx).getReg(),
1444 *OpToFold.DefMI, *UseMI))
1445 return;
1446
1447 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1449
1450 if (OpToFold.isImm()) {
// NOTE(review): the call that consumes this argument is missing from this
// listing.
1452 *OpToFold.getEffectiveImmVal());
1453 } else if (OpToFold.isFI())
1454 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
1455 else {
1456 assert(OpToFold.isGlobal());
1457 UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
1458 OpToFold.OpToFold->getOffset(),
1459 OpToFold.OpToFold->getTargetFlags());
1460 }
1461 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1462 return;
1463 }
1464
1465 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
// NOTE(review): the head of this condition is missing from this listing.
1467 UseMI->getOperand(UseOpIdx).getReg(),
1468 *OpToFold.DefMI, *UseMI))
1469 return;
1470
1471 // %vgpr = COPY %sgpr0
1472 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1473 // =>
1474 // %sgpr1 = COPY %sgpr0
1475 UseMI->setDesc(TII->get(AMDGPU::COPY));
1476 UseMI->getOperand(1).setReg(OpToFold.getReg());
1477 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1478 UseMI->getOperand(1).setIsKill(false);
1479 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1481 return;
1482 }
1483 }
1484
1485 const MCInstrDesc &UseDesc = UseMI->getDesc();
1486
1487 // Don't fold into target independent nodes. Target independent opcodes
1488 // don't have defined register classes.
1489 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1490 UseDesc.operands()[UseOpIdx].RegClass == -1)
1491 return;
1492 }
1493
1494 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1495 // to enable more folding opportunities. The shrink operands pass
1496 // already does this.
1497
// Everything else goes through the generic candidate check.
1498 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
1499}
1500
1501static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1503 switch (Opcode) {
1504 case AMDGPU::S_ADD_I32:
1505 case AMDGPU::S_ADD_U32:
1506 Result = LHS + RHS;
1507 return true;
1508 case AMDGPU::S_SUB_I32:
1509 case AMDGPU::S_SUB_U32:
1510 Result = LHS - RHS;
1511 return true;
1512 case AMDGPU::V_AND_B32_e64:
1513 case AMDGPU::V_AND_B32_e32:
1514 case AMDGPU::S_AND_B32:
1515 Result = LHS & RHS;
1516 return true;
1517 case AMDGPU::V_OR_B32_e64:
1518 case AMDGPU::V_OR_B32_e32:
1519 case AMDGPU::S_OR_B32:
1520 Result = LHS | RHS;
1521 return true;
1522 case AMDGPU::V_XOR_B32_e64:
1523 case AMDGPU::V_XOR_B32_e32:
1524 case AMDGPU::S_XOR_B32:
1525 Result = LHS ^ RHS;
1526 return true;
1527 case AMDGPU::S_XNOR_B32:
1528 Result = ~(LHS ^ RHS);
1529 return true;
1530 case AMDGPU::S_NAND_B32:
1531 Result = ~(LHS & RHS);
1532 return true;
1533 case AMDGPU::S_NOR_B32:
1534 Result = ~(LHS | RHS);
1535 return true;
1536 case AMDGPU::S_ANDN2_B32:
1537 Result = LHS & ~RHS;
1538 return true;
1539 case AMDGPU::S_ORN2_B32:
1540 Result = LHS | ~RHS;
1541 return true;
1542 case AMDGPU::V_LSHL_B32_e64:
1543 case AMDGPU::V_LSHL_B32_e32:
1544 case AMDGPU::S_LSHL_B32:
1545 // The instruction ignores the high bits for out of bounds shifts.
1546 Result = LHS << (RHS & 31);
1547 return true;
1548 case AMDGPU::V_LSHLREV_B32_e64:
1549 case AMDGPU::V_LSHLREV_B32_e32:
1550 Result = RHS << (LHS & 31);
1551 return true;
1552 case AMDGPU::V_LSHR_B32_e64:
1553 case AMDGPU::V_LSHR_B32_e32:
1554 case AMDGPU::S_LSHR_B32:
1555 Result = LHS >> (RHS & 31);
1556 return true;
1557 case AMDGPU::V_LSHRREV_B32_e64:
1558 case AMDGPU::V_LSHRREV_B32_e32:
1559 Result = RHS >> (LHS & 31);
1560 return true;
1561 case AMDGPU::V_ASHR_I32_e64:
1562 case AMDGPU::V_ASHR_I32_e32:
1563 case AMDGPU::S_ASHR_I32:
1564 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1565 return true;
1566 case AMDGPU::V_ASHRREV_I32_e64:
1567 case AMDGPU::V_ASHRREV_I32_e32:
1568 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1569 return true;
1570 default:
1571 return false;
1572 }
1573}
1574
1575static unsigned getMovOpc(bool IsScalar) {
1576 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1577}
1578
1579// Try to simplify operations with a constant that may appear after instruction
1580// selection.
1581// TODO: See if a frame index with a fixed offset can fold.
1582bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1583 if (!MI->allImplicitDefsAreDead())
1584 return false;
1585
1586 unsigned Opc = MI->getOpcode();
1587
1588 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1589 if (Src0Idx == -1)
1590 return false;
1591
1592 MachineOperand *Src0 = &MI->getOperand(Src0Idx);
1593 std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
1594
1595 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1596 Opc == AMDGPU::S_NOT_B32) &&
1597 Src0Imm) {
1598 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1599 TII->mutateAndCleanupImplicit(
1600 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1601 return true;
1602 }
1603
1604 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1605 if (Src1Idx == -1)
1606 return false;
1607
1608 MachineOperand *Src1 = &MI->getOperand(Src1Idx);
1609 std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
1610
1611 if (!Src0Imm && !Src1Imm)
1612 return false;
1613
1614 // and k0, k1 -> v_mov_b32 (k0 & k1)
1615 // or k0, k1 -> v_mov_b32 (k0 | k1)
1616 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1617 if (Src0Imm && Src1Imm) {
1618 int32_t NewImm;
1619 if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
1620 return false;
1621
1622 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1623
1624 // Be careful to change the right operand, src0 may belong to a different
1625 // instruction.
1626 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1627 MI->removeOperand(Src1Idx);
1628 TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR)));
1629 return true;
1630 }
1631
1632 // S_SUB_* is not commutable, so handle it before the commutability gate.
1633 // Only `x - 0 -> copy x` is valid; `0 - x` is a negation, not a copy.
1634 if (Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U32) {
1635 if (Src1Imm && static_cast<int32_t>(*Src1Imm) == 0) {
1636 // y = sub x, 0 => y = copy x
1637 MI->removeOperand(Src1Idx);
1638 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1639 return true;
1640 }
1641 return false;
1642 }
1643
1644 if (!MI->isCommutable())
1645 return false;
1646
1647 if (Src0Imm && !Src1Imm) {
1648 std::swap(Src0, Src1);
1649 std::swap(Src0Idx, Src1Idx);
1650 std::swap(Src0Imm, Src1Imm);
1651 }
1652
1653 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
1654 if (Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_ADD_U32) {
1655 if (Src1Val == 0) {
1656 // y = add x, 0 => y = copy x
1657 MI->removeOperand(Src1Idx);
1658 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1659 return true;
1660 }
1661 return false;
1662 }
1663
1664 if (Opc == AMDGPU::V_OR_B32_e64 ||
1665 Opc == AMDGPU::V_OR_B32_e32 ||
1666 Opc == AMDGPU::S_OR_B32) {
1667 if (Src1Val == 0) {
1668 // y = or x, 0 => y = copy x
1669 MI->removeOperand(Src1Idx);
1670 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1671 } else if (Src1Val == -1) {
1672 // y = or x, -1 => y = v_mov_b32 -1
1673 MI->removeOperand(Src0Idx);
1674 TII->mutateAndCleanupImplicit(
1675 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1676 } else
1677 return false;
1678
1679 return true;
1680 }
1681
1682 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1683 Opc == AMDGPU::S_AND_B32) {
1684 if (Src1Val == 0) {
1685 // y = and x, 0 => y = v_mov_b32 0
1686 MI->removeOperand(Src0Idx);
1687 TII->mutateAndCleanupImplicit(
1688 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1689 } else if (Src1Val == -1) {
1690 // y = and x, -1 => y = copy x
1691 MI->removeOperand(Src1Idx);
1692 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1693 } else
1694 return false;
1695
1696 return true;
1697 }
1698
1699 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1700 Opc == AMDGPU::S_XOR_B32) {
1701 if (Src1Val == 0) {
1702 // y = xor x, 0 => y = copy x
1703 MI->removeOperand(Src1Idx);
1704 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1705 return true;
1706 }
1707 }
1708
1709 return false;
1710}
1711
1712// Try to fold an instruction into a simpler one
1713bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1714 unsigned Opc = MI.getOpcode();
1715 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1716 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1717 return false;
1718
1719 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1720 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1721 if (!Src1->isIdenticalTo(*Src0)) {
1722 std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
1723 if (!Src1Imm)
1724 return false;
1725
1726 std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
1727 if (!Src0Imm || *Src0Imm != *Src1Imm)
1728 return false;
1729 }
1730
1731 int Src1ModIdx =
1732 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1733 int Src0ModIdx =
1734 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1735 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1736 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1737 return false;
1738
1739 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1740 auto &NewDesc =
1741 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1742 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1743 if (Src2Idx != -1)
1744 MI.removeOperand(Src2Idx);
1745 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1746 if (Src1ModIdx != -1)
1747 MI.removeOperand(Src1ModIdx);
1748 if (Src0ModIdx != -1)
1749 MI.removeOperand(Src0ModIdx);
1750 TII->mutateAndCleanupImplicit(MI, NewDesc);
1751 LLVM_DEBUG(dbgs() << MI);
1752 return true;
1753}
1754
1755bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1756 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1757 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1758 return false;
1759
1760 std::optional<int64_t> Src0Imm =
1761 TII->getImmOrMaterializedImm(MI.getOperand(1));
1762 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1763 return false;
1764
1765 Register Src1 = MI.getOperand(2).getReg();
1766 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1767 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1768 return false;
1769
1770 Register Dst = MI.getOperand(0).getReg();
1771 MRI->replaceRegWith(Dst, Src1);
1772 if (!MI.getOperand(2).isKill())
1773 MRI->clearKillFlags(Src1);
1774 MI.eraseFromParent();
1775 return true;
1776}
1777
1778bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1779 const FoldableDef &OpToFold) const {
1780 // We need mutate the operands of new mov instructions to add implicit
1781 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1782 // this.
1783 SmallVector<MachineInstr *, 4> CopiesToReplace;
1785 MachineOperand &Dst = MI.getOperand(0);
1786 bool Changed = false;
1787
1788 if (OpToFold.isImm()) {
1789 for (auto &UseMI :
1790 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1791 // Folding the immediate may reveal operations that can be constant
1792 // folded or replaced with a copy. This can happen for example after
1793 // frame indices are lowered to constants or from splitting 64-bit
1794 // constants.
1795 //
1796 // We may also encounter cases where one or both operands are
1797 // immediates materialized into a register, which would ordinarily not
1798 // be folded due to multiple uses or operand constraints.
1799 if (tryConstantFoldOp(&UseMI)) {
1800 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1801 Changed = true;
1802 }
1803 }
1804 }
1805
1807 llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
1808 for (auto *U : UsesToProcess) {
1809 MachineInstr *UseMI = U->getParent();
1810
1811 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1812 foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1813 CopiesToReplace);
1814 }
1815
1816 if (CopiesToReplace.empty() && FoldList.empty())
1817 return Changed;
1818
1819 MachineFunction *MF = MI.getMF();
1820 // Make sure we add EXEC uses to any new v_mov instructions created.
1821 for (MachineInstr *Copy : CopiesToReplace)
1822 Copy->addImplicitDefUseOperands(*MF);
1823
1824 SetVector<MachineInstr *> ConstantFoldCandidates;
1825 for (FoldCandidate &Fold : FoldList) {
1826 assert(!Fold.isReg() || Fold.Def.OpToFold);
1827 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1828 Register Reg = Fold.getReg();
1829 const MachineInstr *DefMI = Fold.Def.DefMI;
1830 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1831 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1832 continue;
1833 }
1834 if (updateOperand(Fold)) {
1835 // Clear kill flags.
1836 if (Fold.isReg()) {
1837 assert(Fold.Def.OpToFold && Fold.isReg());
1838 // FIXME: Probably shouldn't bother trying to fold if not an
1839 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1840 // copies.
1841 MRI->clearKillFlags(Fold.getReg());
1842 }
1843 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1844 << static_cast<int>(Fold.UseOpNo) << " of "
1845 << *Fold.UseMI);
1846
1847 if (Fold.isImm())
1848 ConstantFoldCandidates.insert(Fold.UseMI);
1849
1850 } else if (Fold.Commuted) {
1851 // Restoring instruction's original operand order if fold has failed.
1852 TII->commuteInstruction(*Fold.UseMI, false);
1853 }
1854 }
1855
1856 for (MachineInstr *MI : ConstantFoldCandidates) {
1857 if (tryConstantFoldOp(MI)) {
1858 LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
1859 Changed = true;
1860 }
1861 }
1862 return true;
1863}
1864
1865/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
1866/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
1867bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1868 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
1869 // only accept VGPR or inline immediate. Recreate a reg_sequence with its
1870 // initializers right here, so we will rematerialize immediates and avoid
1871 // copies via different reg classes.
1872 const TargetRegisterClass *DefRC =
1873 MRI->getRegClass(CopyMI->getOperand(0).getReg());
1874 if (!TRI->isAGPRClass(DefRC))
1875 return false;
1876
1877 Register UseReg = CopyMI->getOperand(1).getReg();
1878 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1879 if (!RegSeq || !RegSeq->isRegSequence())
1880 return false;
1881
1882 const DebugLoc &DL = CopyMI->getDebugLoc();
1883 MachineBasicBlock &MBB = *CopyMI->getParent();
1884
1885 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1886 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1887
1888 const TargetRegisterClass *UseRC =
1889 MRI->getRegClass(CopyMI->getOperand(1).getReg());
1890
1891 // Value, subregindex for new REG_SEQUENCE
1893
1894 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1895 unsigned NumFoldable = 0;
1896
1897 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1898 MachineOperand &RegOp = RegSeq->getOperand(I);
1899 unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
1900
1901 if (RegOp.getSubReg()) {
1902 // TODO: Handle subregister compose
1903 NewDefs.emplace_back(&RegOp, SubRegIdx);
1904 continue;
1905 }
1906
1907 MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
1908 if (!Lookup)
1909 Lookup = &RegOp;
1910
1911 if (Lookup->isImm()) {
1912 // Check if this is an agpr_32 subregister.
1913 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1914 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1915 if (DestSuperRC &&
1916 TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1917 ++NumFoldable;
1918 NewDefs.emplace_back(Lookup, SubRegIdx);
1919 continue;
1920 }
1921 }
1922
1923 const TargetRegisterClass *InputRC =
1924 Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
1925 : MRI->getRegClass(RegOp.getReg());
1926
1927 // TODO: Account for Lookup->getSubReg()
1928
1929 // If we can't find a matching super class, this is an SGPR->AGPR or
1930 // VGPR->AGPR subreg copy (or something constant-like we have to materialize
1931 // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
1932 // want to rewrite to copy to an intermediate VGPR class.
1933 const TargetRegisterClass *MatchRC =
1934 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1935 if (!MatchRC) {
1936 ++NumFoldable;
1937 NewDefs.emplace_back(&RegOp, SubRegIdx);
1938 continue;
1939 }
1940
1941 NewDefs.emplace_back(&RegOp, SubRegIdx);
1942 }
1943
1944 // Do not clone a reg_sequence and merely change the result register class.
1945 if (NumFoldable == 0)
1946 return false;
1947
1948 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1949 for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
1950 CopyMI->removeOperand(I);
1951
1952 for (auto [Def, DestSubIdx] : NewDefs) {
1953 if (!Def->isReg()) {
1954 // TODO: Should we use single write for each repeated value like in
1955 // register case?
1956 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1957 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1958 .add(*Def);
1959 B.addReg(Tmp);
1960 } else {
1961 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1962 Def->setIsKill(false);
1963
1964 Register &VGPRCopy = VGPRCopies[Src];
1965 if (!VGPRCopy) {
1966 const TargetRegisterClass *VGPRUseSubRC =
1967 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1968
1969 // We cannot build a reg_sequence out of the same registers, they
1970 // must be copied. Better do it here before copyPhysReg() created
1971 // several reads to do the AGPR->VGPR->AGPR copy.
1972
1973 // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
1974 // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
1975 // later, create a copy here and track if we already have such a copy.
1976 const TargetRegisterClass *SubRC =
1977 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1978 if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
1979 // TODO: Try to reconstrain class
1980 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1981 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
1982 B.addReg(VGPRCopy);
1983 } else {
1984 // If it is already a VGPR, do not copy the register.
1985 B.add(*Def);
1986 }
1987 } else {
1988 B.addReg(VGPRCopy);
1989 }
1990 }
1991
1992 B.addImm(DestSubIdx);
1993 }
1994
1995 LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
1996 return true;
1997}
1998
1999bool SIFoldOperandsImpl::tryFoldFoldableCopy(
2000 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
2001 Register DstReg = MI.getOperand(0).getReg();
2002 // Specially track simple redefs of m0 to the same value in a block, so we
2003 // can erase the later ones.
2004 if (DstReg == AMDGPU::M0) {
2005 MachineOperand &NewM0Val = MI.getOperand(1);
2006 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
2007 MI.eraseFromParent();
2008 return true;
2009 }
2010
2011 // We aren't tracking other physical registers
2012 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
2013 ? nullptr
2014 : &NewM0Val;
2015 return false;
2016 }
2017
2018 MachineOperand *OpToFoldPtr;
2019 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
2020 // Folding when any src_modifiers are non-zero is unsupported
2021 if (TII->hasAnyModifiersSet(MI))
2022 return false;
2023 OpToFoldPtr = &MI.getOperand(2);
2024 } else
2025 OpToFoldPtr = &MI.getOperand(1);
2026 MachineOperand &OpToFold = *OpToFoldPtr;
2027 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
2028
2029 // FIXME: We could also be folding things like TargetIndexes.
2030 if (!FoldingImm && !OpToFold.isReg())
2031 return false;
2032
2033 // Fold virtual registers and constant physical registers.
2034 if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
2035 !TRI->isConstantPhysReg(OpToFold.getReg()))
2036 return false;
2037
2038 // Prevent folding operands backwards in the function. For example,
2039 // the COPY opcode must not be replaced by 1 in this example:
2040 //
2041 // %3 = COPY %vgpr0; VGPR_32:%3
2042 // ...
2043 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
2044 if (!DstReg.isVirtual())
2045 return false;
2046
2047 const TargetRegisterClass *DstRC =
2048 MRI->getRegClass(MI.getOperand(0).getReg());
2049
2050 // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
2051 // Can remove this code if proper 16-bit SGPRs are implemented
2052 // Example: Pre-peephole-opt
2053 // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
2054 // %32:sreg_32 = COPY %29:sgpr_lo16
2055 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2056 // Post-peephole-opt and DCE
2057 // %32:sreg_32 = COPY %16.lo16:sreg_32
2058 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2059 // After this transform
2060 // %32:sreg_32 = COPY %16:sreg_32
2061 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2062 // After the fold operands pass
2063 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
2064 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2065 OpToFold.getSubReg()) {
2066 if (DstRC == &AMDGPU::SReg_32RegClass &&
2067 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2068 assert(OpToFold.getSubReg() == AMDGPU::lo16);
2069 OpToFold.setSubReg(0);
2070 }
2071 }
2072
2073 // Fold copy to AGPR through reg_sequence
2074 // TODO: Handle with subregister extract
2075 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2076 if (foldCopyToAGPRRegSequence(&MI))
2077 return true;
2078 }
2079
2080 FoldableDef Def(OpToFold, DstRC);
2081 bool Changed = foldInstOperand(MI, Def);
2082
2083 // If we managed to fold all uses of this copy then we might as well
2084 // delete it now.
2085 // The only reason we need to follow chains of copies here is that
2086 // tryFoldRegSequence looks forward through copies before folding a
2087 // REG_SEQUENCE into its eventual users.
2088 auto *InstToErase = &MI;
2089 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2090 auto &SrcOp = InstToErase->getOperand(1);
2091 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
2092 InstToErase->eraseFromParent();
2093 Changed = true;
2094 InstToErase = nullptr;
2095 if (!SrcReg || SrcReg.isPhysical())
2096 break;
2097 InstToErase = MRI->getVRegDef(SrcReg);
2098 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2099 break;
2100 }
2101
2102 if (InstToErase && InstToErase->isRegSequence() &&
2103 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2104 InstToErase->eraseFromParent();
2105 Changed = true;
2106 }
2107
2108 if (Changed)
2109 return true;
2110
2111 // Run this after foldInstOperand to avoid turning scalar additions into
2112 // vector additions when the result scalar result could just be folded into
2113 // the user(s).
2114 return OpToFold.isReg() &&
2115 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2116}
2117
2118// Clamp patterns are canonically selected to v_max_* instructions, so only
2119// handle them.
2120const MachineOperand *
2121SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2122 unsigned Op = MI.getOpcode();
2123 switch (Op) {
2124 case AMDGPU::V_MAX_F32_e64:
2125 case AMDGPU::V_MAX_F16_e64:
2126 case AMDGPU::V_MAX_F16_t16_e64:
2127 case AMDGPU::V_MAX_F16_fake16_e64:
2128 case AMDGPU::V_MAX_F64_e64:
2129 case AMDGPU::V_MAX_NUM_F64_e64:
2130 case AMDGPU::V_PK_MAX_F16:
2131 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2132 case AMDGPU::V_PK_MAX_NUM_BF16: {
2133 if (MI.mayRaiseFPException())
2134 return nullptr;
2135
2136 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2137 return nullptr;
2138
2139 // Make sure sources are identical.
2140 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2141 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2142 if (!Src0->isReg() || !Src1->isReg() ||
2143 Src0->getReg() != Src1->getReg() ||
2144 Src0->getSubReg() != Src1->getSubReg() ||
2145 Src0->getSubReg() != AMDGPU::NoSubRegister)
2146 return nullptr;
2147
2148 // Can't fold up if we have modifiers.
2149 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2150 return nullptr;
2151
2152 unsigned Src0Mods
2153 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2154 unsigned Src1Mods
2155 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2156
2157 // Having a 0 op_sel_hi would require swizzling the output in the source
2158 // instruction, which we can't do.
2159 unsigned UnsetMods =
2160 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2162 : 0u;
2163 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2164 return nullptr;
2165 return Src0;
2166 }
2167 default:
2168 return nullptr;
2169 }
2170}
2171
2172// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
2173bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2174 const MachineOperand *ClampSrc = isClamp(MI);
2175 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2176 return false;
2177
2178 if (!ClampSrc->getReg().isVirtual())
2179 return false;
2180
2181 // Look through COPY. COPY only observed with True16.
2182 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2183 MachineInstr *Def =
2184 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2185
2186 // The type of clamp must be compatible.
2187 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
2188 return false;
2189
2190 if (Def->mayRaiseFPException())
2191 return false;
2192
2193 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2194 if (!DefClamp)
2195 return false;
2196
2197 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2198
2199 // Clamp is applied after omod, so it is OK if omod is set.
2200 DefClamp->setImm(1);
2201
2202 Register DefReg = Def->getOperand(0).getReg();
2203 Register MIDstReg = MI.getOperand(0).getReg();
2204 if (TRI->isSGPRReg(*MRI, DefReg)) {
2205 // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
2206 // instruction with a VGPR dst.
2207 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2208 MIDstReg)
2209 .addReg(DefReg);
2210 } else {
2211 MRI->replaceRegWith(MIDstReg, DefReg);
2212 }
2213 MI.eraseFromParent();
2214
2215 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2216 // instruction, so we might as well convert it to the more flexible VOP3-only
2217 // mad/fma form.
2218 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2219 Def->eraseFromParent();
2220
2221 return true;
2222}
2223
2224static int getOModValue(unsigned Opc, int64_t Val) {
2225 switch (Opc) {
2226 case AMDGPU::V_MUL_F64_e64:
2227 case AMDGPU::V_MUL_F64_pseudo_e64: {
2228 switch (Val) {
2229 case 0x3fe0000000000000: // 0.5
2230 return SIOutMods::DIV2;
2231 case 0x4000000000000000: // 2.0
2232 return SIOutMods::MUL2;
2233 case 0x4010000000000000: // 4.0
2234 return SIOutMods::MUL4;
2235 default:
2236 return SIOutMods::NONE;
2237 }
2238 }
2239 case AMDGPU::V_MUL_F32_e64: {
2240 switch (static_cast<uint32_t>(Val)) {
2241 case 0x3f000000: // 0.5
2242 return SIOutMods::DIV2;
2243 case 0x40000000: // 2.0
2244 return SIOutMods::MUL2;
2245 case 0x40800000: // 4.0
2246 return SIOutMods::MUL4;
2247 default:
2248 return SIOutMods::NONE;
2249 }
2250 }
2251 case AMDGPU::V_MUL_F16_e64:
2252 case AMDGPU::V_MUL_F16_t16_e64:
2253 case AMDGPU::V_MUL_F16_fake16_e64: {
2254 switch (static_cast<uint16_t>(Val)) {
2255 case 0x3800: // 0.5
2256 return SIOutMods::DIV2;
2257 case 0x4000: // 2.0
2258 return SIOutMods::MUL2;
2259 case 0x4400: // 4.0
2260 return SIOutMods::MUL4;
2261 default:
2262 return SIOutMods::NONE;
2263 }
2264 }
2265 default:
2266 llvm_unreachable("invalid mul opcode");
2267 }
2268}
2269
2270// FIXME: Does this really not support denormals with f16?
2271// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
2272// handled, so will anything other than that break?
2273std::pair<const MachineOperand *, int>
2274SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2275 unsigned Op = MI.getOpcode();
2276 switch (Op) {
2277 case AMDGPU::V_MUL_F64_e64:
2278 case AMDGPU::V_MUL_F64_pseudo_e64:
2279 case AMDGPU::V_MUL_F32_e64:
2280 case AMDGPU::V_MUL_F16_t16_e64:
2281 case AMDGPU::V_MUL_F16_fake16_e64:
2282 case AMDGPU::V_MUL_F16_e64: {
2283 // If output denormals are enabled, omod is ignored.
2284 if ((Op == AMDGPU::V_MUL_F32_e64 &&
2286 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2287 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2288 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2291 MI.mayRaiseFPException())
2292 return std::pair(nullptr, SIOutMods::NONE);
2293
2294 const MachineOperand *RegOp = nullptr;
2295 const MachineOperand *ImmOp = nullptr;
2296 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2297 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2298 if (Src0->isImm()) {
2299 ImmOp = Src0;
2300 RegOp = Src1;
2301 } else if (Src1->isImm()) {
2302 ImmOp = Src1;
2303 RegOp = Src0;
2304 } else
2305 return std::pair(nullptr, SIOutMods::NONE);
2306
2307 int OMod = getOModValue(Op, ImmOp->getImm());
2308 if (OMod == SIOutMods::NONE ||
2309 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2310 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2311 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2312 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2313 return std::pair(nullptr, SIOutMods::NONE);
2314
2315 return std::pair(RegOp, OMod);
2316 }
2317 case AMDGPU::V_ADD_F64_e64:
2318 case AMDGPU::V_ADD_F64_pseudo_e64:
2319 case AMDGPU::V_ADD_F32_e64:
2320 case AMDGPU::V_ADD_F16_e64:
2321 case AMDGPU::V_ADD_F16_t16_e64:
2322 case AMDGPU::V_ADD_F16_fake16_e64: {
2323 // If output denormals are enabled, omod is ignored.
2324 if ((Op == AMDGPU::V_ADD_F32_e64 &&
2326 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2327 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2328 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2330 return std::pair(nullptr, SIOutMods::NONE);
2331
2332 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
2333 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2334 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2335
2336 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
2337 Src0->getSubReg() == Src1->getSubReg() &&
2338 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2339 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2340 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2341 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2342 return std::pair(Src0, SIOutMods::MUL2);
2343
2344 return std::pair(nullptr, SIOutMods::NONE);
2345 }
2346 default:
2347 return std::pair(nullptr, SIOutMods::NONE);
2348 }
2349}
2350
2351// FIXME: Does this need to check IEEE bit on function?
2352bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2353 const MachineOperand *RegOp;
2354 int OMod;
2355 std::tie(RegOp, OMod) = isOMod(MI);
2356 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2357 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2358 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2359 return false;
2360
2361 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2362 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2363 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2364 return false;
2365
2366 if (Def->mayRaiseFPException())
2367 return false;
2368
2369 // Clamp is applied after omod. If the source already has clamp set, don't
2370 // fold it.
2371 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2372 return false;
2373
2374 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2375
2376 DefOMod->setImm(OMod);
2377 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2378 // Kill flags can be wrong if we replaced a def inside a loop with a def
2379 // outside the loop.
2380 MRI->clearKillFlags(Def->getOperand(0).getReg());
2381 MI.eraseFromParent();
2382
2383 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2384 // instruction, so we might as well convert it to the more flexible VOP3-only
2385 // mad/fma form.
2386 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2387 Def->eraseFromParent();
2388
2389 return true;
2390}
2391
2392// Try to fold a reg_sequence with vgpr output and agpr inputs into an
2393// instruction which can take an agpr. So far that means a store.
2394bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2395 assert(MI.isRegSequence());
2396 auto Reg = MI.getOperand(0).getReg();
2397
2398 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
2399 !MRI->hasOneNonDBGUse(Reg))
2400 return false;
2401
2403 if (!getRegSeqInit(Defs, Reg))
2404 return false;
2405
2406 for (auto &[Op, SubIdx] : Defs) {
2407 if (!Op->isReg())
2408 return false;
2409 if (TRI->isAGPR(*MRI, Op->getReg()))
2410 continue;
2411 // Maybe this is a COPY from AREG
2412 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2413 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
2414 return false;
2415 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
2416 return false;
2417 }
2418
2419 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2420 MachineInstr *UseMI = Op->getParent();
2421 while (UseMI->isCopy() && !Op->getSubReg()) {
2422 Reg = UseMI->getOperand(0).getReg();
2423 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2424 return false;
2425 Op = &*MRI->use_nodbg_begin(Reg);
2426 UseMI = Op->getParent();
2427 }
2428
2429 if (Op->getSubReg())
2430 return false;
2431
2432 unsigned OpIdx = Op - &UseMI->getOperand(0);
2433 const MCInstrDesc &InstDesc = UseMI->getDesc();
2434 const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx);
2435 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2436 return false;
2437
2438 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2439 auto Dst = MRI->createVirtualRegister(NewDstRC);
2440 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2441 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2442
2443 for (auto &[Def, SubIdx] : Defs) {
2444 Def->setIsKill(false);
2445 if (TRI->isAGPR(*MRI, Def->getReg())) {
2446 RS.add(*Def);
2447 } else { // This is a copy
2448 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2449 SubDef->getOperand(1).setIsKill(false);
2450 RS.addReg(SubDef->getOperand(1).getReg(), {}, Def->getSubReg());
2451 }
2452 RS.addImm(SubIdx);
2453 }
2454
2455 Op->setReg(Dst);
2456 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
2457 Op->setReg(Reg);
2458 RS->eraseFromParent();
2459 return false;
2460 }
2461
2462 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
2463
2464 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
2465 // in which case we can erase them all later in runOnMachineFunction.
2466 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2467 MI.eraseFromParent();
2468 return true;
2469}
2470
2471/// Checks whether \p Copy is a AGPR -> VGPR copy. Returns `true` on success and
2472/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
2473static bool isAGPRCopy(const SIRegisterInfo &TRI,
2474 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2475 Register &OutReg, unsigned &OutSubReg) {
2476 assert(Copy.isCopy());
2477
2478 const MachineOperand &CopySrc = Copy.getOperand(1);
2479 Register CopySrcReg = CopySrc.getReg();
2480 if (!CopySrcReg.isVirtual())
2481 return false;
2482
2483 // Common case: copy from AGPR directly, e.g.
2484 // %1:vgpr_32 = COPY %0:agpr_32
2485 if (TRI.isAGPR(MRI, CopySrcReg)) {
2486 OutReg = CopySrcReg;
2487 OutSubReg = CopySrc.getSubReg();
2488 return true;
2489 }
2490
2491 // Sometimes it can also involve two copies, e.g.
2492 // %1:vgpr_256 = COPY %0:agpr_256
2493 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2494 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2495 if (!CopySrcDef || !CopySrcDef->isCopy())
2496 return false;
2497
2498 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2499 Register OtherCopySrcReg = OtherCopySrc.getReg();
2500 if (!OtherCopySrcReg.isVirtual() ||
2501 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2502 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2503 !TRI.isAGPR(MRI, OtherCopySrcReg))
2504 return false;
2505
2506 OutReg = OtherCopySrcReg;
2507 OutSubReg = CopySrc.getSubReg();
2508 return true;
2509}
2510
2511// Try to hoist an AGPR to VGPR copy across a PHI.
2512// This should allow folding of an AGPR into a consumer which may support it.
2513//
2514// Example 1: LCSSA PHI
2515// loop:
2516// %1:vreg = COPY %0:areg
2517// exit:
2518// %2:vreg = PHI %1:vreg, %loop
2519// =>
2520// loop:
2521// exit:
2522// %1:areg = PHI %0:areg, %loop
2523// %2:vreg = COPY %1:areg
2524//
2525// Example 2: PHI with multiple incoming values:
2526// entry:
2527// %1:vreg = GLOBAL_LOAD(..)
2528// loop:
2529// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2530// %3:areg = COPY %2:vreg
2531// %4:areg = (instr using %3:areg)
2532// %5:vreg = COPY %4:areg
2533// =>
2534// entry:
2535// %1:vreg = GLOBAL_LOAD(..)
2536// %2:areg = COPY %1:vreg
2537// loop:
2538// %3:areg = PHI %2:areg, %entry, %X:areg,
2539// %4:areg = (instr using %3:areg)
2540bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2541 assert(PHI.isPHI());
2542
2543 Register PhiOut = PHI.getOperand(0).getReg();
2544 if (!TRI->isVGPR(*MRI, PhiOut))
2545 return false;
2546
2547 // Iterate once over all incoming values of the PHI to check if this PHI is
2548 // eligible, and determine the exact AGPR RC we'll target.
2549 const TargetRegisterClass *ARC = nullptr;
2550 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2551 MachineOperand &MO = PHI.getOperand(K);
2552 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2553 if (!Copy || !Copy->isCopy())
2554 continue;
2555
2556 Register AGPRSrc;
2557 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2558 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2559 continue;
2560
2561 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2562 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2563 CopyInRC = SubRC;
2564
2565 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2566 return false;
2567 ARC = CopyInRC;
2568 }
2569
2570 if (!ARC)
2571 return false;
2572
2573 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2574
2575 // Rewrite the PHI's incoming values to ARC.
2576 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2577 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2578 MachineOperand &MO = PHI.getOperand(K);
2579 Register Reg = MO.getReg();
2580
2582 MachineBasicBlock *InsertMBB = nullptr;
2583
2584 // Look at the def of Reg, ignoring all copies.
2585 unsigned CopyOpc = AMDGPU::COPY;
2586 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2587
2588 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2589 // the copy was single-use, it will be removed by DCE later.
2590 if (Def->isCopy()) {
2591 Register AGPRSrc;
2592 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2593 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2594 MO.setReg(AGPRSrc);
2595 MO.setSubReg(AGPRSubReg);
2596 continue;
2597 }
2598
2599 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2600 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2601 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2602 // is unlikely to be profitable.
2603 //
2604 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2605 MachineOperand &CopyIn = Def->getOperand(1);
2606 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2607 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2608 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2609 }
2610
2611 InsertMBB = Def->getParent();
2612 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2613 } else {
2614 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2615 InsertPt = InsertMBB->getFirstTerminator();
2616 }
2617
2618 Register NewReg = MRI->createVirtualRegister(ARC);
2619 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2620 TII->get(CopyOpc), NewReg)
2621 .addReg(Reg);
2622 MO.setReg(NewReg);
2623
2624 (void)MI;
2625 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2626 }
2627
2628 // Replace the PHI's result with a new register.
2629 Register NewReg = MRI->createVirtualRegister(ARC);
2630 PHI.getOperand(0).setReg(NewReg);
2631
2632 // COPY that new register back to the original PhiOut register. This COPY will
2633 // usually be folded out later.
2634 MachineBasicBlock *MBB = PHI.getParent();
2635 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2636 TII->get(AMDGPU::COPY), PhiOut)
2637 .addReg(NewReg);
2638
2639 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2640 return true;
2641}
2642
2643// Attempt to convert VGPR load to an AGPR load.
2644bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2645 assert(MI.mayLoad());
2646 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2647 return false;
2648
2649 MachineOperand &Def = MI.getOperand(0);
2650 if (!Def.isDef())
2651 return false;
2652
2653 Register DefReg = Def.getReg();
2654
2655 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2656 return false;
2657
2660 SmallVector<Register, 8> MoveRegs;
2661
2662 if (Users.empty())
2663 return false;
2664
2665 // Check that all uses a copy to an agpr or a reg_sequence producing an agpr.
2666 while (!Users.empty()) {
2667 const MachineInstr *I = Users.pop_back_val();
2668 if (!I->isCopy() && !I->isRegSequence())
2669 return false;
2670 Register DstReg = I->getOperand(0).getReg();
2671 // Physical registers may have more than one instruction definitions
2672 if (DstReg.isPhysical())
2673 return false;
2674 if (TRI->isAGPR(*MRI, DstReg))
2675 continue;
2676 MoveRegs.push_back(DstReg);
2677 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2678 Users.push_back(&U);
2679 }
2680
2681 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2682 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2683 if (!TII->isOperandLegal(MI, 0, &Def)) {
2684 MRI->setRegClass(DefReg, RC);
2685 return false;
2686 }
2687
2688 while (!MoveRegs.empty()) {
2689 Register Reg = MoveRegs.pop_back_val();
2690 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2691 }
2692
2693 LLVM_DEBUG(dbgs() << "Folded " << MI);
2694
2695 return true;
2696}
2697
2698// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2699// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2700// there's cases where it can create a lot more AGPR-AGPR copies, which are
2701// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2702//
2703// This function looks at all AGPR PHIs in a basic block and collects their
2704// operands. Then, it checks for register that are used more than once across
2705// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2706// having to create one VGPR temporary per use, which can get very messy if
2707// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2708// element).
2709//
2710// Example
2711// a:
2712// %in:agpr_256 = COPY %foo:vgpr_256
2713// c:
2714// %x:agpr_32 = ..
2715// b:
2716// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2717// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2718// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2719// =>
2720// a:
2721// %in:agpr_256 = COPY %foo:vgpr_256
2722// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2723// %tmp_agpr:agpr_32 = COPY %tmp
2724// c:
2725// %x:agpr_32 = ..
2726// b:
2727// %0:areg = PHI %tmp_agpr, %a, %x, %c
2728// %1:areg = PHI %tmp_agpr, %a, %y, %c
2729// %2:areg = PHI %tmp_agpr, %a, %z, %c
2730bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2731 // This is only really needed on GFX908 where AGPR-AGPR copies are
2732 // unreasonably difficult.
2733 if (ST->hasGFX90AInsts())
2734 return false;
2735
2736 // Look at all AGPR Phis and collect the register + subregister used.
2737 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2738 RegToMO;
2739
2740 for (auto &MI : MBB) {
2741 if (!MI.isPHI())
2742 break;
2743
2744 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2745 continue;
2746
2747 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2748 MachineOperand &PhiMO = MI.getOperand(K);
2749 if (!PhiMO.getSubReg())
2750 continue;
2751 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2752 }
2753 }
2754
2755 // For all (Reg, SubReg) pair that are used more than once, cache the value in
2756 // a VGPR.
2757 bool Changed = false;
2758 for (const auto &[Entry, MOs] : RegToMO) {
2759 if (MOs.size() == 1)
2760 continue;
2761
2762 const auto [Reg, SubReg] = Entry;
2763 MachineInstr *Def = MRI->getVRegDef(Reg);
2764 MachineBasicBlock *DefMBB = Def->getParent();
2765
2766 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2767 // out.
2768 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2769 Register TempVGPR =
2770 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2771 MachineInstr *VGPRCopy =
2772 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2773 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2774 .addReg(Reg, /* flags */ {}, SubReg);
2775
2776 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2777 Register TempAGPR = MRI->createVirtualRegister(ARC);
2778 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2779 TII->get(AMDGPU::COPY), TempAGPR)
2780 .addReg(TempVGPR);
2781
2782 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2783 for (MachineOperand *MO : MOs) {
2784 MO->setReg(TempAGPR);
2785 MO->setSubReg(AMDGPU::NoSubRegister);
2786 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2787 }
2788
2789 Changed = true;
2790 }
2791
2792 return Changed;
2793}
2794
2795bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2796 this->MF = &MF;
2797 MRI = &MF.getRegInfo();
2798 ST = &MF.getSubtarget<GCNSubtarget>();
2799 TII = ST->getInstrInfo();
2800 TRI = &TII->getRegisterInfo();
2801 MFI = MF.getInfo<SIMachineFunctionInfo>();
2802
2803 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2804 // correctly handle signed zeros.
2805 //
2806 // FIXME: Also need to check strictfp
2807 bool IsIEEEMode = MFI->getMode().IEEE;
2808
2809 bool Changed = false;
2810 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2811 MachineOperand *CurrentKnownM0Val = nullptr;
2812 for (auto &MI : make_early_inc_range(*MBB)) {
2813 Changed |= tryFoldCndMask(MI);
2814
2815 if (tryFoldZeroHighBits(MI)) {
2816 Changed = true;
2817 continue;
2818 }
2819
2820 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2821 Changed = true;
2822 continue;
2823 }
2824
2825 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2826 Changed = true;
2827 continue;
2828 }
2829
2830 if (MI.mayLoad() && tryFoldLoad(MI)) {
2831 Changed = true;
2832 continue;
2833 }
2834
2835 if (TII->isFoldableCopy(MI)) {
2836 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2837 continue;
2838 }
2839
2840 // Saw an unknown clobber of m0, so we no longer know what it is.
2841 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2842 CurrentKnownM0Val = nullptr;
2843
2844 // TODO: Omod might be OK if there is NSZ only on the source
2845 // instruction, and not the omod multiply.
2846 if (IsIEEEMode || !MI.getFlag(MachineInstr::FmNsz) || !tryFoldOMod(MI))
2847 Changed |= tryFoldClamp(MI);
2848 }
2849
2850 Changed |= tryOptimizeAGPRPhis(*MBB);
2851 }
2852
2853 return Changed;
2854}
2855
2858 MFPropsModifier _(*this, MF);
2859
2860 bool Changed = SIFoldOperandsImpl().run(MF);
2861 if (!Changed) {
2862 return PreservedAnalyses::all();
2863 }
2865 PA.preserveSet<CFGAnalyses>();
2866 return PA;
2867}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat)
Updates the operand at Idx in instruction Inst with the result of instruction Mat.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
#define _
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static unsigned macToMad(unsigned Opc)
static bool isAGPRCopy(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, const MachineInstr &Copy, Register &OutReg, unsigned &OutSubReg)
Checks whether Copy is an AGPR -> VGPR copy.
static void appendFoldCandidate(SmallVectorImpl< FoldCandidate > &FoldList, FoldCandidate &&Entry)
static const TargetRegisterClass * getRegOpRC(const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const MachineOperand &MO)
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, uint32_t LHS, uint32_t RHS)
static int getOModValue(unsigned Opc, int64_t Val)
static unsigned getMovOpc(bool IsScalar)
static MachineOperand * lookUpCopyChain(const SIInstrInfo &TII, const MachineRegisterInfo &MRI, Register SrcReg)
static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(const FoldableDef &OpToFold)
static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand(const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo)
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static int Lookup(ArrayRef< TableEntry > Table, unsigned Opcode)
Value * RHS
Value * LHS
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool hasDOTOpSelHazard() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
const HexagonRegisterInfo & getRegisterInfo() const
ArrayRef< MCOperandInfo > operands() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
bool isVariadic() const
Return true if this instruction can have a variable number of operands.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
An RAII based helper class to modify MachineFunctionProperties when running pass.
LLVM_ABI iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg=Register(), bool SkipPseudoOp=true)
Return the first instruction in MBB after I that is not a PHI, label or debug.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
LivenessQueryResult
Possible outcome of a register liveness query to computeRegisterLiveness()
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
void clearFlag(MIFlag Flag)
clearFlag - Clear a MI flag.
bool isRegSequence() const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
LLVM_ABI void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &)
substVirtReg - Substitute the current register with the virtual subregister Reg:SubReg.
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
LLVM_ABI void substPhysReg(MCRegister Reg, const TargetRegisterInfo &)
substPhysReg - Substitute the current register with the physical register Reg, taking any existing Su...
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_GlobalAddress
Address of a global value.
@ MO_FrameIndex
Abstract Stack Frame Index.
@ MO_Register
Register operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
use_nodbg_iterator use_nodbg_begin(Register RegNo) const
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI bool hasOneNonDBGUser(Register RegNo) const
hasOneNonDBGUser - Return true if there is exactly one non-Debug instruction using the specified regis...
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
SIModeRegisterDefaults getMode() const
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
static const unsigned CommuteAnyOperandIndex
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
IteratorT begin() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:211
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:225
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:227
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:213
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:210
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:221
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:214
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:228
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:240
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:224
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:226
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:216
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:241
LLVM_READONLY int32_t getFlatScratchInstSSfromSV(uint32_t Opcode)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
FunctionPass * createSIFoldOperandsLegacyPass()
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
char & SIFoldOperandsLegacyID
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
iterator_range< df_iterator< T > > depth_first(const T &G)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.