//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "SIFoldOperands.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

/// Track a value we may want to fold into downstream users, applying
/// subregister extracts along the way.
struct FoldableDef {
  union {
    MachineOperand *OpToFold = nullptr;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };

  /// Register class of the originally defined value.
  const TargetRegisterClass *DefRC = nullptr;

  /// Track the original defining instruction for the value.
  const MachineInstr *DefMI = nullptr;

  /// Subregister to apply to the value at the use point.
  unsigned DefSubReg = AMDGPU::NoSubRegister;

  /// Kind of value stored in the union.
  MachineOperand::MachineOperandType Kind;

  FoldableDef() = delete;
  FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
              unsigned DefSubReg = AMDGPU::NoSubRegister)
      : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {

    if (FoldOp.isImm()) {
      ImmToFold = FoldOp.getImm();
    } else if (FoldOp.isFI()) {
      FrameIndexToFold = FoldOp.getIndex();
    } else {
      assert(FoldOp.isReg() || FoldOp.isGlobal());
      OpToFold = &FoldOp;
    }

    DefMI = FoldOp.getParent();
  }

  FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
              unsigned DefSubReg = AMDGPU::NoSubRegister)
      : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
        Kind(MachineOperand::MO_Immediate) {}

  /// Copy the current def and apply \p SubReg to the value.
  FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
    FoldableDef Copy(*this);
    Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
    return Copy;
  }

  bool isReg() const { return Kind == MachineOperand::MO_Register; }

  Register getReg() const {
    assert(isReg());
    return OpToFold->getReg();
  }

  unsigned getSubReg() const {
    assert(isReg());
    return OpToFold->getSubReg();
  }

  bool isImm() const { return Kind == MachineOperand::MO_Immediate; }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  int getFI() const {
    assert(isFI());
    return FrameIndexToFold;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  /// Return the effective immediate value defined by this instruction, after
  /// application of any subregister extracts which may exist between the use
  /// and def instruction.
  std::optional<int64_t> getEffectiveImmVal() const {
    assert(isImm());
    return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
  }

  /// Check if it is legal to fold this effective value into \p MI's \p OpNo
  /// operand.
  bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
                      unsigned OpIdx) const {
    switch (Kind) {
    case MachineOperand::MO_Immediate: {
      std::optional<int64_t> ImmToFold = getEffectiveImmVal();
      if (!ImmToFold)
        return false;

      // TODO: Should verify the subregister index is supported by the class
      // TODO: Avoid the temporary MachineOperand
      MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
      return TII.isOperandLegal(MI, OpIdx, &TmpOp);
    }
    case MachineOperand::MO_FrameIndex: {
      if (DefSubReg != AMDGPU::NoSubRegister)
        return false;
      MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
      return TII.isOperandLegal(MI, OpIdx, &TmpOp);
    }
    default:
      // TODO: Try to apply DefSubReg, for global address we can extract
      // low/high.
      if (DefSubReg != AMDGPU::NoSubRegister)
        return false;
      return TII.isOperandLegal(MI, OpIdx, OpToFold);
    }

    llvm_unreachable("covered MachineOperand kind switch");
  }
};

struct FoldCandidate {
  MachineInstr *UseMI;
  FoldableDef Def;
  int ShrinkOpcode;
  unsigned UseOpNo;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
                bool Commuted = false, int ShrinkOp = -1)
      : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Commuted(Commuted) {}

  bool isFI() const { return Def.isFI(); }

  int getFI() const {
    assert(isFI());
    return Def.FrameIndexToFold;
  }

  bool isImm() const { return Def.isImm(); }

  bool isReg() const { return Def.isReg(); }

  Register getReg() const { return Def.getReg(); }

  bool isGlobal() const { return Def.isGlobal(); }

  bool needsShrink() const { return ShrinkOpcode != -1; }
};

class SIFoldOperandsImpl {
public:
  MachineFunction *MF;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                         const FoldableDef &OpToFold) const;

  // TODO: Just use TII::getVALUOp
  unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
    switch (Opc) {
    case AMDGPU::S_ADD_I32: {
      if (ST->hasAddNoCarry())
        return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
      return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
    }
    case AMDGPU::S_OR_B32:
      return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
    case AMDGPU::S_AND_B32:
      return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
    case AMDGPU::S_MUL_I32:
      return AMDGPU::V_MUL_LO_U32_e64;
    default:
      return AMDGPU::INSTRUCTION_LIST_END;
    }
  }

  bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
                                             MachineInstr &MI) const;

  bool updateOperand(FoldCandidate &Fold) const;

  bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
                          int64_t ImmVal) const;

  /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
  bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
                           int64_t ImmVal) const;

  bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                        MachineInstr *MI, unsigned OpNo,
                        const FoldableDef &OpToFold) const;
  bool isUseSafeToFold(const MachineInstr &MI,
                       const MachineOperand &UseMO) const;

  const TargetRegisterClass *getRegSeqInit(
      MachineInstr &RegSeq,
      SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;

  const TargetRegisterClass *
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                Register UseReg) const;

  std::pair<int64_t, const TargetRegisterClass *>
  isRegSeqSplat(MachineInstr &RegSeg) const;

  bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
                          int64_t SplatVal,
                          const TargetRegisterClass *SplatRC) const;

  bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
                      unsigned UseOpIdx,
                      SmallVectorImpl<FoldCandidate> &FoldList) const;
  void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
  bool tryConstantFoldOp(MachineInstr *MI) const;
  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;

  bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
  bool tryFoldFoldableCopy(MachineInstr &MI,
                           MachineOperand *&CurrentKnownM0Val) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldPhiAGPR(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

  bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);

public:
  SIFoldOperandsImpl() = default;

  bool run(MachineFunction &MF);
};

class SIFoldOperandsLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;
    return SIFoldOperandsImpl().run(MF);
  }

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setIsSSA();
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
                false)

char SIFoldOperandsLegacy::ID = 0;

char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;

static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
                                             const TargetRegisterInfo &TRI,
                                             const MachineOperand &MO) {
  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
  if (const TargetRegisterClass *SubRC =
          TRI.getSubRegisterClass(RC, MO.getSubReg()))
    RC = SubRC;
  return RC;
}

// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_t16_e64;
  case AMDGPU::V_FMAC_F16_fake16_e64:
    return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}
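
// Illustrative sketch, not taken from the sources or tests: the MAC/FMAC
// forms tie src2 to the destination, which blocks folding an immediate into
// that operand, so the fold first rewrites to the untied MAD/FMA form
// (schematic MIR; modifier operands abbreviated):
//   %k:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
//   %d:vgpr_32 = V_MAC_F32_e64 0, %a, 0, %b, 0, %k, ...   ; src2 tied to %d
//   -->
//   %d:vgpr_32 = V_MAD_F32_e64 0, %a, 0, %b, 0, 1065353216, ...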

// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                                           const FoldableDef &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32:
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e32:
    // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
    // to insert the wave size shift at every point we use the index.
    // TODO: Fix depending on visit order to fold immediates into the operand
    return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e64:
    return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
  default:
    break;
  }

  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}
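
// Illustrative sketch, not taken from the sources or tests: a frame index
// materialized into a register can instead be used directly as the saddr of
// a scratch access (schematic MIR; operands abbreviated):
//   %fi:sgpr_32 = S_MOV_B32 %stack.0
//   SCRATCH_STORE_DWORD_SADDR %data, %fi, 0, 0, ...
//   -->
//   SCRATCH_STORE_DWORD_SADDR %data, %stack.0, 0, 0, ...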

/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
///
///   => %vgpr = V_ADD_U32 x, frameindex
bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
    Register DstReg, Register SrcReg, MachineInstr &MI) const {
  if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
      MRI->hasOneNonDBGUse(SrcReg)) {
    MachineInstr *Def = MRI->getVRegDef(SrcReg);
    if (!Def || Def->getNumOperands() != 4)
      return false;

    MachineOperand *Src0 = &Def->getOperand(1);
    MachineOperand *Src1 = &Def->getOperand(2);

    // TODO: This is profitable with more operand types, and for more
    // opcodes. But ultimately this is working around poor / nonexistent
    // regbankselect.
    if (!Src0->isFI() && !Src1->isFI())
      return false;

    if (Src0->isFI())
      std::swap(Src0, Src1);

    const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
    unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
    if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
        !Def->getOperand(3).isDead()) // Check if scc is dead
      return false;

    MachineBasicBlock *MBB = Def->getParent();
    const DebugLoc &DL = Def->getDebugLoc();
    if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
      MachineInstrBuilder Add =
          BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);

      if (Add->getDesc().getNumDefs() == 2) {
        Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
        Add.addDef(CarryOutReg, RegState::Dead);
        MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
      }

      Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
      if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
        Add.addImm(0);

      Def->eraseFromParent();
      MI.eraseFromParent();
      return true;
    }

    assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);

    MachineBasicBlock::LivenessQueryResult Liveness =
        MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
    if (Liveness == MachineBasicBlock::LQR_Dead) {
      // TODO: If src1 satisfies operand constraints, use vop3 version.
      BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
          .add(*Src0)
          .add(*Src1)
          .setOperandDead(3) // implicit-def $vcc
          .setMIFlags(Def->getFlags());
      Def->eraseFromParent();
      MI.eraseFromParent();
      return true;
    }
  }

  return false;
}

FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
  return new SIFoldOperandsLegacy();
}

bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
                                            unsigned UseOpNo,
                                            int64_t ImmVal) const {
  const uint64_t TSFlags = MI->getDesc().TSFlags;

  if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
      (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
    return false;

  const MachineOperand &Old = MI->getOperand(UseOpNo);
  int OpNo = MI->getOperandNo(&Old);

  unsigned Opcode = MI->getOpcode();
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
  switch (OpType) {
  default:
    return false;
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_IMM_V2BF16:
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
    // VOP3 packed instructions ignore op_sel source modifiers, we cannot encode
    // two different constants.
    if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
        static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
      return false;
    break;
  }

  return true;
}

bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
                                             int64_t ImmVal) const {
  MachineOperand &Old = MI->getOperand(UseOpNo);
  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

  // If the literal can be inlined as-is, apply it and short-circuit the
  // tests below. The main motivation for this is to avoid unintuitive
  // uses of opsel.
  if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
    Old.ChangeToImmediate(ImmVal);
    return true;
  }

  // Refer to op_sel/op_sel_hi and check if we can change the immediate and
  // op_sel in a way that allows an inline constant.
  AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
  unsigned SrcIdx = ~0;
  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
    ModName = AMDGPU::OpName::src0_modifiers;
    SrcIdx = 0;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
    ModName = AMDGPU::OpName::src1_modifiers;
    SrcIdx = 1;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
    ModName = AMDGPU::OpName::src2_modifiers;
    SrcIdx = 2;
  }
  assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
  int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
  MachineOperand &Mod = MI->getOperand(ModIdx);
  unsigned ModVal = Mod.getImm();

  uint16_t ImmLo =
      static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
  uint16_t ImmHi =
      static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
  uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
  unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);

  // Helper function that attempts to inline the given value with a newly
  // chosen opsel pattern.
  auto tryFoldToInline = [&](uint32_t Imm) -> bool {
    if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
      Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
      Old.ChangeToImmediate(Imm);
      return true;
    }

    // Try to shuffle the halves around and leverage opsel to get an inline
    // constant.
    uint16_t Lo = static_cast<uint16_t>(Imm);
    uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
    if (Lo == Hi) {
      if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
        Mod.setImm(NewModVal);
        Old.ChangeToImmediate(Lo);
        return true;
      }

      if (static_cast<int16_t>(Lo) < 0) {
        int32_t SExt = static_cast<int16_t>(Lo);
        if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
          Mod.setImm(NewModVal);
          Old.ChangeToImmediate(SExt);
          return true;
        }
      }

      // This check is only useful for integer instructions
      if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
        if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
          Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
          return true;
        }
      }
    } else {
      uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
      if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
        Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
        Old.ChangeToImmediate(Swapped);
        return true;
      }
    }

    return false;
  };

  if (tryFoldToInline(Imm))
    return true;

  // Replace integer addition by subtraction and vice versa if it allows
  // folding the immediate to an inline constant.
  //
  // We should only ever get here for SrcIdx == 1 due to canonicalization
  // earlier in the pipeline, but we double-check here to be safe / fully
  // general.
  bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
  bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
  if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
    unsigned ClampIdx =
        AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
    bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;

    if (!Clamp) {
      uint16_t NegLo = -static_cast<uint16_t>(Imm);
      uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
      uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;

      if (tryFoldToInline(NegImm)) {
        unsigned NegOpcode =
            IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
        MI->setDesc(TII->get(NegOpcode));
        return true;
      }
    }
  }

  return false;
}
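
// Illustrative sketch, not taken from the sources or tests: V_PK_ADD_U16 with
// the packed literal 0xFFC0FFC0 (-64 in both halves, outside the inline
// integer range of -16..64) cannot be inlined directly, but with clamp unset
// the add/sub swap above negates each half to 0x00400040 (a splat of 64),
// which is encodable as an inline constant, so the instruction becomes a
// V_PK_SUB_U16 of the inline constant 64.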

bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  std::optional<int64_t> ImmVal;
  if (Fold.isImm())
    ImmVal = Fold.Def.getEffectiveImmVal();

  if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
    if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
      return true;

    // We can't represent the candidate as an inline constant. Try as a literal
    // with the original opsel, checking constant bus limitations.
    MachineOperand New = MachineOperand::CreateImm(*ImmVal);
    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
    Old.ChangeToImmediate(*ImmVal);
    return true;
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
              Dst1.getReg())
          .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

    if (Fold.Commuted)
      TII->commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (ImmVal) {
    if (Old.isTied()) {
      int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
      if (NewMFMAOpc == -1)
        return false;
      MI->setDesc(TII->get(NewMFMAOpc));
      MI->untieRegOperand(0);
    }

    // TODO: Should we try to avoid adding this to the candidate list?
    MachineOperand New = MachineOperand::CreateImm(*ImmVal);
    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;

    Old.ChangeToImmediate(*ImmVal);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
                   Fold.Def.OpToFold->getOffset(),
                   Fold.Def.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.getFI());
    return true;
  }

  MachineOperand *New = Fold.Def.OpToFold;

  // Verify the register is compatible with the operand.
  if (const TargetRegisterClass *OpRC =
          TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
    const TargetRegisterClass *NewRC =
        TRI->getRegClassForReg(*MRI, New->getReg());

    const TargetRegisterClass *ConstrainRC = OpRC;
    if (New->getSubReg()) {
      ConstrainRC =
          TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());

      if (!ConstrainRC)
        return false;
    }

    if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
      LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
                        << TRI->getRegClassName(ConstrainRC) << '\n');
      return false;
    }
  }

  // Rework once the VS_16 register class is updated to include proper
  // 16-bit SGPRs instead of 32-bit ones.
  if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
    Old.setSubReg(AMDGPU::NoSubRegister);
  if (New->getReg().isPhysical()) {
    Old.substPhysReg(New->getReg(), *TRI);
  } else {
    Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
    Old.setIsUndef(New->isUndef());
  }
  return true;
}
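
// Illustrative sketch, not taken from the sources or tests: a literal can only
// be encoded in the VOP2 (e32) form of the carry-out adds, so the fold shrinks
// the instruction and re-materializes the carry from VCC (schematic MIR):
//   %sum:vgpr_32, %carry:sreg_64 = V_ADD_CO_U32_e64 %a:vgpr_32, %b:vgpr_32, 0
//   -- folding the literal 0x12345 in place of %b, with VCC dead here -->
//   %sum:vgpr_32 = V_ADD_CO_U32_e32 0x12345, %a, implicit-def $vcc
//   %carry:sreg_64 = COPY $vcc   ; emitted only if %carry has non-debug uses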

static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                FoldCandidate &&Entry) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
                    << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
  FoldList.push_back(Entry);
}

static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                const FoldableDef &FoldOp,
                                bool Commuted = false, int ShrinkOp = -1) {
  appendFoldCandidate(FoldList,
                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
}

bool SIFoldOperandsImpl::tryAddToFoldList(
    SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
    const FoldableDef &OpToFold) const {
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold.isImm())
      return false;

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    // We have to fold into operand which would be Imm not into OpNo.
    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      // Untie Src2 of fmac.
      MI->untieRegOperand(3);
      // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
      if (OpNo == 1) {
        MachineOperand &Op1 = MI->getOperand(1);
        MachineOperand &Op2 = MI->getOperand(2);
        Register OldReg = Op1.getReg();
        // Operand 2 might be an inlinable constant
        if (Op2.isImm()) {
          Op1.ChangeToImmediate(Op2.getImm());
          Op2.ChangeToRegister(OldReg, false);
        } else {
          Op1.setReg(Op2.getReg());
          Op2.setReg(OldReg);
        }
      }
      return true;
    }
    MI->setDesc(TII->get(Opc));
    return false;
  };

  bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
  if (!IsLegal && OpToFold.isImm()) {
    if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
      IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
  }

  if (!IsLegal) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
                      AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
      if (AddOpSel)
        MI->addOperand(MachineOperand::CreateImm(0));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      if (AddOpSel)
        MI->removeOperand(MI->getNumExplicitOperands() - 1);
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_fmac_f32 if we are trying to fold into Src2.
    // By transforming into fmaak we can untie Src2 and make folding legal.
    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())
        return true;
    }

    // Special case for s_setreg_b32
    if (OpToFold.isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
    if (!CanCommute)
      return false;

    MachineOperand &Op = MI->getOperand(OpNo);
    MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (!Op.isReg() || !CommutedOp.isReg())
      return false;

    // The same situation with an immediate could reproduce if both inputs are
    // the same register.
    if (Op.isReg() && CommutedOp.isReg() &&
        (Op.getReg() == CommutedOp.getReg() &&
         Op.getSubReg() == CommutedOp.getSubReg()))
      return false;

    if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
      return false;

    int Op32 = -1;
    if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
          (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }

      // Verify the other operand is a VGPR, otherwise we would violate the
      // constant bus restriction.
      MachineOperand &OtherOp = MI->getOperand(OpNo);
      if (!OtherOp.isReg() ||
          !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
        return false;

      assert(MI->getOperand(1).isDef());

      // Make sure to get the 32-bit version of the commuted opcode.
      unsigned MaybeCommutedOpc = MI->getOpcode();
      Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
                        Op32);
    return true;
  }

  // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
  // By changing into fmamk we can untie Src2.
  // If folding for Src0 happens first and it is identical operand to Src1 we
  // should avoid transforming into fmamk which requires commuting as it would
  // cause folding into Src1 to fail later on due to wrong OpNo used.
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())
      return true;
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}
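
// Illustrative sketch, not taken from the sources or tests: when an immediate
// would land in an operand slot that cannot encode it, commuting the
// instruction first can make the fold legal (schematic MIR):
//   %d:vgpr_32, %c:sreg_64 = V_ADD_CO_U32_e64 %a:vgpr_32, %b:vgpr_32, 0
//   -- %b is defined by a V_MOV_B32 of a literal -->
//   commute %a and %b, then fold the literal into what is now the first
//   source, recording the candidate as "commuted" with the e32 shrink opcode.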

bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
                                         const MachineOperand &UseMO) const {
  // Operands of SDWA instructions must be registers.
  return !TII->isSDWA(MI);
}

static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
                                       const MachineRegisterInfo &MRI,
                                       Register SrcReg) {
  MachineOperand *Sub = nullptr;
  for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
       SubDef && TII.isFoldableCopy(*SubDef);
       SubDef = MRI.getVRegDef(Sub->getReg())) {
    MachineOperand &SrcOp = SubDef->getOperand(1);
    if (SrcOp.isImm())
      return &SrcOp;
    if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
      break;
    Sub = &SrcOp;
    // TODO: Support compose
    if (SrcOp.getSubReg())
      break;
  }

  return Sub;
}

const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
    MachineInstr &RegSeq,
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {

  assert(RegSeq.isRegSequence());

  const TargetRegisterClass *RC = nullptr;

  for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
    MachineOperand &SrcOp = RegSeq.getOperand(I);
    unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();

    // Only accept reg_sequence with uniform reg class inputs for simplicity.
    const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
    if (!RC)
      RC = OpRC;
    else if (!TRI->getCommonSubClass(RC, OpRC))
      return nullptr;

    if (SrcOp.getSubReg()) {
      // TODO: Handle subregister compose
      Defs.emplace_back(&SrcOp, SubRegIdx);
      continue;
    }

    MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
    if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
      Defs.emplace_back(DefSrc, SubRegIdx);
      continue;
    }

    Defs.emplace_back(&SrcOp, SubRegIdx);
  }

  return RC;
}

// Find a def of the UseReg, check if it is a reg_sequence and find initializers
// for each subreg, tracking it to an immediate if possible. Returns the
// register class of the inputs on success.
const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg) const {
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return nullptr;

  return getRegSeqInit(*Def, Defs);
}

std::pair<int64_t, const TargetRegisterClass *>
SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
  if (!SrcRC)
    return {};

  bool TryToMatchSplat64 = false;

  int64_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return {};

    int64_t SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      continue;
    }

    if (Imm != SubImm) {
      if (I == 1 && (E & 1) == 0) {
        // If we have an even number of inputs, there's a chance this is a
        // 64-bit element splat broken into 32-bit pieces.
        TryToMatchSplat64 = true;
        break;
      }

      return {}; // Can only fold splat constants
    }
  }

  if (!TryToMatchSplat64)
    return {Defs[0].first->getImm(), SrcRC};

  // Fallback to recognizing 64-bit splats broken into 32-bit pieces
  // (i.e. recognize every other element is 0 for 64-bit immediates)
  int64_t SplatVal64;
  for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
    const MachineOperand *Op0 = Defs[I].first;
    const MachineOperand *Op1 = Defs[I + 1].first;

    if (!Op0->isImm() || !Op1->isImm())
      return {};

    unsigned SubReg0 = Defs[I].second;
    unsigned SubReg1 = Defs[I + 1].second;

    // Assume we're going to generally encounter reg_sequences with sorted
    // subreg indexes, so reject any that aren't consecutive.
    if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
        TRI->getChannelFromSubReg(SubReg1))
      return {};

    int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
    if (I == 0)
      SplatVal64 = MergedVal;
    else if (SplatVal64 != MergedVal)
      return {};
  }

  const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
      MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);

  return {SplatVal64, RC64};
}
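
// Illustrative sketch, not taken from the sources or tests: a reg_sequence
// whose inputs all track back to the same immediate is treated as a splat
// (schematic MIR):
//   %k:sgpr_32 = S_MOV_B32 0
//   %vec:sgpr_128 = REG_SEQUENCE %k, %subreg.sub0, %k, %subreg.sub1,
//                                %k, %subreg.sub2, %k, %subreg.sub3
//   => isRegSeqSplat reports the splat value 0 together with the 32-bit input
//      class, and users of %vec that accept an inline constant can fold the 0.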

bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
    MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
    const TargetRegisterClass *SplatRC) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  // Filter out unhandled pseudos.
  if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
    return false;

  int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
  if (RCID == -1)
    return false;

  const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);

  // Special case 0/-1, since when interpreted as a 64-bit element both halves
  // have the same bits. These are the only cases where a splat has the same
  // interpretation for 32-bit and 64-bit splats.
  if (SplatVal != 0 && SplatVal != -1) {
    // We need to figure out the scalar type read by the operand. e.g. the MFMA
    // operand will be AReg_128, and we want to check if it's compatible with an
    // AReg_32 constant.
    uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
    switch (OpTy) {
    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
    case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
    case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
      break;
    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
    case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
      OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
      break;
    default:
      return false;
    }

    if (!TRI->getCommonSubClass(OpRC, SplatRC))
      return false;
  }

  MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
  if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
    return false;

  return true;
}

bool SIFoldOperandsImpl::tryToFoldACImm(
    const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  // Filter out unhandled pseudos.
  if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
    return false;

  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
  if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
    appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
    return true;
  }

  // TODO: Verify the following code handles subregisters correctly.
  // TODO: Handle extract of global reference
  if (UseOp.getSubReg())
    return false;

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  // Maybe it is just a COPY of an immediate itself.

  // FIXME: Remove this handling. There is already special case folding of
  // immediate into copy in foldOperand. This is looking for the def of the
  // value the folding started from in the first place.
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
                              OpToFold.DefSubReg);
      appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
      return true;
    }
  }

  return false;
}

void SIFoldOperandsImpl::foldOperand(
    FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList,
    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, *UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp->isReg() && OpToFold.isReg()) {
    if (UseOp->isImplicit())
      return;
    // Allow folding from SGPRs to 16-bit VGPRs.
    if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
        (UseOp->getSubReg() != AMDGPU::lo16 ||
         !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    int64_t SplatVal;
    const TargetRegisterClass *SplatRC;
    std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);

    // Grab the use operands first
    SmallVector<MachineOperand *, 4> UsesToProcess(
        llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
    for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
      MachineOperand *RSUse = UsesToProcess[I];
      MachineInstr *RSUseMI = RSUse->getParent();
      unsigned OpNo = RSUseMI->getOperandNo(RSUse);

      if (SplatRC) {
        if (RSUseMI->isCopy()) {
          Register DstReg = RSUseMI->getOperand(0).getReg();
          append_range(UsesToProcess,
                       make_pointer_range(MRI->use_nodbg_operands(DstReg)));
          continue;
        }
        if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
          FoldableDef SplatDef(SplatVal, SplatRC);
          appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
          continue;
        }
      }

      // TODO: Handle general compose
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      // FIXME: We should avoid recursing here. There should be a cleaner split
      // between the in-place mutations and adding to the fold list.
      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // Verify that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    const unsigned Opc = UseMI->getOpcode();
    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
      unsigned CPol =
          TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
      if ((CPol & AMDGPU::CPol::SCAL) &&
          !AMDGPU::supportsScaleOffset(*TII, NewOpc))
        return;

      UseMI->setDesc(TII->get(NewOpc));
    }

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    // In order to fold immediates into copies, we need to change the copy to a
    // MOV. Find a compatible mov instruction with the value.
    for (unsigned MovOp :
         {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
          AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
          AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
          AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
      const MCInstrDesc &MovDesc = TII->get(MovOp);
      assert(MovDesc.getNumDefs() > 0 && MovDesc.operands()[0].RegClass != -1);

      const TargetRegisterClass *MovDstRC =
          TRI->getRegClass(MovDesc.operands()[0].RegClass);

      // Fold if the destination register class of the MOV instruction (ResRC)
      // is a superclass of (or equal to) the destination register class of the
      // COPY (DestRC). If this condition fails, folding would be illegal.
      if (!DestRC->hasSuperClassEq(MovDstRC))
        continue;

      const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
      const TargetRegisterClass *MovSrcRC =
          TRI->getRegClass(MovDesc.operands()[SrcIdx].RegClass);
      if (MovSrcRC) {
        if (UseSubReg)
          MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);

        // FIXME: We should be able to directly check immediate operand legality
        // for all cases, but gfx908 hacks break.
        if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
            (!OpToFold.isImm() ||
             !TII->isImmOperandLegal(MovDesc, SrcIdx,
                                     *OpToFold.getEffectiveImmVal())))
          break;

        if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
          break;

        // FIXME: This is mutating the instruction only and deferring the actual
        // fold of the immediate
      } else {
        // For the _IMM_PSEUDO cases, there can be value restrictions on the
        // immediate to verify. Technically we should always verify this, but it
        // only matters for these concrete cases.
        // TODO: Handle non-imm case if it's useful.
        if (!OpToFold.isImm() ||
            !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
          break;
      }

      MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
      MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
      while (ImpOpI != ImpOpE) {
        MachineInstr::mop_iterator Tmp = ImpOpI;
        ImpOpI++;
        UseMI->removeOperand(UseMI->getOperandNo(Tmp));
      }
      UseMI->setDesc(MovDesc);

      if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
        const auto &SrcOp = UseMI->getOperand(UseOpIdx);
        MachineOperand NewSrcOp(SrcOp);
        MachineFunction *MF = UseMI->getParent()->getParent();
        UseMI->removeOperand(1);
        UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
        UseMI->addOperand(NewSrcOp);                          // src0
        UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
        UseOpIdx = SrcIdx;
        UseOp = &UseMI->getOperand(UseOpIdx);
      }
      CopiesToReplace.push_back(UseMI);
      break;
    }

    // We failed to replace the copy, so give up.
    if (UseMI->getOpcode() == AMDGPU::COPY)
      return;

  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg() &&
        OpToFold.DefMI->implicit_operands().empty()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
                        << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      UseMI->getOperand(1).setReg(UseReg);
      unsigned SubRegIdx = OpToFold.getSubReg();
      // Hack to allow 32-bit SGPRs to be folded into True16 instructions
      // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
      // VS_16RegClass
      //
      // Excerpt from AMDGPUGenRegisterInfo.inc
      // NoSubRegister, //0
      // hi16, // 1
      // lo16, // 2
      // sub0, // 3
      // ...
      // sub1, // 11
      // sub1_hi16, // 12
      // sub1_lo16, // 13
      static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
      if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          TRI->isSGPRReg(*MRI, UseReg)) {
        // Produce the 32 bit subregister index to which the 16-bit subregister
        // is aligned.
        if (SubRegIdx > AMDGPU::sub1) {
          LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
          M |= M.getLane(M.getHighestLane() - 1);
          SmallVector<unsigned, 4> Indexes;
          TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
                                        Indexes);
          assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
          SubRegIdx = Indexes[0];
          // 32-bit registers do not have a sub0 index
        } else if (TII->getOpSize(*UseMI, 1) == 4)
          SubRegIdx = 0;
        else
          SubRegIdx = AMDGPU::sub0;
      }
      UseMI->getOperand(1).setSubReg(SubRegIdx);
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.OpToFold->setIsKill(false);

      // Remove kill flags as kills may now be out of order with uses.
      MRI->clearKillFlags(UseReg);
      if (foldCopyToAGPRRegSequence(UseMI))
        return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.DefMI, *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm()) {
          UseMI->getOperand(1).ChangeToImmediate(
              *OpToFold.getEffectiveImmVal());
        } else if (OpToFold.isFI())
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
        else {
          assert(OpToFold.isGlobal());
          UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
                                          OpToFold.OpToFold->getOffset(),
                                          OpToFold.OpToFold->getTargetFlags());
        }
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.DefMI, *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() || UseOp->isImplicit() ||
        UseDesc.operands()[UseOpIdx].RegClass == -1)
      return;
  }

  // FIXME: We could try to change the instruction from 64-bit to 32-bit
  // to enable more folding opportunities. The shrink operands pass
  // already does this.

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
}

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);

  // Remove any leftover implicit operands from mutating the instruction. e.g.
  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
  // anymore.
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}

std::optional<int64_t>
SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg() || !Op.getReg().isVirtual())
    return std::nullopt;

  const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    const MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
  }

  return std::nullopt;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;

  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0Imm) {
    MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
  std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);

  if (!Src0Imm && !Src1Imm)
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0Imm && Src1Imm) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
      return false;

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0Imm && !Src1Imm) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
    std::swap(Src0Imm, Src1Imm);
  }

  int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}
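
// Illustrative sketch, not taken from the sources or tests: once immediates
// have been propagated, a binary op whose inputs are both known constants can
// be evaluated outright (schematic MIR, assuming the implicit $scc def is
// dead):
//   %a:sgpr_32 = S_MOV_B32 0xff00
//   %b:sgpr_32 = S_MOV_B32 0x00ff
//   %c:sgpr_32 = S_OR_B32 %a, %b, implicit-def dead $scc
//   -->
//   %c:sgpr_32 = S_MOV_B32 0xffff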
1698
1699// Try to fold an instruction into a simpler one
1700bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1701 unsigned Opc = MI.getOpcode();
1702 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1703 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1704 return false;
1705
1706 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1707 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1708 if (!Src1->isIdenticalTo(*Src0)) {
1709 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1710 if (!Src1Imm)
1711 return false;
1712
1713 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1714 if (!Src0Imm || *Src0Imm != *Src1Imm)
1715 return false;
1716 }
1717
1718 int Src1ModIdx =
1719 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1720 int Src0ModIdx =
1721 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1722 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1723 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1724 return false;
1725
1726 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1727 auto &NewDesc =
1728 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1729 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1730 if (Src2Idx != -1)
1731 MI.removeOperand(Src2Idx);
1732 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1733 if (Src1ModIdx != -1)
1734 MI.removeOperand(Src1ModIdx);
1735 if (Src0ModIdx != -1)
1736 MI.removeOperand(Src0ModIdx);
1737 mutateCopyOp(MI, NewDesc);
1738 LLVM_DEBUG(dbgs() << MI);
1739 return true;
1740}
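// Worked example (illustrative MIR; operands are hypothetical): a select
// whose two data operands are identical,
//   %d:vgpr_32 = V_CNDMASK_B32_e64 0, %a, 0, %a, %cc
// does not depend on the condition, so once the source modifiers are
// verified to be zero the instruction is rewritten to
//   %d:vgpr_32 = COPY %a
// (or to a V_MOV_B32 when the common value is an immediate).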
1741
1742bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1743 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1744 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1745 return false;
1746
1747 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
1748 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1749 return false;
1750
1751 Register Src1 = MI.getOperand(2).getReg();
1752 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1753 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1754 return false;
1755
1756 Register Dst = MI.getOperand(0).getReg();
1757 MRI->replaceRegWith(Dst, Src1);
1758 if (!MI.getOperand(2).isKill())
1759 MRI->clearKillFlags(Src1);
1760 MI.eraseFromParent();
1761 return true;
1762}
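// Worked example (illustrative; "zext16-producing op" stands for any opcode
// for which zeroesHigh16BitsOfDest() returns true):
//   %x:vgpr_32 = <zext16-producing op> ...
//   %y:vgpr_32 = V_AND_B32_e32 0xffff, %x
// The mask is redundant because the producer already clears bits 31..16, so
// all uses of %y are rewritten to %x and the AND is erased.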
1763
1764bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1765 const FoldableDef &OpToFold) const {
1766 // We need to mutate the operands of new mov instructions to add implicit
1767 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1768 // this.
1769 SmallVector<MachineInstr *, 4> CopiesToReplace;
1770 SmallVector<FoldCandidate, 4> FoldList;
1771 MachineOperand &Dst = MI.getOperand(0);
1772 bool Changed = false;
1773
1774 if (OpToFold.isImm()) {
1775 for (auto &UseMI :
1776 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1777 // Folding the immediate may reveal operations that can be constant
1778 // folded or replaced with a copy. This can happen for example after
1779 // frame indices are lowered to constants or from splitting 64-bit
1780 // constants.
1781 //
1782 // We may also encounter cases where one or both operands are
1783 // immediates materialized into a register, which would ordinarily not
1784 // be folded due to multiple uses or operand constraints.
1785 if (tryConstantFoldOp(&UseMI)) {
1786 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1787 Changed = true;
1788 }
1789 }
1790 }
1791
1792 SmallVector<MachineOperand *, 4> UsesToProcess(
1793 llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
1794 for (auto *U : UsesToProcess) {
1795 MachineInstr *UseMI = U->getParent();
1796
1797 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1798 foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1799 CopiesToReplace);
1800 }
1801
1802 if (CopiesToReplace.empty() && FoldList.empty())
1803 return Changed;
1804
1805 MachineFunction *MF = MI.getParent()->getParent();
1806 // Make sure we add EXEC uses to any new v_mov instructions created.
1807 for (MachineInstr *Copy : CopiesToReplace)
1808 Copy->addImplicitDefUseOperands(*MF);
1809
1810 SetVector<MachineInstr *> ConstantFoldCandidates;
1811 for (FoldCandidate &Fold : FoldList) {
1812 assert(!Fold.isReg() || Fold.Def.OpToFold);
1813 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1814 Register Reg = Fold.getReg();
1815 const MachineInstr *DefMI = Fold.Def.DefMI;
1816 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1817 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1818 continue;
1819 }
1820 if (updateOperand(Fold)) {
1821 // Clear kill flags.
1822 if (Fold.isReg()) {
1823 assert(Fold.Def.OpToFold && Fold.isReg());
1824 // FIXME: Probably shouldn't bother trying to fold if not an
1825 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1826 // copies.
1827 MRI->clearKillFlags(Fold.getReg());
1828 }
1829 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1830 << static_cast<int>(Fold.UseOpNo) << " of "
1831 << *Fold.UseMI);
1832
1833 if (Fold.isImm())
1834 ConstantFoldCandidates.insert(Fold.UseMI);
1835
1836 } else if (Fold.Commuted) {
1837 // Restore the instruction's original operand order if the fold failed.
1838 TII->commuteInstruction(*Fold.UseMI, false);
1839 }
1840 }
1841
1842 for (MachineInstr *MI : ConstantFoldCandidates) {
1843 if (tryConstantFoldOp(MI)) {
1844 LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
1845 Changed = true;
1846 }
1847 }
1848 return true;
1849}
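// Flow summary (descriptive note): an immediate def first gives each user a
// chance to be constant folded outright; every remaining use operand is then
// offered the value, adjusted for the use's subregister, via foldOperand();
// finally the accumulated FoldList is applied, skipping register folds when
// EXEC may change between def and use, restoring commuted instructions on
// failure, and re-running tryConstantFoldOp() on users that received an
// immediate.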
1850
1851/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
1852/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
1853bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1854 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
1855 // only accept VGPR or inline immediate. Recreate a reg_sequence with its
1856 // initializers right here, so we will rematerialize immediates and avoid
1857 // copies via different reg classes.
1858 const TargetRegisterClass *DefRC =
1859 MRI->getRegClass(CopyMI->getOperand(0).getReg());
1860 if (!TRI->isAGPRClass(DefRC))
1861 return false;
1862
1863 Register UseReg = CopyMI->getOperand(1).getReg();
1864 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1865 if (!RegSeq || !RegSeq->isRegSequence())
1866 return false;
1867
1868 const DebugLoc &DL = CopyMI->getDebugLoc();
1869 MachineBasicBlock &MBB = *CopyMI->getParent();
1870
1871 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1872 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1873
1874 const TargetRegisterClass *UseRC =
1875 MRI->getRegClass(CopyMI->getOperand(1).getReg());
1876
1877 // Value, subregister index pairs for the new REG_SEQUENCE
1878 SmallVector<std::pair<MachineOperand *, unsigned>, 32> NewDefs;
1879
1880 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1881 unsigned NumFoldable = 0;
1882
1883 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1884 MachineOperand &RegOp = RegSeq->getOperand(I);
1885 unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
1886
1887 if (RegOp.getSubReg()) {
1888 // TODO: Handle subregister compose
1889 NewDefs.emplace_back(&RegOp, SubRegIdx);
1890 continue;
1891 }
1892
1893 MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
1894 if (!Lookup)
1895 Lookup = &RegOp;
1896
1897 if (Lookup->isImm()) {
1898 // Check if this is an agpr_32 subregister.
1899 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1900 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1901 if (DestSuperRC &&
1902 TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1903 ++NumFoldable;
1904 NewDefs.emplace_back(Lookup, SubRegIdx);
1905 continue;
1906 }
1907 }
1908
1909 const TargetRegisterClass *InputRC =
1910 Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
1911 : MRI->getRegClass(RegOp.getReg());
1912
1913 // TODO: Account for Lookup->getSubReg()
1914
1915 // If we can't find a matching super class, this is an SGPR->AGPR or
1916 // VGPR->AGPR subreg copy (or something constant-like we have to materialize
1917 // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
1918 // want to rewrite to copy to an intermediate VGPR class.
1919 const TargetRegisterClass *MatchRC =
1920 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1921 if (!MatchRC) {
1922 ++NumFoldable;
1923 NewDefs.emplace_back(&RegOp, SubRegIdx);
1924 continue;
1925 }
1926
1927 NewDefs.emplace_back(&RegOp, SubRegIdx);
1928 }
1929
1930 // Do not clone a reg_sequence and merely change the result register class.
1931 if (NumFoldable == 0)
1932 return false;
1933
1934 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1935 for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
1936 CopyMI->removeOperand(I);
1937
1938 for (auto [Def, DestSubIdx] : NewDefs) {
1939 if (!Def->isReg()) {
1940 // TODO: Should we use a single write for each repeated value, like in
1941 // the register case?
1942 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1943 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1944 .add(*Def);
1945 B.addReg(Tmp);
1946 } else {
1947 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1948 Def->setIsKill(false);
1949
1950 Register &VGPRCopy = VGPRCopies[Src];
1951 if (!VGPRCopy) {
1952 const TargetRegisterClass *VGPRUseSubRC =
1953 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1954
1955 // We cannot build a reg_sequence out of the same registers; they
1956 // must be copied. Better to do it here, before copyPhysReg() creates
1957 // several reads to do the AGPR->VGPR->AGPR copy.
1958
1959 // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
1960 // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
1961 // later, create a copy here and track if we already have such a copy.
1962 const TargetRegisterClass *SubRC =
1963 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1964 if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
1965 // TODO: Try to reconstrain class
1966 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1967 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
1968 B.addReg(VGPRCopy);
1969 } else {
1970 // If it is already a VGPR, do not copy the register.
1971 B.add(*Def);
1972 }
1973 } else {
1974 B.addReg(VGPRCopy);
1975 }
1976 }
1977
1978 B.addImm(DestSubIdx);
1979 }
1980
1981 LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
1982 return true;
1983}
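// Worked example (illustrative MIR; register numbers are hypothetical):
//   %seq:vreg_64 = REG_SEQUENCE %v0:vgpr_32, %subreg.sub0, %v1:vgpr_32, %subreg.sub1
//   %dst:areg_64 = COPY %seq
// where %v0 was materialized from an inline constant. The COPY itself becomes
// an AGPR REG_SEQUENCE: inline-constant elements are written with
// V_ACCVGPR_WRITE_B32_e64, and register elements are copied into a VGPR at
// most once per source (tracked in VGPRCopies) to avoid the expanded
// SGPR->VGPR->AGPR copies copyPhysReg() would otherwise emit on gfx908.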
1984
1985bool SIFoldOperandsImpl::tryFoldFoldableCopy(
1986 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1987 Register DstReg = MI.getOperand(0).getReg();
1988 // Specially track simple redefs of m0 to the same value in a block, so we
1989 // can erase the later ones.
1990 if (DstReg == AMDGPU::M0) {
1991 MachineOperand &NewM0Val = MI.getOperand(1);
1992 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1993 MI.eraseFromParent();
1994 return true;
1995 }
1996
1997 // We aren't tracking other physical registers
1998 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1999 ? nullptr
2000 : &NewM0Val;
2001 return false;
2002 }
2003
2004 MachineOperand *OpToFoldPtr;
2005 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
2006 // Folding when any src_modifiers are non-zero is unsupported
2007 if (TII->hasAnyModifiersSet(MI))
2008 return false;
2009 OpToFoldPtr = &MI.getOperand(2);
2010 } else
2011 OpToFoldPtr = &MI.getOperand(1);
2012 MachineOperand &OpToFold = *OpToFoldPtr;
2013 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
2014
2015 // FIXME: We could also be folding things like TargetIndexes.
2016 if (!FoldingImm && !OpToFold.isReg())
2017 return false;
2018
2019 // Fold virtual registers and constant physical registers.
2020 if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
2021 !TRI->isConstantPhysReg(OpToFold.getReg()))
2022 return false;
2023
2024 // Prevent folding operands backwards in the function. For example,
2025 // the COPY opcode must not be replaced by 1 in this example:
2026 //
2027 // %3 = COPY %vgpr0; VGPR_32:%3
2028 // ...
2029 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
2030 if (!DstReg.isVirtual())
2031 return false;
2032
2033 const TargetRegisterClass *DstRC =
2034 MRI->getRegClass(MI.getOperand(0).getReg());
2035
2036 // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
2037 // Can remove this code if proper 16-bit SGPRs are implemented
2038 // Example: Pre-peephole-opt
2039 // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
2040 // %32:sreg_32 = COPY %29:sgpr_lo16
2041 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2042 // Post-peephole-opt and DCE
2043 // %32:sreg_32 = COPY %16.lo16:sreg_32
2044 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2045 // After this transform
2046 // %32:sreg_32 = COPY %16:sreg_32
2047 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2048 // After the fold operands pass
2049 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
2050 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2051 OpToFold.getSubReg()) {
2052 if (DstRC == &AMDGPU::SReg_32RegClass &&
2053 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2054 assert(OpToFold.getSubReg() == AMDGPU::lo16);
2055 OpToFold.setSubReg(0);
2056 }
2057 }
2058
2059 // Fold copy to AGPR through reg_sequence
2060 // TODO: Handle with subregister extract
2061 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2062 if (foldCopyToAGPRRegSequence(&MI))
2063 return true;
2064 }
2065
2066 FoldableDef Def(OpToFold, DstRC);
2067 bool Changed = foldInstOperand(MI, Def);
2068
2069 // If we managed to fold all uses of this copy then we might as well
2070 // delete it now.
2071 // The only reason we need to follow chains of copies here is that
2072 // tryFoldRegSequence looks forward through copies before folding a
2073 // REG_SEQUENCE into its eventual users.
2074 auto *InstToErase = &MI;
2075 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2076 auto &SrcOp = InstToErase->getOperand(1);
2077 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
2078 InstToErase->eraseFromParent();
2079 Changed = true;
2080 InstToErase = nullptr;
2081 if (!SrcReg || SrcReg.isPhysical())
2082 break;
2083 InstToErase = MRI->getVRegDef(SrcReg);
2084 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2085 break;
2086 }
2087
2088 if (InstToErase && InstToErase->isRegSequence() &&
2089 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2090 InstToErase->eraseFromParent();
2091 Changed = true;
2092 }
2093
2094 if (Changed)
2095 return true;
2096
2097 // Run this after foldInstOperand to avoid turning scalar additions into
2098 // vector additions when the scalar result could just be folded into
2099 // the user(s).
2100 return OpToFold.isReg() &&
2101 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2102}
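// Descriptive note: this is the per-instruction driver for foldable copies
// and moves. It tracks redundant redefinitions of m0, normalizes the True16
// lo16 SGPR copies described above, attempts the AGPR reg_sequence rewrite,
// and then hands the source operand to foldInstOperand(); copy chains whose
// results end up unused are erased before returning.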
2103
2104// Clamp patterns are canonically selected to v_max_* instructions, so only
2105// handle them.
2106const MachineOperand *
2107SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2108 unsigned Op = MI.getOpcode();
2109 switch (Op) {
2110 case AMDGPU::V_MAX_F32_e64:
2111 case AMDGPU::V_MAX_F16_e64:
2112 case AMDGPU::V_MAX_F16_t16_e64:
2113 case AMDGPU::V_MAX_F16_fake16_e64:
2114 case AMDGPU::V_MAX_F64_e64:
2115 case AMDGPU::V_MAX_NUM_F64_e64:
2116 case AMDGPU::V_PK_MAX_F16:
2117 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2118 case AMDGPU::V_PK_MAX_NUM_BF16: {
2119 if (MI.mayRaiseFPException())
2120 return nullptr;
2121
2122 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2123 return nullptr;
2124
2125 // Make sure sources are identical.
2126 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2127 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2128 if (!Src0->isReg() || !Src1->isReg() ||
2129 Src0->getReg() != Src1->getReg() ||
2130 Src0->getSubReg() != Src1->getSubReg() ||
2131 Src0->getSubReg() != AMDGPU::NoSubRegister)
2132 return nullptr;
2133
2134 // Can't fold up if we have modifiers.
2135 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2136 return nullptr;
2137
2138 unsigned Src0Mods
2139 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2140 unsigned Src1Mods
2141 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2142
2143 // Having a 0 op_sel_hi would require swizzling the output in the source
2144 // instruction, which we can't do.
2145 unsigned UnsetMods =
2146 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2147 ? SISrcMods::OP_SEL_1
2148 : 0u;
2149 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2150 return nullptr;
2151 return Src0;
2152 }
2153 default:
2154 return nullptr;
2155 }
2156}
2157
2158// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
2159bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2160 const MachineOperand *ClampSrc = isClamp(MI);
2161 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2162 return false;
2163
2164 if (!ClampSrc->getReg().isVirtual())
2165 return false;
2166
2167 // Look through COPY; such a copy is only observed with True16.
2168 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2169 MachineInstr *Def =
2170 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2171
2172 // The type of clamp must be compatible.
2173 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
2174 return false;
2175
2176 if (Def->mayRaiseFPException())
2177 return false;
2178
2179 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2180 if (!DefClamp)
2181 return false;
2182
2183 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2184
2185 // Clamp is applied after omod, so it is OK if omod is set.
2186 DefClamp->setImm(1);
2187
2188 Register DefReg = Def->getOperand(0).getReg();
2189 Register MIDstReg = MI.getOperand(0).getReg();
2190 if (TRI->isSGPRReg(*MRI, DefReg)) {
2191 // Pseudo scalar instructions have an SGPR dst, while clamp is a v_max*
2192 // instruction with a VGPR dst.
2193 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2194 MIDstReg)
2195 .addReg(DefReg);
2196 } else {
2197 MRI->replaceRegWith(MIDstReg, DefReg);
2198 }
2199 MI.eraseFromParent();
2200
2201 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2202 // instruction, so we might as well convert it to the more flexible VOP3-only
2203 // mad/fma form.
2204 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2205 Def->eraseFromParent();
2206
2207 return true;
2208}
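// Worked example (illustrative MIR; operand encodings are hypothetical):
//   %v:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
//   %c:vgpr_32 = V_MAX_F32_e64 0, %v, 0, %v, 1 /*clamp*/, 0
// With %v having a single user and compatible clamp semantics, the clamp bit
// is set on the defining add, uses of %c are rewritten to %v, and the max is
// erased.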
2209
2210static int getOModValue(unsigned Opc, int64_t Val) {
2211 switch (Opc) {
2212 case AMDGPU::V_MUL_F64_e64:
2213 case AMDGPU::V_MUL_F64_pseudo_e64: {
2214 switch (Val) {
2215 case 0x3fe0000000000000: // 0.5
2216 return SIOutMods::DIV2;
2217 case 0x4000000000000000: // 2.0
2218 return SIOutMods::MUL2;
2219 case 0x4010000000000000: // 4.0
2220 return SIOutMods::MUL4;
2221 default:
2222 return SIOutMods::NONE;
2223 }
2224 }
2225 case AMDGPU::V_MUL_F32_e64: {
2226 switch (static_cast<uint32_t>(Val)) {
2227 case 0x3f000000: // 0.5
2228 return SIOutMods::DIV2;
2229 case 0x40000000: // 2.0
2230 return SIOutMods::MUL2;
2231 case 0x40800000: // 4.0
2232 return SIOutMods::MUL4;
2233 default:
2234 return SIOutMods::NONE;
2235 }
2236 }
2237 case AMDGPU::V_MUL_F16_e64:
2238 case AMDGPU::V_MUL_F16_t16_e64:
2239 case AMDGPU::V_MUL_F16_fake16_e64: {
2240 switch (static_cast<uint16_t>(Val)) {
2241 case 0x3800: // 0.5
2242 return SIOutMods::DIV2;
2243 case 0x4000: // 2.0
2244 return SIOutMods::MUL2;
2245 case 0x4400: // 4.0
2246 return SIOutMods::MUL4;
2247 default:
2248 return SIOutMods::NONE;
2249 }
2250 }
2251 default:
2252 llvm_unreachable("invalid mul opcode");
2253 }
2254}
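// Quick reference (restating the switch above): the immediate is interpreted
// in the width of the multiply, so for V_MUL_F32_e64 the bit patterns
// 0x3f000000 (0.5), 0x40000000 (2.0) and 0x40800000 (4.0) select the output
// modifiers DIV2, MUL2 and MUL4 respectively; any other value yields NONE and
// blocks the fold.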
2255
2256// FIXME: Does this really not support denormals with f16?
2257// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
2258// handled, so will anything other than that break?
2259std::pair<const MachineOperand *, int>
2260SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2261 unsigned Op = MI.getOpcode();
2262 switch (Op) {
2263 case AMDGPU::V_MUL_F64_e64:
2264 case AMDGPU::V_MUL_F64_pseudo_e64:
2265 case AMDGPU::V_MUL_F32_e64:
2266 case AMDGPU::V_MUL_F16_t16_e64:
2267 case AMDGPU::V_MUL_F16_fake16_e64:
2268 case AMDGPU::V_MUL_F16_e64: {
2269 // If output denormals are enabled, omod is ignored.
2270 if ((Op == AMDGPU::V_MUL_F32_e64 &&
2271 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2272 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2273 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2274 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2275 MFI->getMode().FP64FP16Denormals.Output !=
2276 DenormalMode::PreserveSign) ||
2277 MI.mayRaiseFPException())
2278 return std::pair(nullptr, SIOutMods::NONE);
2279
2280 const MachineOperand *RegOp = nullptr;
2281 const MachineOperand *ImmOp = nullptr;
2282 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2283 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2284 if (Src0->isImm()) {
2285 ImmOp = Src0;
2286 RegOp = Src1;
2287 } else if (Src1->isImm()) {
2288 ImmOp = Src1;
2289 RegOp = Src0;
2290 } else
2291 return std::pair(nullptr, SIOutMods::NONE);
2292
2293 int OMod = getOModValue(Op, ImmOp->getImm());
2294 if (OMod == SIOutMods::NONE ||
2295 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2296 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2297 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2298 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2299 return std::pair(nullptr, SIOutMods::NONE);
2300
2301 return std::pair(RegOp, OMod);
2302 }
2303 case AMDGPU::V_ADD_F64_e64:
2304 case AMDGPU::V_ADD_F64_pseudo_e64:
2305 case AMDGPU::V_ADD_F32_e64:
2306 case AMDGPU::V_ADD_F16_e64:
2307 case AMDGPU::V_ADD_F16_t16_e64:
2308 case AMDGPU::V_ADD_F16_fake16_e64: {
2309 // If output denormals are enabled, omod is ignored.
2310 if ((Op == AMDGPU::V_ADD_F32_e64 &&
2311 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
2312 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2313 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2314 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2315 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
2316 return std::pair(nullptr, SIOutMods::NONE);
2317
2318 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
2319 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2320 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2321
2322 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
2323 Src0->getSubReg() == Src1->getSubReg() &&
2324 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2325 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2326 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2327 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2328 return std::pair(Src0, SIOutMods::MUL2);
2329
2330 return std::pair(nullptr, SIOutMods::NONE);
2331 }
2332 default:
2333 return std::pair(nullptr, SIOutMods::NONE);
2334 }
2335}
2336
2337// FIXME: Does this need to check IEEE bit on function?
2338bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2339 const MachineOperand *RegOp;
2340 int OMod;
2341 std::tie(RegOp, OMod) = isOMod(MI);
2342 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2343 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2344 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2345 return false;
2346
2347 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2348 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2349 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2350 return false;
2351
2352 if (Def->mayRaiseFPException())
2353 return false;
2354
2355 // Clamp is applied after omod. If the source already has clamp set, don't
2356 // fold it.
2357 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2358 return false;
2359
2360 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2361
2362 DefOMod->setImm(OMod);
2363 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2364 // Kill flags can be wrong if we replaced a def inside a loop with a def
2365 // outside the loop.
2366 MRI->clearKillFlags(Def->getOperand(0).getReg());
2367 MI.eraseFromParent();
2368
2369 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2370 // instruction, so we might as well convert it to the more flexible VOP3-only
2371 // mad/fma form.
2372 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2373 Def->eraseFromParent();
2374
2375 return true;
2376}
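// Worked example (illustrative MIR; operands are hypothetical):
//   %v:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
//   %m:vgpr_32 = V_MUL_F32_e64 0, 2.0, 0, %v, 0, 0
// When %v has a single user, no clamp is set on the def, and the mode checks
// in isOMod() pass, the multiply by 2.0 becomes omod = MUL2 on the add, uses
// of %m are rewritten to %v, and the multiply is erased.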
2377
2378// Try to fold a reg_sequence with vgpr output and agpr inputs into an
2379// instruction which can take an agpr. So far that means a store.
2380bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2381 assert(MI.isRegSequence());
2382 auto Reg = MI.getOperand(0).getReg();
2383
2384 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
2385 !MRI->hasOneNonDBGUse(Reg))
2386 return false;
2387
2388 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
2389 if (!getRegSeqInit(Defs, Reg))
2390 return false;
2391
2392 for (auto &[Op, SubIdx] : Defs) {
2393 if (!Op->isReg())
2394 return false;
2395 if (TRI->isAGPR(*MRI, Op->getReg()))
2396 continue;
2397 // Maybe this is a COPY from AREG
2398 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2399 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
2400 return false;
2401 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
2402 return false;
2403 }
2404
2405 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2406 MachineInstr *UseMI = Op->getParent();
2407 while (UseMI->isCopy() && !Op->getSubReg()) {
2408 Reg = UseMI->getOperand(0).getReg();
2409 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2410 return false;
2411 Op = &*MRI->use_nodbg_begin(Reg);
2412 UseMI = Op->getParent();
2413 }
2414
2415 if (Op->getSubReg())
2416 return false;
2417
2418 unsigned OpIdx = Op - &UseMI->getOperand(0);
2419 const MCInstrDesc &InstDesc = UseMI->getDesc();
2420 const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI);
2421 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2422 return false;
2423
2424 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2425 auto Dst = MRI->createVirtualRegister(NewDstRC);
2426 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2427 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2428
2429 for (auto &[Def, SubIdx] : Defs) {
2430 Def->setIsKill(false);
2431 if (TRI->isAGPR(*MRI, Def->getReg())) {
2432 RS.add(*Def);
2433 } else { // This is a copy
2434 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2435 SubDef->getOperand(1).setIsKill(false);
2436 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
2437 }
2438 RS.addImm(SubIdx);
2439 }
2440
2441 Op->setReg(Dst);
2442 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
2443 Op->setReg(Reg);
2444 RS->eraseFromParent();
2445 return false;
2446 }
2447
2448 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
2449
2450 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
2451 // in which case we can erase them all later in runOnMachineFunction.
2452 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2453 MI.eraseFromParent();
2454 return true;
2455}
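// Worked example (illustrative MIR; register numbers are hypothetical):
//   %v:vreg_128 = REG_SEQUENCE %a0:agpr_32, %subreg.sub0, ..., %a3:agpr_32, %subreg.sub3
//   GLOBAL_STORE_DWORDX4 %ptr, %v, ...
// If the store's data operand accepts an AV (vector super) register class,
// the reg_sequence is rebuilt with an AGPR result class and the store uses it
// directly, avoiding AGPR->VGPR copies for every element.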
2456
2457/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success and
2458/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
2459static bool isAGPRCopy(const SIRegisterInfo &TRI,
2460 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2461 Register &OutReg, unsigned &OutSubReg) {
2462 assert(Copy.isCopy());
2463
2464 const MachineOperand &CopySrc = Copy.getOperand(1);
2465 Register CopySrcReg = CopySrc.getReg();
2466 if (!CopySrcReg.isVirtual())
2467 return false;
2468
2469 // Common case: copy from AGPR directly, e.g.
2470 // %1:vgpr_32 = COPY %0:agpr_32
2471 if (TRI.isAGPR(MRI, CopySrcReg)) {
2472 OutReg = CopySrcReg;
2473 OutSubReg = CopySrc.getSubReg();
2474 return true;
2475 }
2476
2477 // Sometimes it can also involve two copies, e.g.
2478 // %1:vgpr_256 = COPY %0:agpr_256
2479 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2480 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2481 if (!CopySrcDef || !CopySrcDef->isCopy())
2482 return false;
2483
2484 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2485 Register OtherCopySrcReg = OtherCopySrc.getReg();
2486 if (!OtherCopySrcReg.isVirtual() ||
2487 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2488 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2489 !TRI.isAGPR(MRI, OtherCopySrcReg))
2490 return false;
2491
2492 OutReg = OtherCopySrcReg;
2493 OutSubReg = CopySrc.getSubReg();
2494 return true;
2495}
2496
2497// Try to hoist an AGPR to VGPR copy across a PHI.
2498// This should allow folding of an AGPR into a consumer which may support it.
2499//
2500// Example 1: LCSSA PHI
2501// loop:
2502// %1:vreg = COPY %0:areg
2503// exit:
2504// %2:vreg = PHI %1:vreg, %loop
2505// =>
2506// loop:
2507// exit:
2508// %1:areg = PHI %0:areg, %loop
2509// %2:vreg = COPY %1:areg
2510//
2511// Example 2: PHI with multiple incoming values:
2512// entry:
2513// %1:vreg = GLOBAL_LOAD(..)
2514// loop:
2515// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2516// %3:areg = COPY %2:vreg
2517// %4:areg = (instr using %3:areg)
2518// %5:vreg = COPY %4:areg
2519// =>
2520// entry:
2521// %1:vreg = GLOBAL_LOAD(..)
2522// %2:areg = COPY %1:vreg
2523// loop:
2524// %3:areg = PHI %2:areg, %entry, %X:areg,
2525// %4:areg = (instr using %3:areg)
2526bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2527 assert(PHI.isPHI());
2528
2529 Register PhiOut = PHI.getOperand(0).getReg();
2530 if (!TRI->isVGPR(*MRI, PhiOut))
2531 return false;
2532
2533 // Iterate once over all incoming values of the PHI to check if this PHI is
2534 // eligible, and determine the exact AGPR RC we'll target.
2535 const TargetRegisterClass *ARC = nullptr;
2536 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2537 MachineOperand &MO = PHI.getOperand(K);
2538 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2539 if (!Copy || !Copy->isCopy())
2540 continue;
2541
2542 Register AGPRSrc;
2543 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2544 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2545 continue;
2546
2547 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2548 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2549 CopyInRC = SubRC;
2550
2551 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2552 return false;
2553 ARC = CopyInRC;
2554 }
2555
2556 if (!ARC)
2557 return false;
2558
2559 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2560
2561 // Rewrite the PHI's incoming values to ARC.
2562 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2563 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2564 MachineOperand &MO = PHI.getOperand(K);
2565 Register Reg = MO.getReg();
2566
2567 MachineBasicBlock::iterator InsertPt;
2568 MachineBasicBlock *InsertMBB = nullptr;
2569
2570 // Look at the def of Reg, ignoring all copies.
2571 unsigned CopyOpc = AMDGPU::COPY;
2572 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2573
2574 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2575 // the copy was single-use, it will be removed by DCE later.
2576 if (Def->isCopy()) {
2577 Register AGPRSrc;
2578 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2579 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2580 MO.setReg(AGPRSrc);
2581 MO.setSubReg(AGPRSubReg);
2582 continue;
2583 }
2584
2585 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2586 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2587 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2588 // is unlikely to be profitable.
2589 //
2590 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2591 MachineOperand &CopyIn = Def->getOperand(1);
2592 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2593 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2594 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2595 }
2596
2597 InsertMBB = Def->getParent();
2598 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2599 } else {
2600 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2601 InsertPt = InsertMBB->getFirstTerminator();
2602 }
2603
2604 Register NewReg = MRI->createVirtualRegister(ARC);
2605 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2606 TII->get(CopyOpc), NewReg)
2607 .addReg(Reg);
2608 MO.setReg(NewReg);
2609
2610 (void)MI;
2611 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2612 }
2613
2614 // Replace the PHI's result with a new register.
2615 Register NewReg = MRI->createVirtualRegister(ARC);
2616 PHI.getOperand(0).setReg(NewReg);
2617
2618 // COPY that new register back to the original PhiOut register. This COPY will
2619 // usually be folded out later.
2620 MachineBasicBlock *MBB = PHI.getParent();
2621 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2622 TII->get(AMDGPU::COPY), PhiOut)
2623 .addReg(NewReg);
2624
2625 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2626 return true;
2627}
2628
2629// Attempt to convert VGPR load to an AGPR load.
2630bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2631 assert(MI.mayLoad());
2632 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2633 return false;
2634
2635 MachineOperand &Def = MI.getOperand(0);
2636 if (!Def.isDef())
2637 return false;
2638
2639 Register DefReg = Def.getReg();
2640
2641 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2642 return false;
2643
2644 SmallVector<const MachineInstr *, 8> Users(
2645 llvm::make_pointer_range(MRI->use_nodbg_instructions(DefReg)));
2646 SmallVector<Register, 8> MoveRegs;
2647
2648 if (Users.empty())
2649 return false;
2650
2651 // Check that all uses are copies to an agpr or reg_sequences producing an agpr.
2652 while (!Users.empty()) {
2653 const MachineInstr *I = Users.pop_back_val();
2654 if (!I->isCopy() && !I->isRegSequence())
2655 return false;
2656 Register DstReg = I->getOperand(0).getReg();
2657 // Physical registers may have more than one defining instruction.
2658 if (DstReg.isPhysical())
2659 return false;
2660 if (TRI->isAGPR(*MRI, DstReg))
2661 continue;
2662 MoveRegs.push_back(DstReg);
2663 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2664 Users.push_back(&U);
2665 }
2666
2667 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2668 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2669 if (!TII->isOperandLegal(MI, 0, &Def)) {
2670 MRI->setRegClass(DefReg, RC);
2671 return false;
2672 }
2673
2674 while (!MoveRegs.empty()) {
2675 Register Reg = MoveRegs.pop_back_val();
2676 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2677 }
2678
2679 LLVM_DEBUG(dbgs() << "Folded " << MI);
2680
2681 return true;
2682}
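// Worked example (illustrative MIR; register numbers are hypothetical):
//   %v:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %a:agpr_32 = COPY %v
// When every transitive user of %v is a copy or reg_sequence that ultimately
// lands in AGPRs, and the load's def operand is still legal with an AGPR
// class, the def and the intermediate registers are simply reclassified to
// AGPR so the load writes the AGPR directly.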
2683
2684// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2685// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2686 // there are cases where it can create a lot more AGPR-AGPR copies, which are
2687// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2688//
2689// This function looks at all AGPR PHIs in a basic block and collects their
2690 // operands. Then, it checks for registers that are used more than once across
2691// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2692// having to create one VGPR temporary per use, which can get very messy if
2693// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2694// element).
2695//
2696// Example
2697// a:
2698// %in:agpr_256 = COPY %foo:vgpr_256
2699// c:
2700// %x:agpr_32 = ..
2701// b:
2702// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2703// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2704// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2705// =>
2706// a:
2707// %in:agpr_256 = COPY %foo:vgpr_256
2708// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2709// %tmp_agpr:agpr_32 = COPY %tmp
2710// c:
2711// %x:agpr_32 = ..
2712// b:
2713// %0:areg = PHI %tmp_agpr, %a, %x, %c
2714// %1:areg = PHI %tmp_agpr, %a, %y, %c
2715// %2:areg = PHI %tmp_agpr, %a, %z, %c
2716bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2717 // This is only really needed on GFX908 where AGPR-AGPR copies are
2718 // unreasonably difficult.
2719 if (ST->hasGFX90AInsts())
2720 return false;
2721
2722 // Look at all AGPR Phis and collect the register + subregister used.
2723 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2724 RegToMO;
2725
2726 for (auto &MI : MBB) {
2727 if (!MI.isPHI())
2728 break;
2729
2730 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2731 continue;
2732
2733 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2734 MachineOperand &PhiMO = MI.getOperand(K);
2735 if (!PhiMO.getSubReg())
2736 continue;
2737 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2738 }
2739 }
2740
2741 // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2742 // a VGPR.
2743 bool Changed = false;
2744 for (const auto &[Entry, MOs] : RegToMO) {
2745 if (MOs.size() == 1)
2746 continue;
2747
2748 const auto [Reg, SubReg] = Entry;
2749 MachineInstr *Def = MRI->getVRegDef(Reg);
2750 MachineBasicBlock *DefMBB = Def->getParent();
2751
2752 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2753 // out.
2754 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2755 Register TempVGPR =
2756 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2757 MachineInstr *VGPRCopy =
2758 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2759 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2760 .addReg(Reg, /* flags */ 0, SubReg);
2761
2762 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2763 Register TempAGPR = MRI->createVirtualRegister(ARC);
2764 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2765 TII->get(AMDGPU::COPY), TempAGPR)
2766 .addReg(TempVGPR);
2767
2768 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2769 for (MachineOperand *MO : MOs) {
2770 MO->setReg(TempAGPR);
2771 MO->setSubReg(AMDGPU::NoSubRegister);
2772 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2773 }
2774
2775 Changed = true;
2776 }
2777
2778 return Changed;
2779}
2780
2781bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2782 this->MF = &MF;
2783 MRI = &MF.getRegInfo();
2784 ST = &MF.getSubtarget<GCNSubtarget>();
2785 TII = ST->getInstrInfo();
2786 TRI = &TII->getRegisterInfo();
2787 MFI = MF.getInfo<SIMachineFunctionInfo>();
2788
2789 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2790 // correctly handle signed zeros.
2791 //
2792 // FIXME: Also need to check strictfp
2793 bool IsIEEEMode = MFI->getMode().IEEE;
2794 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2795
2796 bool Changed = false;
2797 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2798 MachineOperand *CurrentKnownM0Val = nullptr;
2799 for (auto &MI : make_early_inc_range(*MBB)) {
2800 Changed |= tryFoldCndMask(MI);
2801
2802 if (tryFoldZeroHighBits(MI)) {
2803 Changed = true;
2804 continue;
2805 }
2806
2807 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2808 Changed = true;
2809 continue;
2810 }
2811
2812 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2813 Changed = true;
2814 continue;
2815 }
2816
2817 if (MI.mayLoad() && tryFoldLoad(MI)) {
2818 Changed = true;
2819 continue;
2820 }
2821
2822 if (TII->isFoldableCopy(MI)) {
2823 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2824 continue;
2825 }
2826
2827 // Saw an unknown clobber of m0, so we no longer know what it is.
2828 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2829 CurrentKnownM0Val = nullptr;
2830
2831 // TODO: Omod might be OK if there is NSZ only on the source
2832 // instruction, and not the omod multiply.
2833 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2834 !tryFoldOMod(MI))
2835 Changed |= tryFoldClamp(MI);
2836 }
2837
2838 Changed |= tryOptimizeAGPRPhis(*MBB);
2839 }
2840
2841 return Changed;
2842}
2843
2844 PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
2845 MachineFunctionAnalysisManager &MFAM) {
2846 MFPropsModifier _(*this, MF);
2847
2848 bool Changed = SIFoldOperandsImpl().run(MF);
2849 if (!Changed) {
2850 return PreservedAnalyses::all();
2851 }
2852 auto PA = getMachineFunctionPassPreservedAnalyses();
2853 PA.preserveSet<CFGAnalyses>();
2854 return PA;
2855}