LLVM 23.0.0git
SIFoldOperands.cpp
Go to the documentation of this file.
1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
15#include "SIInstrInfo.h"
17#include "SIRegisterInfo.h"
22
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28/// Track a value we may want to fold into downstream users, applying
29/// subregister extracts along the way.
// Describes a value that may be substituted into later uses. The payload lives
// in the anonymous union; the constructors below choose which member is
// written (immediate, frame index, or a pointer to the original operand).
30struct FoldableDef {
31 union {
32 MachineOperand *OpToFold = nullptr;
33 uint64_t ImmToFold;
34 int FrameIndexToFold;
35 };
36
37 /// Register class of the originally defined value.
38 const TargetRegisterClass *DefRC = nullptr;
39
40 /// Track the original defining instruction for the value.
41 const MachineInstr *DefMI = nullptr;
42
43 /// Subregister to apply to the value at the use point.
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46 /// Kind of value stored in the union.
 // NOTE(review): the declaration of the `Kind` member itself is not visible
 // in this listing (a line appears to be missing here); the constructors and
 // the is*() accessors below compare it against MachineOperand::MO_* values.
48
49 FoldableDef() = delete;
 /// Build from an existing operand. Immediates and frame indexes are copied
 /// by value; any other operand (asserted to be a register or a global) is
 /// referenced through OpToFold. DefMI is set to the operand's parent.
50 FoldableDef(MachineOperand &FoldOp, const TargetRegisterClass *DefRC,
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
59 assert(FoldOp.isReg() || FoldOp.isGlobal());
60 OpToFold = &FoldOp;
61 }
62
63 DefMI = FoldOp.getParent();
64 }
65
 /// Build directly from an immediate value.
 // NOTE(review): the end of this constructor (remaining initializers and
 // body) is not visible in this listing; DefMI is not mentioned in the
 // visible initializers, so it presumably keeps its nullptr default - confirm.
66 FoldableDef(int64_t FoldImm, const TargetRegisterClass *DefRC,
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
70
71 /// Copy the current def and apply \p SubReg to the value.
 /// The copy's DefSubReg is the composition of the current DefSubReg with
 /// \p SubReg; this object is left unchanged.
72 FoldableDef getWithSubReg(const SIRegisterInfo &TRI, unsigned SubReg) const {
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
78 bool isReg() const { return Kind == MachineOperand::MO_Register; }
79
 /// Register of the referenced operand; only valid when isReg().
80 Register getReg() const {
81 assert(isReg());
82 return OpToFold->getReg();
83 }
84
 /// Subregister index of the referenced operand (not DefSubReg); only valid
 /// when isReg().
85 unsigned getSubReg() const {
86 assert(isReg());
87 return OpToFold->getSubReg();
88 }
89
90 bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
91
92 bool isFI() const {
93 return Kind == MachineOperand::MO_FrameIndex;
94 }
95
 /// Stored frame index; only valid when isFI().
96 int getFI() const {
97 assert(isFI());
98 return FrameIndexToFold;
99 }
100
101 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
102
103 /// Return the effective immediate value defined by this instruction, after
104 /// application of any subregister extracts which may exist between the use
105 /// and def instruction.
 /// The result is whatever SIInstrInfo::extractSubregFromImm yields for
 /// (ImmToFold, DefSubReg); callers below treat an empty optional as "cannot
 /// fold".
106 std::optional<int64_t> getEffectiveImmVal() const {
107 assert(isImm());
108 return SIInstrInfo::extractSubregFromImm(ImmToFold, DefSubReg);
109 }
110
111 /// Check if it is legal to fold this effective value into \p MI's \p OpIdx
112 /// operand.
113 bool isOperandLegal(const SIInstrInfo &TII, const MachineInstr &MI,
114 unsigned OpIdx) const {
115 switch (Kind) {
 // NOTE(review): the case label opening this branch is not visible in this
 // listing; the body handles the immediate payload. The local ImmToFold
 // below shadows the union member of the same name.
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121 // TODO: Should verify the subregister index is supported by the class
122 // TODO: Avoid the temporary MachineOperand
123 MachineOperand TmpOp = MachineOperand::CreateImm(*ImmToFold);
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
 // NOTE(review): the case label opening this branch is also not visible; the
 // body handles the frame-index payload and rejects any pending subregister.
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
129 MachineOperand TmpOp = MachineOperand::CreateFI(FrameIndexToFold);
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133 // TODO: Try to apply DefSubReg, for global address we can extract
134 // low/high.
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
140 llvm_unreachable("covered MachineOperand kind switch");
141 }
142};
143
// One pending fold: the value to fold (Def) and the operand slot (UseOpNo) of
// the instruction it should be written into, plus whether the instruction was
// commuted and an optional replacement opcode (ShrinkOpcode, -1 when unused).
144struct FoldCandidate {
 // NOTE(review): the declaration of the UseMI member initialized by the
 // constructor below is not visible in this listing.
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
 // The accessors below forward to Def.
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
159 assert(isFI());
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
167 Register getReg() const { return Def.getReg(); }
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
 // True when a replacement opcode was recorded (ShrinkOpcode is not -1).
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
// Implementation object for the operand-folding pass. It holds the per-function
// context pointers and declares the folding helpers; the legacy pass wrapper
// constructs one and calls run().
174class SIFoldOperandsImpl {
175public:
176 MachineFunction *MF;
 // NOTE(review): member functions in this file dereference a member named MRI,
 // but its declaration is not visible in this listing.
178 const SIInstrInfo *TII;
179 const SIRegisterInfo *TRI;
180 const GCNSubtarget *ST;
181 const SIMachineFunctionInfo *MFI;
182
 // Whether a frame index may be folded into operand OpNo of UseMI.
183 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
184 const FoldableDef &OpToFold) const;
185
186 // TODO: Just use TII::getVALUOp
 // Map a scalar ALU opcode to a vector ALU opcode, choosing the _e64 form when
 // UseVOP3 is set and the _e32 form otherwise. S_MUL_I32 always maps to the
 // _e64 form. Returns AMDGPU::INSTRUCTION_LIST_END for unhandled opcodes.
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
 // Use the carry-out-free add when the subtarget reports having it.
190 if (ST->hasAddNoCarryInsts())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
206 MachineInstr &MI) const;
207
 // Apply a single recorded fold to its instruction; returns false if the fold
 // could not be applied.
208 bool updateOperand(FoldCandidate &Fold) const;
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213 /// Try to fold immediate \p ImmVal into \p MI's operand at index \p UseOpNo.
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
217 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
218 MachineInstr *MI, unsigned OpNo,
219 const FoldableDef &OpToFold) const;
220 bool isUseSafeToFold(const MachineInstr &MI,
221 const MachineOperand &UseMO) const;
222
223 const TargetRegisterClass *getRegSeqInit(
224 MachineInstr &RegSeq,
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
227 const TargetRegisterClass *
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
229 Register UseReg) const;
230
231 std::pair<int64_t, const TargetRegisterClass *>
232 isRegSeqSplat(MachineInstr &RegSeg) const;
233
234 bool tryFoldRegSeqSplat(MachineInstr *UseMI, unsigned UseOpIdx,
235 int64_t SplatVal,
236 const TargetRegisterClass *SplatRC) const;
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
240 SmallVectorImpl<FoldCandidate> &FoldList) const;
 // NOTE(review): one line of the foldOperand parameter list is not visible in
 // this listing.
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
243 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
244
245 bool tryConstantFoldOp(MachineInstr *MI) const;
246 bool tryFoldCndMask(MachineInstr &MI) const;
247 bool tryFoldZeroHighBits(MachineInstr &MI) const;
248 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
249
250 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
251 bool tryFoldFoldableCopy(MachineInstr &MI,
252 MachineOperand *&CurrentKnownM0Val) const;
253
254 const MachineOperand *isClamp(const MachineInstr &MI) const;
255 bool tryFoldClamp(MachineInstr &MI);
256
257 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
258 bool tryFoldOMod(MachineInstr &MI);
259 bool tryFoldRegSequence(MachineInstr &MI);
260 bool tryFoldPhiAGPR(MachineInstr &MI);
261 bool tryFoldLoad(MachineInstr &MI);
262
263 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
264
265public:
266 SIFoldOperandsImpl() = default;
267
 // Entry point invoked by the pass wrapper for one machine function.
268 bool run(MachineFunction &MF);
269};
270
// MachineFunctionPass wrapper that forwards to SIFoldOperandsImpl.
271class SIFoldOperandsLegacy : public MachineFunctionPass {
272public:
273 static char ID;
274
275 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
276
 // Does nothing (returns false) when skipFunction() says to skip this
 // function; otherwise runs a freshly constructed SIFoldOperandsImpl on it.
277 bool runOnMachineFunction(MachineFunction &MF) override {
278 if (skipFunction(MF.getFunction()))
279 return false;
280 return SIFoldOperandsImpl().run(MF);
281 }
282
283 StringRef getPassName() const override { return "SI Fold Operands"; }
284
 // Declares that the pass preserves the CFG.
 // NOTE(review): one line of this body is not visible in this listing.
285 void getAnalysisUsage(AnalysisUsage &AU) const override {
286 AU.setPreservesCFG();
288 }
289
 // The pass requires its input to be in SSA form.
290 MachineFunctionProperties getRequiredProperties() const override {
291 return MachineFunctionProperties().setIsSSA();
292 }
293};
294
295} // End anonymous namespace.
296
// Pass registration for the legacy wrapper, using DEBUG_TYPE as the second
// macro argument and "SI Fold Operands" as the name string.
297INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
298 false)
299
// Storage for the pass's static ID member.
300char SIFoldOperandsLegacy::ID = 0;
301
// Reference bound to the same ID object, exposed in namespace llvm.
302char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
303
306 const MachineOperand &MO) {
 // NOTE(review): the first lines of this function's signature are not visible
 // in this listing; the body uses parameters named MRI and TRI in addition to
 // MO.
 //
 // Start from the class of MO's register, then prefer the class TRI reports
 // for MO's subregister index whenever that lookup returns non-null.
307 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
308 if (const TargetRegisterClass *SubRC =
309 TRI.getSubRegisterClass(RC, MO.getSubReg()))
310 RC = SubRC;
311 return RC;
312}
313
314// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
315static unsigned macToMad(unsigned Opc) {
316 switch (Opc) {
317 case AMDGPU::V_MAC_F32_e64:
318 return AMDGPU::V_MAD_F32_e64;
319 case AMDGPU::V_MAC_F16_e64:
320 return AMDGPU::V_MAD_F16_e64;
321 case AMDGPU::V_FMAC_F32_e64:
322 return AMDGPU::V_FMA_F32_e64;
323 case AMDGPU::V_FMAC_F16_e64:
324 return AMDGPU::V_FMA_F16_gfx9_e64;
325 case AMDGPU::V_FMAC_F16_t16_e64:
326 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
327 case AMDGPU::V_FMAC_F16_fake16_e64:
328 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
329 case AMDGPU::V_FMAC_LEGACY_F32_e64:
330 return AMDGPU::V_FMA_LEGACY_F32_e64;
331 case AMDGPU::V_FMAC_F64_e64:
332 return AMDGPU::V_FMA_F64_e64;
333 }
334 return AMDGPU::INSTRUCTION_LIST_END;
335}
336
337// TODO: Add heuristic that the frame index might not fit in the addressing mode
338// immediate offset to avoid materializing in loops.
339bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
340 const FoldableDef &OpToFold) const {
341 if (!OpToFold.isFI())
342 return false;
343
344 const unsigned Opc = UseMI.getOpcode();
345 switch (Opc) {
346 case AMDGPU::S_ADD_I32:
347 case AMDGPU::S_ADD_U32:
348 case AMDGPU::V_ADD_U32_e32:
349 case AMDGPU::V_ADD_CO_U32_e32:
350 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
351 // to insert the wave size shift at every point we use the index.
352 // TODO: Fix depending on visit order to fold immediates into the operand
353 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
354 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
355 case AMDGPU::V_ADD_U32_e64:
356 case AMDGPU::V_ADD_CO_U32_e64:
357 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
358 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
359 default:
360 break;
361 }
362
363 if (TII->isMUBUF(UseMI))
364 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
365 if (!TII->isFLATScratch(UseMI))
366 return false;
367
368 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
369 if (OpNo == SIdx)
370 return true;
371
372 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
373 return OpNo == VIdx && SIdx == -1;
374}
375
376/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
377///
378/// => %vgpr = V_ADD_U32 x, frameindex
379bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
380 Register DstReg, Register SrcReg, MachineInstr &MI) const {
 // Returns true only when both the instruction defining SrcReg and MI were
 // erased and replaced by a single new instruction writing DstReg.
 //
 // Preconditions checked here: DstReg is a VGPR, SrcReg is an SGPR, and SrcReg
 // has exactly one non-debug use.
381 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
382 MRI->hasOneNonDBGUse(SrcReg)) {
383 MachineInstr *Def = MRI->getVRegDef(SrcReg);
 // Only definitions with exactly four operands are considered; operands 1 and
 // 2 are treated as the sources and operand 3 is checked for deadness below.
384 if (!Def || Def->getNumOperands() != 4)
385 return false;
386
387 MachineOperand *Src0 = &Def->getOperand(1);
388 MachineOperand *Src1 = &Def->getOperand(2);
389
390 // TODO: This is profitable with more operand types, and for more
391 // opcodes. But ultimately this is working around poor / nonexistent
392 // regbankselect.
393 if (!Src0->isFI() && !Src1->isFI())
394 return false;
395
 // Put a frame index found in the first source into the Src1 position.
396 if (Src0->isFI())
397 std::swap(Src0, Src1);
398
 // Pick the _e64 form unless Src0 is an immediate that is not an inline
 // constant.
399 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
400 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
401 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
402 !Def->getOperand(3).isDead()) // Check if scc is dead
403 return false;
404
405 MachineBasicBlock *MBB = Def->getParent();
406 const DebugLoc &DL = Def->getDebugLoc();
 // Every replacement opcode except V_ADD_CO_U32_e32 is handled here; the new
 // instruction is inserted before Def.
407 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
408 MachineInstrBuilder Add =
409 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
410
 // If the new opcode has a second def, give it a fresh dead virtual register
 // of the bool class, hinted towards VCC.
411 if (Add->getDesc().getNumDefs() == 2) {
412 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
413 Add.addDef(CarryOutReg, RegState::Dead);
414 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
415 }
416
417 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
 // Supply a zero clamp operand when the new opcode has one.
418 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
419 Add.addImm(0);
420
421 Def->eraseFromParent();
422 MI.eraseFromParent();
423 return true;
424 }
425
426 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
427
 // NOTE(review): the declaration of `Liveness` (the left-hand side of the
 // call below) is not visible in this listing.
 // The _e32 form is only emitted when the VCC liveness query at Def reports
 // LQR_Dead.
429 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
430 if (Liveness == MachineBasicBlock::LQR_Dead) {
431 // TODO: If src1 satisfies operand constraints, use vop3 version.
432 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
433 .add(*Src0)
434 .add(*Src1)
435 .setOperandDead(3) // implicit-def $vcc
436 .setMIFlags(Def->getFlags());
437 Def->eraseFromParent();
438 MI.eraseFromParent();
439 return true;
440 }
441 }
442
443 return false;
444}
445
 // NOTE(review): the signature line of this function is not visible in this
 // listing. The body returns a newly allocated SIFoldOperandsLegacy.
447 return new SIFoldOperandsLegacy();
448}
449
// Screen whether operand UseOpNo of MI is eligible for the op_sel-based
// immediate folding. This is a pure query: MI is not modified.
450bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
451 unsigned UseOpNo,
452 int64_t ImmVal) const {
453 const uint64_t TSFlags = MI->getDesc().TSFlags;
454
 // Reject anything not flagged IsPacked, anything flagged IsMAI / IsWMMA /
 // IsSWMMAC, and IsDOT instructions on subtargets reporting the DOT op_sel
 // hazard.
455 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
456 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
457 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
458 return false;
459
460 const MachineOperand &Old = MI->getOperand(UseOpNo);
461 int OpNo = MI->getOperandNo(&Old);
462
463 unsigned Opcode = MI->getOpcode();
464 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
465 switch (OpType) {
466 default:
467 return false;
 // NOTE(review): the case labels selecting the accepted operand types are not
 // visible in this listing; the code below runs for those types.
475 // VOP3 packed instructions ignore op_sel source modifiers, we cannot encode
476 // two different constants.
477 if ((TSFlags & SIInstrFlags::VOP3) && !(TSFlags & SIInstrFlags::VOP3P) &&
478 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
479 return false;
480 break;
481 }
482
483 return true;
484}
485
// Attempt to rewrite operand UseOpNo of MI to an immediate that passes
// AMDGPU::isInlinableLiteralV216, adjusting the matching srcN_modifiers
// op_sel bits (and, for V_PK_ADD_U16 / V_PK_SUB_U16, possibly the opcode) as
// needed. Returns true if the operand was rewritten; on false the visible code
// has not modified MI.
486bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
487 int64_t ImmVal) const {
488 MachineOperand &Old = MI->getOperand(UseOpNo);
489 unsigned Opcode = MI->getOpcode();
490 int OpNo = MI->getOperandNo(&Old);
491 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
492
493 // If the literal can be inlined as-is, apply it and short-circuit the
494 // tests below. The main motivation for this is to avoid unintuitive
495 // uses of opsel.
496 if (AMDGPU::isInlinableLiteralV216(ImmVal, OpType)) {
497 Old.ChangeToImmediate(ImmVal);
498 return true;
499 }
500
501 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
502 // op_sel in a way that allows an inline constant.
503 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
504 unsigned SrcIdx = ~0;
505 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
506 ModName = AMDGPU::OpName::src0_modifiers;
507 SrcIdx = 0;
508 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
509 ModName = AMDGPU::OpName::src1_modifiers;
510 SrcIdx = 1;
511 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
512 ModName = AMDGPU::OpName::src2_modifiers;
513 SrcIdx = 2;
514 }
 // The operand must be one of src0/src1/src2.
515 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
516 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
517 MachineOperand &Mod = MI->getOperand(ModIdx);
518 unsigned ModVal = Mod.getImm();
519
 // Build the 32-bit value the instruction currently selects: each 16-bit half
 // comes from the upper half of ImmVal when its op_sel bit (OP_SEL_0 for the
 // low half, OP_SEL_1 for the high half) is set, otherwise from the lower
 // half. NewModVal is the modifier value with both op_sel bits cleared.
520 uint16_t ImmLo =
521 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
522 uint16_t ImmHi =
523 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
524 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
525 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
526
527 // Helper function that attempts to inline the given value with a newly
528 // chosen opsel pattern.
 // Note: the lambda's parameter `Imm` shadows the local `Imm` above.
529 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
530 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
531 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
532 Old.ChangeToImmediate(Imm);
533 return true;
534 }
535
536 // Try to shuffle the halves around and leverage opsel to get an inline
537 // constant.
538 uint16_t Lo = static_cast<uint16_t>(Imm);
539 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
540 if (Lo == Hi) {
541 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
542 Mod.setImm(NewModVal);
 // NOTE(review): a line appears to be missing from this listing here; no
 // change to `Old` is visible on this path before returning true.
544 return true;
545 }
546
 // Both halves are equal and negative as int16: try the sign-extended
 // 32-bit value with both op_sel bits clear.
547 if (static_cast<int16_t>(Lo) < 0) {
548 int32_t SExt = static_cast<int16_t>(Lo);
549 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
550 Mod.setImm(NewModVal);
551 Old.ChangeToImmediate(SExt);
552 return true;
553 }
554 }
555
556 // This check is only useful for integer instructions
557 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16) {
558 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
559 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
560 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
561 return true;
562 }
563 }
564 } else {
 // Halves differ: try the value with its halves swapped, setting OP_SEL_0
 // only.
565 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
566 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
567 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
568 Old.ChangeToImmediate(Swapped);
569 return true;
570 }
571 }
572
573 return false;
574 };
575
576 if (tryFoldToInline(Imm))
577 return true;
578
579 // Replace integer addition by subtraction and vice versa if it allows
580 // folding the immediate to an inline constant.
581 //
582 // We should only ever get here for SrcIdx == 1 due to canonicalization
583 // earlier in the pipeline, but we double-check here to be safe / fully
584 // general.
585 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
586 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
587 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
588 unsigned ClampIdx =
589 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
590 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
591
 // The add/sub swap is only attempted when the clamp operand is zero.
592 if (!Clamp) {
 // Negate each 16-bit half independently.
593 uint16_t NegLo = -static_cast<uint16_t>(Imm);
594 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
595 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
596
597 if (tryFoldToInline(NegImm)) {
598 unsigned NegOpcode =
599 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
600 MI->setDesc(TII->get(NegOpcode));
601 return true;
602 }
603 }
604 }
605
606 return false;
607}
608
// Apply one recorded fold to Fold.UseMI, rewriting operand Fold.UseOpNo (which
// must currently be a register) to the folded immediate, global address, frame
// index or register. Returns false when the fold cannot be applied.
609bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
610 MachineInstr *MI = Fold.UseMI;
611 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
612 assert(Old.isReg());
613
 // ImmVal stays empty for non-immediate folds, and also when the effective
 // immediate cannot be computed.
614 std::optional<int64_t> ImmVal;
615 if (Fold.isImm())
616 ImmVal = Fold.Def.getEffectiveImmVal();
617
618 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
619 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
620 return true;
621
622 // We can't represent the candidate as an inline constant. Try as a literal
623 // with the original opsel, checking constant bus limitations.
624 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
625 int OpNo = MI->getOperandNo(&Old);
626 if (!TII->isOperandLegal(*MI, OpNo, &New))
627 return false;
628 Old.ChangeToImmediate(*ImmVal);
629 return true;
630 }
631
 // Shrink path: taken for immediate / frame-index / global folds that
 // recorded a replacement opcode. It is abandoned unless the VCC liveness
 // query at MI reports LQR_Dead.
632 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
633 MachineBasicBlock *MBB = MI->getParent();
634 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
635 if (Liveness != MachineBasicBlock::LQR_Dead) {
 // NOTE(review): `MI` is a MachineInstr pointer here, so this streams the
 // pointer itself rather than `*MI` - confirm that is intended.
636 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
637 return false;
638 }
639
640 int Op32 = Fold.ShrinkOpcode;
641 MachineOperand &Dst0 = MI->getOperand(0);
642 MachineOperand &Dst1 = MI->getOperand(1);
643 assert(Dst0.isDef() && Dst1.isDef());
644
645 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
646
647 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
648 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
649
650 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
651
 // If the second def still has non-debug users, define it with a COPY from
 // VCC (marked killed), inserted before MI.
652 if (HaveNonDbgCarryUse) {
653 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
654 Dst1.getReg())
655 .addReg(AMDGPU::VCC, RegState::Kill);
656 }
657
658 // Keep the old instruction around to avoid breaking iterators, but
659 // replace it with a dummy instruction to remove uses.
660 //
661 // FIXME: We should not invert how this pass looks at operands to avoid
662 // this. Should track set of foldable movs instead of looking for uses
663 // when looking at a use.
664 Dst0.setReg(NewReg0);
 // Strip every operand except operand 0, from the back.
665 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
666 MI->removeOperand(I);
667 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
668
669 if (Fold.Commuted)
670 TII->commuteInstruction(*Inst32, false);
671 return true;
672 }
673
674 assert(!Fold.needsShrink() && "not handled");
675
676 if (ImmVal) {
 // A tied use can only take an immediate if getMFMAEarlyClobberOp provides a
 // replacement opcode; the instruction is switched to it and operand 0 is
 // untied.
677 if (Old.isTied()) {
678 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
679 if (NewMFMAOpc == -1)
680 return false;
681 MI->setDesc(TII->get(NewMFMAOpc));
682 MI->untieRegOperand(0);
683 const MCInstrDesc &MCID = MI->getDesc();
 // NOTE(review): a line appears to be missing from this listing inside the
 // loop below (MCID is otherwise unused in the visible code).
684 for (unsigned I = 0; I < MI->getNumDefs(); ++I)
686 MI->getOperand(I).setIsEarlyClobber(true);
687 }
688
689 // TODO: Should we try to avoid adding this to the candidate list?
690 MachineOperand New = MachineOperand::CreateImm(*ImmVal);
691 int OpNo = MI->getOperandNo(&Old);
692 if (!TII->isOperandLegal(*MI, OpNo, &New))
693 return false;
694
695 Old.ChangeToImmediate(*ImmVal);
696 return true;
697 }
698
 // Global address: copy the global, offset and target flags from the
 // original operand.
699 if (Fold.isGlobal()) {
700 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
701 Fold.Def.OpToFold->getOffset(),
702 Fold.Def.OpToFold->getTargetFlags());
703 return true;
704 }
705
706 if (Fold.isFI()) {
707 Old.ChangeToFrameIndex(Fold.getFI());
708 return true;
709 }
710
 // Remaining case: fold the operand referenced by OpToFold (used as a
 // register from here on).
711 MachineOperand *New = Fold.Def.OpToFold;
712
713 // Verify the register is compatible with the operand.
714 if (const TargetRegisterClass *OpRC =
715 TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) {
716 const TargetRegisterClass *NewRC =
717 TRI->getRegClassForReg(*MRI, New->getReg());
718
 // With a subregister on the new operand, constrain the full register to the
 // class returned by getMatchingSuperRegClass instead of OpRC.
719 const TargetRegisterClass *ConstrainRC = OpRC;
720 if (New->getSubReg()) {
721 ConstrainRC =
722 TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
723
724 if (!ConstrainRC)
725 return false;
726 }
727
728 if (New->getReg().isVirtual() &&
729 !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
730 LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI)
731 << TRI->getRegClassName(ConstrainRC) << '\n');
732 return false;
733 }
734 }
735
736 // Rework once the VS_16 register class is updated to include proper
737 // 16-bit SGPRs instead of 32-bit ones.
738 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
739 Old.setSubReg(AMDGPU::NoSubRegister);
740 if (New->getReg().isPhysical()) {
741 Old.substPhysReg(New->getReg(), *TRI);
742 } else {
743 Register OldReg = Old.getReg();
744 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
745 Old.setIsUndef(New->isUndef());
746
747 // If MI is in a BUNDLE, also update header's matching implicit use.
748 if (MI->isBundledWithPred()) {
749 MachineInstr &Header = *getBundleStart(MI->getIterator());
 // NOTE(review): getReg() is called on every header operand without an
 // isReg() check - presumably all header operands are registers; confirm.
750 for (MachineOperand &MO : Header.operands()) {
751 if (MO.getReg() == OldReg) {
752 MO.setReg(New->getReg());
753 MO.setSubReg(New->getSubReg());
754 }
755 }
756 }
757 }
758 return true;
759}
760
762 FoldCandidate &&Entry) {
 // NOTE(review): the first line of this function's signature is not visible
 // in this listing; the body uses a container parameter named FoldList.
 //
 // Appends Entry to FoldList unless an existing entry already targets the same
 // instruction and operand index, in which case nothing is added.
763 // Skip additional folding on the same operand.
764 for (FoldCandidate &Fold : FoldList)
765 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
766 return;
767 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
768 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
 // Entry is a named rvalue reference, so this push_back takes it as an
 // lvalue.
769 FoldList.push_back(Entry);
770}
771
773 MachineInstr *MI, unsigned OpNo,
774 const FoldableDef &FoldOp,
775 bool Commuted = false, int ShrinkOp = -1) {
 // NOTE(review): the first line of this function's signature is not visible
 // in this listing.
 //
 // Convenience overload: builds a FoldCandidate from the pieces and forwards
 // it, so the same-operand duplicate check applies.
776 appendFoldCandidate(FoldList,
777 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
778}
779
780// Returns true if the instruction is a packed F32 instruction and the
781// corresponding scalar operand reads 32 bits and replicates the bits to both
782// channels.
784 const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) {
 // NOTE(review): the first line of this function's signature and its final
 // return statement are not visible in this listing.
 //
 // Subtargets without the "PK_F32 replicates lower 32 bits of scalar input"
 // property never match.
785 if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())
786 return false;
 // Operand description for OpNo; presumably consulted by the missing return
 // statement - confirm.
787 const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo];
789}
790
791// Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or
792// literal) and replicate those bits to both channels. Therefore, if the hi and
793// lo halves are not the same, we can't fold it.
795 const FoldableDef &OpToFold) {
 // NOTE(review): the first line of this function's signature is not visible
 // in this listing.
 //
 // Returns true when the low and high 32-bit halves of the effective
 // immediate are equal. OpToFold must be an immediate; .value() is called on
 // the optional without a prior has-value check.
796 assert(OpToFold.isImm() && "Expected immediate operand");
797 uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
798 uint32_t Lo = Lo_32(ImmVal);
799 uint32_t Hi = Hi_32(ImmVal);
800 return Lo == Hi;
801}
802
// Try to record a fold of OpToFold into operand OpNo of MI. When the fold is
// not legal as-is, several rewrites of MI are attempted (mac -> mad/fma form,
// s_fmac -> fmaak/fmamk, s_setreg -> immediate form, commuting the operands).
// Returns true when a candidate was added to FoldList; MI may have been
// rewritten in that case.
803bool SIFoldOperandsImpl::tryAddToFoldList(
804 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
805 const FoldableDef &OpToFold) const {
806 const unsigned Opc = MI->getOpcode();
807
 // Helper: switch MI to S_FMAAK_F32 (when OpNo == 3) or S_FMAMK_F32
 // (otherwise) and retry recursively. The original opcode is restored when
 // the retry fails.
808 auto tryToFoldAsFMAAKorMK = [&]() {
809 if (!OpToFold.isImm())
810 return false;
811
812 const bool TryAK = OpNo == 3;
813 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
814 MI->setDesc(TII->get(NewOpc));
815
816 // We have to fold into operand which would be Imm not into OpNo.
817 bool FoldAsFMAAKorMK =
818 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
819 if (FoldAsFMAAKorMK) {
820 // Untie Src2 of fmac.
821 MI->untieRegOperand(3);
822 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
823 if (OpNo == 1) {
824 MachineOperand &Op1 = MI->getOperand(1);
825 MachineOperand &Op2 = MI->getOperand(2);
826 Register OldReg = Op1.getReg();
827 // Operand 2 might be an inlinable constant
828 if (Op2.isImm()) {
829 Op1.ChangeToImmediate(Op2.getImm());
830 Op2.ChangeToRegister(OldReg, false);
831 } else {
832 Op1.setReg(Op2.getReg());
833 Op2.setReg(OldReg);
834 }
835 }
836 return true;
837 }
838 MI->setDesc(TII->get(Opc));
839 return false;
840 };
841
 // An immediate that fails the plain legality check is still accepted when
 // the op_sel-based folding screen passes for it.
842 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
843 if (!IsLegal && OpToFold.isImm()) {
844 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
845 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
846 }
847
848 if (!IsLegal) {
849 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
850 unsigned NewOpc = macToMad(Opc);
851 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
852 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
853 // to fold the operand.
854 MI->setDesc(TII->get(NewOpc));
 // The replacement opcode may have an op_sel operand the original lacks; add
 // a zero for it, and remove it again below if the retry fails.
855 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
856 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
857 if (AddOpSel)
858 MI->addOperand(MachineOperand::CreateImm(0));
859 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
860 if (FoldAsMAD) {
861 MI->untieRegOperand(OpNo);
862 return true;
863 }
864 if (AddOpSel)
865 MI->removeOperand(MI->getNumExplicitOperands() - 1);
866 MI->setDesc(TII->get(Opc));
867 }
868
869 // Special case for s_fmac_f32 if we are trying to fold into Src2.
870 // By transforming into fmaak we can untie Src2 and make folding legal.
871 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
872 if (tryToFoldAsFMAAKorMK())
873 return true;
874 }
875
876 // Special case for s_setreg_b32
877 if (OpToFold.isImm()) {
878 unsigned ImmOpc = 0;
879 if (Opc == AMDGPU::S_SETREG_B32)
880 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
881 else if (Opc == AMDGPU::S_SETREG_B32_mode)
882 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
883 if (ImmOpc) {
884 MI->setDesc(TII->get(ImmOpc));
885 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
886 return true;
887 }
888 }
889
890 // Operand is not legal, so try to commute the instruction to
891 // see if this makes it possible to fold.
892 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
893 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
894 if (!CanCommute)
895 return false;
896
897 MachineOperand &Op = MI->getOperand(OpNo);
898 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
899
900 // One of operands might be an Imm operand, and OpNo may refer to it after
901 // the call of commuteInstruction() below. Such situations are avoided
902 // here explicitly as OpNo must be a register operand to be a candidate
903 // for memory folding.
904 if (!Op.isReg() || !CommutedOp.isReg())
905 return false;
906
907 // The same situation with an immediate could reproduce if both inputs are
908 // the same register.
909 if (Op.isReg() && CommutedOp.isReg() &&
910 (Op.getReg() == CommutedOp.getReg() &&
911 Op.getSubReg() == CommutedOp.getSubReg()))
912 return false;
913
914 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
915 return false;
916
 // Op32 stays -1 unless the fold is still illegal after commuting and a
 // 32-bit opcode is chosen below.
917 int Op32 = -1;
918 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
 // Only the listed add/sub-with-carry e64 opcodes with an imm / FI / global
 // value continue; everything else is commuted back and rejected.
919 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
920 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
921 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
922 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
923 return false;
924 }
925
926 // Verify the other operand is a VGPR, otherwise we would violate the
927 // constant bus restriction.
 // NOTE(review): this early return does not commute MI back, unlike the
 // rejection path just above - confirm that is intended.
928 MachineOperand &OtherOp = MI->getOperand(OpNo);
929 if (!OtherOp.isReg() ||
930 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
931 return false;
932
933 assert(MI->getOperand(1).isDef());
934
935 // Make sure to get the 32-bit version of the commuted opcode.
936 unsigned MaybeCommutedOpc = MI->getOpcode();
937 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
938 }
939
940 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, /*Commuted=*/true,
941 Op32);
942 return true;
943 }
944
945 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
946 // By changing into fmamk we can untie Src2.
947 // If folding for Src0 happens first and it is identical operand to Src1 we
948 // should avoid transforming into fmamk which requires commuting as it would
949 // cause folding into Src1 to fail later on due to wrong OpNo used.
950 if (Opc == AMDGPU::S_FMAC_F32 &&
951 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
952 if (tryToFoldAsFMAAKorMK())
953 return true;
954 }
955
956 // Special case for PK_F32 instructions if we are trying to fold an imm to
957 // src0 or src1.
 // NOTE(review): the remainder of the condition below is not visible in this
 // listing (lines appear to be missing between the `if` and the `return`).
958 if (OpToFold.isImm() &&
961 return false;
962
963 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
964 return true;
965}
966
967bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
968 const MachineOperand &UseMO) const {
969 // Operands of SDWA instructions must be registers.
970 return !TII->isSDWA(MI);
971}
972
974 const MachineRegisterInfo &MRI,
975 Register SrcReg) {
 // NOTE(review): the first line of this function's signature is not visible
 // in this listing; the body also uses a parameter named TII.
 //
 // Walk backwards from SrcReg through defining instructions for as long as
 // TII.isFoldableCopy accepts them. Returns the first immediate source found,
 // otherwise the last register source operand seen, or nullptr when no
 // acceptable source operand was seen.
976 MachineOperand *Sub = nullptr;
977 for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
978 SubDef && TII.isFoldableCopy(*SubDef);
979 SubDef = MRI.getVRegDef(Sub->getReg())) {
980 unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
981 MachineOperand &SrcOp = SubDef->getOperand(SrcIdx);
982
983 if (SrcOp.isImm())
984 return &SrcOp;
 // Stop at sources that are not registers or that are physical registers;
 // Sub keeps its previous value in that case.
985 if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())
986 break;
987 Sub = &SrcOp;
988 // TODO: Support compose
 // A source with a subregister index is recorded but not followed further.
989 if (SrcOp.getSubReg())
990 break;
991 }
992
993 return Sub;
994}
995
996const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
997 MachineInstr &RegSeq,
998 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
999
1000 assert(RegSeq.isRegSequence());
1001
1002 const TargetRegisterClass *RC = nullptr;
1003
1004 for (unsigned I = 1, E = RegSeq.getNumExplicitOperands(); I != E; I += 2) {
1005 MachineOperand &SrcOp = RegSeq.getOperand(I);
1006 unsigned SubRegIdx = RegSeq.getOperand(I + 1).getImm();
1007
1008 // Only accept reg_sequence with uniform reg class inputs for simplicity.
1009 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
1010 if (!RC)
1011 RC = OpRC;
1012 else if (!TRI->getCommonSubClass(RC, OpRC))
1013 return nullptr;
1014
1015 if (SrcOp.getSubReg()) {
1016 // TODO: Handle subregister compose
1017 Defs.emplace_back(&SrcOp, SubRegIdx);
1018 continue;
1019 }
1020
1021 MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, SrcOp.getReg());
1022 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
1023 Defs.emplace_back(DefSrc, SubRegIdx);
1024 continue;
1025 }
1026
1027 Defs.emplace_back(&SrcOp, SubRegIdx);
1028 }
1029
1030 return RC;
1031}
1032
1033// Find a def of the UseReg, check if it is a reg_sequence and find initializers
1034// for each subreg, tracking it to an immediate if possible. Returns the
1035// register class of the inputs on success.
1036const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
1037 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
1038 Register UseReg) const {
1039 MachineInstr *Def = MRI->getVRegDef(UseReg);
1040 if (!Def || !Def->isRegSequence())
1041 return nullptr;
1042
1043 return getRegSeqInit(*Def, Defs);
1044}
1045
// Check whether the initializers collected for the reg_sequence RegSeq by
// getRegSeqInit all hold the same immediate. On success returns the splat
// value together with a register class: for a plain splat, the value of the
// first initializer and the class returned by getRegSeqInit; for a splat of
// 64-bit elements assembled from pairs of 32-bit initializers, the merged
// 64-bit value and the sub0_sub1 subregister class of RegSeq's result
// register. Every failure path returns a value-initialized pair, i.e. a null
// register class.
1046std::pair<int64_t, const TargetRegisterClass *>
1047SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
1049 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
1050 if (!SrcRC)
1051 return {};
1052
1053 bool TryToMatchSplat64 = false;
1054
// First pass: compare every initializer against the first one.
1055 int64_t Imm;
1056 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
1057 const MachineOperand *Op = Defs[I].first;
1058 if (!Op->isImm())
1059 return {};
1060
1061 int64_t SubImm = Op->getImm();
1062 if (!I) {
1063 Imm = SubImm;
1064 continue;
1065 }
1066
1067 if (Imm != SubImm) {
// Only a mismatch at the second element, with an even element count, is
// given a second chance as a 64-bit splat.
1068 if (I == 1 && (E & 1) == 0) {
1069 // If we have an even number of inputs, there's a chance this is a
1070 // 64-bit element splat broken into 32-bit pieces.
1071 TryToMatchSplat64 = true;
1072 break;
1073 }
1074
1075 return {}; // Can only fold splat constants
1076 }
1077 }
1078
1079 if (!TryToMatchSplat64)
1080 return {Defs[0].first->getImm(), SrcRC};
1081
1082 // Fallback to recognizing 64-bit splats broken into 32-bit pieces
1083 // (i.e. recognize every other element is 0 for 64-bit immediates)
// Second pass: walk the initializers two at a time and require every merged
// pair to equal the first merged pair.
1084 int64_t SplatVal64;
1085 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
1086 const MachineOperand *Op0 = Defs[I].first;
1087 const MachineOperand *Op1 = Defs[I + 1].first;
1088
1089 if (!Op0->isImm() || !Op1->isImm())
1090 return {};
1091
1092 unsigned SubReg0 = Defs[I].second;
1093 unsigned SubReg1 = Defs[I + 1].second;
1094
1095 // Assume we're going to generally encounter reg_sequences with sorted
1096 // subreg indexes, so reject any that aren't consecutive.
1097 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
1098 TRI->getChannelFromSubReg(SubReg1))
1099 return {};
1100
// The pieces being merged must be 32 bits wide (only the first subregister
// index of each pair is checked here).
1101 if (TRI->getSubRegIdxSize(SubReg0) != 32)
1102 return {};
1103
1104 int64_t MergedVal = Make_64(Op1->getImm(), Op0->getImm());
1105 if (I == 0)
1106 SplatVal64 = MergedVal;
1107 else if (SplatVal64 != MergedVal)
1108 return {};
1109 }
1110
1111 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
1112 MRI->getRegClass(RegSeq.getOperand(0).getReg()), AMDGPU::sub0_sub1);
1113
1114 return {SplatVal64, RC64};
1115}
1116
// Decide whether the splat constant SplatVal, whose elements have register
// class SplatRC, may be used as an immediate for operand UseOpIdx of UseMI.
// Returns true when the operand index is a described SI source operand with a
// known register class, the element class is compatible (checked for every
// value except 0 and -1), and the target reports the immediate as legal for
// that operand. No fold candidate is queued by this function.
1117bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1118 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
1119 const TargetRegisterClass *SplatRC) const {
1120 const MCInstrDesc &Desc = UseMI->getDesc();
// Operands beyond the static description have no operand info to consult.
1121 if (UseOpIdx >= Desc.getNumOperands())
1122 return false;
1123
1124 // Filter out unhandled pseudos.
1125 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1126 return false;
1127
1128 int16_t RCID = TII->getOpRegClassID(Desc.operands()[UseOpIdx]);
1129 if (RCID == -1)
1130 return false;
1131
1132 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
1133
1134 // Special case 0/-1, since when interpreted as a 64-bit element both halves
1135 // have the same bits. These are the only cases where a splat has the same
1136 // interpretation for 32-bit and 64-bit splats.
1137 if (SplatVal != 0 && SplatVal != -1) {
1138 // We need to figure out the scalar type read by the operand. e.g. the MFMA
1139 // operand will be AReg_128, and we want to check if it's compatible with an
1140 // AReg_32 constant.
1141 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
// Narrow OpRC to the class of one element (sub0 or sub0_sub1, depending on
// the operand type); operand types without a case are rejected.
1142 switch (OpTy) {
1147 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1148 break;
1152 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1153 break;
1154 default:
1155 return false;
1156 }
1157
1158 if (!TRI->getCommonSubClass(OpRC, SplatRC))
1159 return false;
1160 }
1161
// Final check: ask the target whether an immediate with this value is legal
// in the operand slot, using a temporary operand.
1162 MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
1163 if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
1164 return false;
1165
1166 return true;
1167}
1168
// Try to queue a fold of the immediate OpToFold into operand UseOpIdx of
// UseMI. Returns true only when a fold candidate was appended to FoldList;
// non-immediate values and operands that are not described SI source operands
// are rejected.
1169bool SIFoldOperandsImpl::tryToFoldACImm(
1170 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
1171 SmallVectorImpl<FoldCandidate> &FoldList) const {
1172 const MCInstrDesc &Desc = UseMI->getDesc();
// Operands beyond the static description have no operand info to consult.
1173 if (UseOpIdx >= Desc.getNumOperands())
1174 return false;
1175
1176 // Filter out unhandled pseudos.
1177 if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
1178 return false;
1179
// Only immediates that are legal for this operand slot are considered.
1180 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
1183 return false;
1184 appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
1185 return true;
1186 }
1187
1188 return false;
1189}
1190
// Try to fold OpToFold into operand UseOpIdx of UseMI.
//
// Depending on the use, this either rewrites UseMI right away (frame index
// folds, turning a COPY into a MOV, replacing readfirstlane/readlane),
// recurses into the users of a REG_SEQUENCE, or queues a candidate on
// FoldList through tryAddToFoldList. Copies that were modified here are
// recorded in CopiesToReplace.
1191void SIFoldOperandsImpl::foldOperand(
1192 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
1193 SmallVectorImpl<FoldCandidate> &FoldList,
1194 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
1195 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
1196
1197 if (!isUseSafeToFold(*UseMI, *UseOp))
1198 return;
1199
1200 // FIXME: Fold operands with subregs.
1201 if (UseOp->isReg() && OpToFold.isReg()) {
1202 if (UseOp->isImplicit())
1203 return;
1204 // Allow folding from SGPRs to 16-bit VGPRs.
1205 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
1206 (UseOp->getSubReg() != AMDGPU::lo16 ||
1207 !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1208 return;
1209 }
1210
1211 // Special case for REG_SEQUENCE: We can't fold literals into
1212 // REG_SEQUENCE instructions, so we have to fold them into the
1213 // uses of REG_SEQUENCE.
1214 if (UseMI->isRegSequence()) {
1215 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
// The subregister index of a reg_sequence input is the operand right after
// it.
1216 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
1217
1218 int64_t SplatVal;
1219 const TargetRegisterClass *SplatRC;
1220 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
1221
1222 // Grab the use operands first
1224 llvm::make_pointer_range(MRI->use_nodbg_operands(RegSeqDstReg)));
// Index-based loop: UsesToProcess may grow while it is being walked (see
// append_range below).
1225 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1226 MachineOperand *RSUse = UsesToProcess[I];
1227 MachineInstr *RSUseMI = RSUse->getParent();
1228 unsigned OpNo = RSUseMI->getOperandNo(RSUse);
1229
// A non-null SplatRC means the whole reg_sequence is a splat constant.
1230 if (SplatRC) {
1231 if (RSUseMI->isCopy()) {
1232 Register DstReg = RSUseMI->getOperand(0).getReg();
1233 append_range(UsesToProcess,
1235 continue;
1236 }
1237 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1238 FoldableDef SplatDef(SplatVal, SplatRC);
1239 appendFoldCandidate(FoldList, RSUseMI, OpNo, SplatDef);
1240 continue;
1241 }
1242 }
1243
1244 // TODO: Handle general compose
1245 if (RSUse->getSubReg() != RegSeqDstSubReg)
1246 continue;
1247
1248 // FIXME: We should avoid recursing here. There should be a cleaner split
1249 // between the in-place mutations and adding to the fold list.
1250 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1251 CopiesToReplace);
1252 }
1253
1254 return;
1255 }
1256
1257 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
1258 return;
1259
1260 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
1261 // Verify that this is a stack access.
1262 // FIXME: Should probably use stack pseudos before frame lowering.
1263
1264 if (TII->isMUBUF(*UseMI)) {
1265 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1266 MFI->getScratchRSrcReg())
1267 return;
1268
1269 // Ensure this is either relative to the current frame or the current
1270 // wave.
1271 MachineOperand &SOff =
1272 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
1273 if (!SOff.isImm() || SOff.getImm() != 0)
1274 return;
1275 }
1276
1277 const unsigned Opc = UseMI->getOpcode();
// A flat-scratch access that has a vaddr operand but no saddr operand is
// switched to the opcode returned by getFlatScratchInstSSfromSV.
1278 if (TII->isFLATScratch(*UseMI) &&
1279 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
1280 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
1281 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
1282 unsigned CPol =
1283 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
1284 if ((CPol & AMDGPU::CPol::SCAL) &&
1286 return;
1287
1288 UseMI->setDesc(TII->get(NewOpc));
1289 }
1290
1291 // A frame index will resolve to a positive constant, so it should always be
1292 // safe to fold the addressing mode, even pre-GFX9.
1293 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getFI());
1294
1295 return;
1296 }
1297
1298 bool FoldingImmLike =
1299 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1300
1301 if (FoldingImmLike && UseMI->isCopy()) {
1302 Register DestReg = UseMI->getOperand(0).getReg();
1303 Register SrcReg = UseMI->getOperand(1).getReg();
1304 unsigned UseSubReg = UseMI->getOperand(1).getSubReg();
1305 assert(SrcReg.isVirtual());
1306
1307 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
1308
1309 // Don't fold into a copy to a physical register with the same class. Doing
1310 // so would interfere with the register coalescer's logic which would avoid
1311 // redundant initializations.
1312 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
1313 return;
1314
1315 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1316 // In order to fold immediates into copies, we need to change the copy to a
1317 // MOV. Find a compatible mov instruction with the value.
// The candidates are tried in the listed order. The first one whose
// destination class accepts DestRC decides the outcome: every path after
// the class check below ends in a break.
1318 for (unsigned MovOp :
1319 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1320 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1321 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1322 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1323 const MCInstrDesc &MovDesc = TII->get(MovOp);
1324 const TargetRegisterClass *MovDstRC =
1325 TRI->getRegClass(TII->getOpRegClassID(MovDesc.operands()[0]));
1326
1327 // Fold if the destination register class of the MOV instruction (MovDstRC)
1328 // is a superclass of (or equal to) the destination register class of the
1329 // COPY (DestRC). If this condition fails, folding would be illegal.
1330 if (!DestRC->hasSuperClassEq(MovDstRC))
1331 continue;
1332
// V_MOV_B16_t16_e64 has a modifiers operand before its source, so the
// source sits at index 2 instead of 1 (see the operand rebuild below).
1333 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1334
1335 int16_t RegClassID = TII->getOpRegClassID(MovDesc.operands()[SrcIdx]);
1336 if (RegClassID != -1) {
1337 const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID);
1338
1339 if (UseSubReg)
1340 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1341
1342 // FIXME: We should be able to directly check immediate operand legality
1343 // for all cases, but gfx908 hacks break.
1344 if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
1345 (!OpToFold.isImm() ||
1346 !TII->isImmOperandLegal(MovDesc, SrcIdx,
1347 *OpToFold.getEffectiveImmVal())))
1348 break;
1349
1350 if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1351 break;
1352
1353 // FIXME: This is mutating the instruction only and deferring the actual
1354 // fold of the immediate
1355 } else {
1356 // For the _IMM_PSEUDO cases, there can be value restrictions on the
1357 // immediate to verify. Technically we should always verify this, but it
1358 // only matters for these concrete cases.
1359 // TODO: Handle non-imm case if it's useful.
1360 if (!OpToFold.isImm() ||
1361 !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1362 break;
1363 }
1364
1367 while (ImpOpI != ImpOpE) {
1368 MachineInstr::mop_iterator Tmp = ImpOpI;
1369 ImpOpI++;
1371 }
1372 UseMI->setDesc(MovDesc);
1373
1374 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
// Rebuild the operand list in the order the comments below name:
// src0_modifiers, src0, op_sel.
1375 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
1376 MachineOperand NewSrcOp(SrcOp);
1377 MachineFunction *MF = UseMI->getMF();
1378 UseMI->removeOperand(1);
1379 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1380 UseMI->addOperand(NewSrcOp); // src0
1381 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1382 UseOpIdx = SrcIdx;
1383 UseOp = &UseMI->getOperand(UseOpIdx);
1384 }
1385 CopiesToReplace.push_back(UseMI);
1386 break;
1387 }
1388
1389 // We failed to replace the copy, so give up.
1390 if (UseMI->getOpcode() == AMDGPU::COPY)
1391 return;
1392
1393 } else {
1394 if (UseMI->isCopy() && OpToFold.isReg() &&
1395 UseMI->getOperand(0).getReg().isVirtual() &&
1396 !UseMI->getOperand(1).getSubReg() &&
1397 OpToFold.DefMI->implicit_operands().empty()) {
1398 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
1399 << *UseMI);
1400 unsigned Size = TII->getOpSize(*UseMI, 1);
1401 Register UseReg = OpToFold.getReg();
1403 unsigned SubRegIdx = OpToFold.getSubReg();
1404 // Hack to allow 32-bit SGPRs to be folded into True16 instructions
1405 // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
1406 // VS_16RegClass
1407 //
1408 // Excerpt from AMDGPUGenRegisterInfoEnums.inc
1409 // NoSubRegister, //0
1410 // hi16, // 1
1411 // lo16, // 2
1412 // sub0, // 3
1413 // ...
1414 // sub1, // 11
1415 // sub1_hi16, // 12
1416 // sub1_lo16, // 13
1417 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1418 if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1419 TRI->isSGPRReg(*MRI, UseReg)) {
1420 // Produce the 32 bit subregister index to which the 16-bit subregister
1421 // is aligned.
1422 if (SubRegIdx > AMDGPU::sub1) {
1423 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1424 M |= M.getLane(M.getHighestLane() - 1);
1425 SmallVector<unsigned, 4> Indexes;
1426 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1427 Indexes);
1428 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1429 SubRegIdx = Indexes[0];
1430 // 32-bit registers do not have a sub0 index
1431 } else if (TII->getOpSize(*UseMI, 1) == 4)
1432 SubRegIdx = 0;
1433 else
1434 SubRegIdx = AMDGPU::sub0;
1435 }
1436 UseMI->getOperand(1).setSubReg(SubRegIdx);
1437 UseMI->getOperand(1).setIsKill(false);
1438 CopiesToReplace.push_back(UseMI);
1439 OpToFold.OpToFold->setIsKill(false);
1440
1441 // Remove kill flags as kills may now be out of order with uses.
1442 MRI->clearKillFlags(UseReg);
1443 if (foldCopyToAGPRRegSequence(UseMI))
1444 return;
1445 }
1446
1447 unsigned UseOpc = UseMI->getOpcode();
// For V_READLANE_B32 only a fold into src0 is handled here;
// V_READFIRSTLANE_B32 is accepted without an operand-index check.
1448 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1449 (UseOpc == AMDGPU::V_READLANE_B32 &&
1450 (int)UseOpIdx ==
1451 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1452 // %vgpr = V_MOV_B32 imm
1453 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1454 // =>
1455 // %sgpr = S_MOV_B32 imm
1456 if (FoldingImmLike) {
1458 UseMI->getOperand(UseOpIdx).getReg(),
1459 *OpToFold.DefMI, *UseMI))
1460 return;
1461
1462 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1464
1465 if (OpToFold.isImm()) {
1467 *OpToFold.getEffectiveImmVal());
1468 } else if (OpToFold.isFI())
1469 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getFI());
1470 else {
1471 assert(OpToFold.isGlobal());
1472 UseMI->getOperand(1).ChangeToGA(OpToFold.OpToFold->getGlobal(),
1473 OpToFold.OpToFold->getOffset(),
1474 OpToFold.OpToFold->getTargetFlags());
1475 }
1476 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1477 return;
1478 }
1479
1480 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1482 UseMI->getOperand(UseOpIdx).getReg(),
1483 *OpToFold.DefMI, *UseMI))
1484 return;
1485
1486 // %vgpr = COPY %sgpr0
1487 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1488 // =>
1489 // %sgpr1 = COPY %sgpr0
1490 UseMI->setDesc(TII->get(AMDGPU::COPY));
1491 UseMI->getOperand(1).setReg(OpToFold.getReg());
1492 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1493 UseMI->getOperand(1).setIsKill(false);
1494 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1496 return;
1497 }
1498 }
1499
1500 const MCInstrDesc &UseDesc = UseMI->getDesc();
1501
1502 // Don't fold into target independent nodes. Target independent opcodes
1503 // don't have defined register classes.
1504 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1505 UseDesc.operands()[UseOpIdx].RegClass == -1)
1506 return;
1507 }
1508
1509 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1510 // to enable more folding opportunities. The shrink operands pass
1511 // already does this.
1512
// Everything that was not handled in place above is queued as a candidate.
1513 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
1514}
1515
// Evaluate the binary operation Opcode on the constant operands LHS and RHS.
// On success the value is stored in the 32-bit Result and true is returned;
// false is returned, with Result untouched, for opcodes that are not handled.
1516static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1518 switch (Opcode) {
1519 case AMDGPU::S_ADD_I32:
1520 case AMDGPU::S_ADD_U32:
1521 Result = LHS + RHS;
1522 return true;
1523 case AMDGPU::S_SUB_I32:
1524 case AMDGPU::S_SUB_U32:
1525 Result = LHS - RHS;
1526 return true;
1527 case AMDGPU::V_AND_B32_e64:
1528 case AMDGPU::V_AND_B32_e32:
1529 case AMDGPU::S_AND_B32:
1530 Result = LHS & RHS;
1531 return true;
1532 case AMDGPU::V_OR_B32_e64:
1533 case AMDGPU::V_OR_B32_e32:
1534 case AMDGPU::S_OR_B32:
1535 Result = LHS | RHS;
1536 return true;
1537 case AMDGPU::V_XOR_B32_e64:
1538 case AMDGPU::V_XOR_B32_e32:
1539 case AMDGPU::S_XOR_B32:
1540 Result = LHS ^ RHS;
1541 return true;
1542 case AMDGPU::S_XNOR_B32:
1543 Result = ~(LHS ^ RHS);
1544 return true;
1545 case AMDGPU::S_NAND_B32:
1546 Result = ~(LHS & RHS);
1547 return true;
1548 case AMDGPU::S_NOR_B32:
1549 Result = ~(LHS | RHS);
1550 return true;
// The *N2 forms invert the second operand before combining.
1551 case AMDGPU::S_ANDN2_B32:
1552 Result = LHS & ~RHS;
1553 return true;
1554 case AMDGPU::S_ORN2_B32:
1555 Result = LHS | ~RHS;
1556 return true;
1557 case AMDGPU::V_LSHL_B32_e64:
1558 case AMDGPU::V_LSHL_B32_e32:
1559 case AMDGPU::S_LSHL_B32:
1560 // The instruction ignores the high bits for out of bounds shifts.
1561 Result = LHS << (RHS & 31);
1562 return true;
// The *REV forms take the shift amount from LHS and the shifted value from
// RHS.
1563 case AMDGPU::V_LSHLREV_B32_e64:
1564 case AMDGPU::V_LSHLREV_B32_e32:
1565 Result = RHS << (LHS & 31);
1566 return true;
1567 case AMDGPU::V_LSHR_B32_e64:
1568 case AMDGPU::V_LSHR_B32_e32:
1569 case AMDGPU::S_LSHR_B32:
1570 Result = LHS >> (RHS & 31);
1571 return true;
1572 case AMDGPU::V_LSHRREV_B32_e64:
1573 case AMDGPU::V_LSHRREV_B32_e32:
1574 Result = RHS >> (LHS & 31);
1575 return true;
// Arithmetic shifts: the shifted value is cast to int32_t before shifting.
1576 case AMDGPU::V_ASHR_I32_e64:
1577 case AMDGPU::V_ASHR_I32_e32:
1578 case AMDGPU::S_ASHR_I32:
1579 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1580 return true;
1581 case AMDGPU::V_ASHRREV_I32_e64:
1582 case AMDGPU::V_ASHRREV_I32_e32:
1583 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1584 return true;
1585 default:
1586 return false;
1587 }
1588}
1589
1590static unsigned getMovOpc(bool IsScalar) {
1591 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1592}
1593
1594// Try to simplify operations with a constant that may appear after instruction
1595// selection.
1596// TODO: See if a frame index with a fixed offset can fold.
1597bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1598 if (!MI->allImplicitDefsAreDead())
1599 return false;
1600
1601 unsigned Opc = MI->getOpcode();
1602
1603 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1604 if (Src0Idx == -1)
1605 return false;
1606
1607 MachineOperand *Src0 = &MI->getOperand(Src0Idx);
1608 std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
1609
1610 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1611 Opc == AMDGPU::S_NOT_B32) &&
1612 Src0Imm) {
1613 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1614 TII->mutateAndCleanupImplicit(
1615 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1616 return true;
1617 }
1618
1619 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1620 if (Src1Idx == -1)
1621 return false;
1622
1623 MachineOperand *Src1 = &MI->getOperand(Src1Idx);
1624 std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
1625
1626 if (!Src0Imm && !Src1Imm)
1627 return false;
1628
1629 // and k0, k1 -> v_mov_b32 (k0 & k1)
1630 // or k0, k1 -> v_mov_b32 (k0 | k1)
1631 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1632 if (Src0Imm && Src1Imm) {
1633 int32_t NewImm;
1634 if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
1635 return false;
1636
1637 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1638
1639 // Be careful to change the right operand, src0 may belong to a different
1640 // instruction.
1641 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1642 MI->removeOperand(Src1Idx);
1643 TII->mutateAndCleanupImplicit(*MI, TII->get(getMovOpc(IsSGPR)));
1644 return true;
1645 }
1646
1647 // S_SUB_* is not commutable, so handle it before the commutability gate.
1648 // Only `x - 0 -> copy x` is valid; `0 - x` is a negation, not a copy.
1649 if (Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U32) {
1650 if (Src1Imm && static_cast<int32_t>(*Src1Imm) == 0) {
1651 // y = sub x, 0 => y = copy x
1652 MI->removeOperand(Src1Idx);
1653 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1654 return true;
1655 }
1656 return false;
1657 }
1658
1659 if (!MI->isCommutable())
1660 return false;
1661
1662 if (Src0Imm && !Src1Imm) {
1663 std::swap(Src0, Src1);
1664 std::swap(Src0Idx, Src1Idx);
1665 std::swap(Src0Imm, Src1Imm);
1666 }
1667
1668 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
1669 if (Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_ADD_U32) {
1670 if (Src1Val == 0) {
1671 // y = add x, 0 => y = copy x
1672 MI->removeOperand(Src1Idx);
1673 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1674 return true;
1675 }
1676 return false;
1677 }
1678
1679 if (Opc == AMDGPU::V_OR_B32_e64 ||
1680 Opc == AMDGPU::V_OR_B32_e32 ||
1681 Opc == AMDGPU::S_OR_B32) {
1682 if (Src1Val == 0) {
1683 // y = or x, 0 => y = copy x
1684 MI->removeOperand(Src1Idx);
1685 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1686 } else if (Src1Val == -1) {
1687 // y = or x, -1 => y = v_mov_b32 -1
1688 MI->removeOperand(Src0Idx);
1689 TII->mutateAndCleanupImplicit(
1690 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1691 } else
1692 return false;
1693
1694 return true;
1695 }
1696
1697 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1698 Opc == AMDGPU::S_AND_B32) {
1699 if (Src1Val == 0) {
1700 // y = and x, 0 => y = v_mov_b32 0
1701 MI->removeOperand(Src0Idx);
1702 TII->mutateAndCleanupImplicit(
1703 *MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1704 } else if (Src1Val == -1) {
1705 // y = and x, -1 => y = copy x
1706 MI->removeOperand(Src1Idx);
1707 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1708 } else
1709 return false;
1710
1711 return true;
1712 }
1713
1714 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1715 Opc == AMDGPU::S_XOR_B32) {
1716 if (Src1Val == 0) {
1717 // y = xor x, 0 => y = copy x
1718 MI->removeOperand(Src1Idx);
1719 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1720 return true;
1721 }
1722 }
1723
1724 return false;
1725}
1726
1727// Try to fold an instruction into a simpler one
1728bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1729 unsigned Opc = MI.getOpcode();
1730 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1731 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1732 return false;
1733
1734 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1735 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1736 if (!Src1->isIdenticalTo(*Src0)) {
1737 std::optional<int64_t> Src1Imm = TII->getImmOrMaterializedImm(*Src1);
1738 if (!Src1Imm)
1739 return false;
1740
1741 std::optional<int64_t> Src0Imm = TII->getImmOrMaterializedImm(*Src0);
1742 if (!Src0Imm || *Src0Imm != *Src1Imm)
1743 return false;
1744 }
1745
1746 int Src1ModIdx =
1747 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1748 int Src0ModIdx =
1749 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1750 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1751 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1752 return false;
1753
1754 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1755 auto &NewDesc =
1756 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1757 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1758 if (Src2Idx != -1)
1759 MI.removeOperand(Src2Idx);
1760 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1761 if (Src1ModIdx != -1)
1762 MI.removeOperand(Src1ModIdx);
1763 if (Src0ModIdx != -1)
1764 MI.removeOperand(Src0ModIdx);
1765 TII->mutateAndCleanupImplicit(MI, NewDesc);
1766 LLVM_DEBUG(dbgs() << MI);
1767 return true;
1768}
1769
1770bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1771 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1772 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1773 return false;
1774
1775 std::optional<int64_t> Src0Imm =
1776 TII->getImmOrMaterializedImm(MI.getOperand(1));
1777 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1778 return false;
1779
1780 Register Src1 = MI.getOperand(2).getReg();
1781 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1782 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1783 return false;
1784
1785 Register Dst = MI.getOperand(0).getReg();
1786 MRI->replaceRegWith(Dst, Src1);
1787 if (!MI.getOperand(2).isKill())
1788 MRI->clearKillFlags(Src1);
1789 MI.eraseFromParent();
1790 return true;
1791}
1792
// Fold OpToFold into the non-debug uses of the register in MI's operand 0.
//
// Uses are first given a chance to constant fold (immediates only), then
// every use operand is handed to foldOperand, and finally the queued fold
// candidates are applied. Returns the constant-folding result when nothing
// was queued or rewritten, and true otherwise.
1793bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1794 const FoldableDef &OpToFold) const {
1795 // We need to mutate the operands of new mov instructions to add implicit
1796 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1797 // this.
1798 SmallVector<MachineInstr *, 4> CopiesToReplace;
1800 MachineOperand &Dst = MI.getOperand(0);
1801 bool Changed = false;
1802
1803 if (OpToFold.isImm()) {
1804 for (auto &UseMI :
1805 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1806 // Folding the immediate may reveal operations that can be constant
1807 // folded or replaced with a copy. This can happen for example after
1808 // frame indices are lowered to constants or from splitting 64-bit
1809 // constants.
1810 //
1811 // We may also encounter cases where one or both operands are
1812 // immediates materialized into a register, which would ordinarily not
1813 // be folded due to multiple uses or operand constraints.
1814 if (tryConstantFoldOp(&UseMI)) {
1815 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1816 Changed = true;
1817 }
1818 }
1819 }
1820
1822 llvm::make_pointer_range(MRI->use_nodbg_operands(Dst.getReg())));
// Each use sees the value narrowed by the subregister it reads.
1823 for (auto *U : UsesToProcess) {
1824 MachineInstr *UseMI = U->getParent();
1825
1826 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1827 foldOperand(SubOpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1828 CopiesToReplace);
1829 }
1830
1831 if (CopiesToReplace.empty() && FoldList.empty())
1832 return Changed;
1833
1834 MachineFunction *MF = MI.getMF();
1835 // Make sure we add EXEC uses to any new v_mov instructions created.
1836 for (MachineInstr *Copy : CopiesToReplace)
1837 Copy->addImplicitDefUseOperands(*MF);
1838
1839 SetVector<MachineInstr *> ConstantFoldCandidates;
1840 for (FoldCandidate &Fold : FoldList) {
1841 assert(!Fold.isReg() || Fold.Def.OpToFold);
// Skip a virtual-register fold when its defining instruction reads EXEC
// and EXEC may be modified before the use.
1842 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1843 Register Reg = Fold.getReg();
1844 const MachineInstr *DefMI = Fold.Def.DefMI;
1845 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1846 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1847 continue;
1848 }
1849 if (updateOperand(Fold)) {
1850 // Clear kill flags.
1851 if (Fold.isReg()) {
1852 assert(Fold.Def.OpToFold && Fold.isReg());
1853 // FIXME: Probably shouldn't bother trying to fold if not an
1854 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1855 // copies.
1856 MRI->clearKillFlags(Fold.getReg());
1857 }
1858 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1859 << static_cast<int>(Fold.UseOpNo) << " of "
1860 << *Fold.UseMI);
1861
// Users that just received an immediate are revisited for constant folding
// after all candidates have been applied.
1862 if (Fold.isImm())
1863 ConstantFoldCandidates.insert(Fold.UseMI);
1864
1865 } else if (Fold.Commuted) {
1866 // Restoring instruction's original operand order if fold has failed.
1867 TII->commuteInstruction(*Fold.UseMI, false);
1868 }
1869 }
1870
1871 for (MachineInstr *MI : ConstantFoldCandidates) {
1872 if (tryConstantFoldOp(MI)) {
1873 LLVM_DEBUG(dbgs() << "Constant folded " << *MI);
1874 Changed = true;
1875 }
1876 }
1877 return true;
1878}
1879
1880/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
1881/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
1882bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1883 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
1884 // only accept VGPR or inline immediate. Recreate a reg_sequence with its
1885 // initializers right here, so we will rematerialize immediates and avoid
1886 // copies via different reg classes.
1887 const TargetRegisterClass *DefRC =
1888 MRI->getRegClass(CopyMI->getOperand(0).getReg());
1889 if (!TRI->isAGPRClass(DefRC))
1890 return false;
1891
1892 Register UseReg = CopyMI->getOperand(1).getReg();
1893 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1894 if (!RegSeq || !RegSeq->isRegSequence())
1895 return false;
1896
1897 const DebugLoc &DL = CopyMI->getDebugLoc();
1898 MachineBasicBlock &MBB = *CopyMI->getParent();
1899
1900 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1901 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1902
1903 const TargetRegisterClass *UseRC =
1904 MRI->getRegClass(CopyMI->getOperand(1).getReg());
1905
1906 // Value, subregindex for new REG_SEQUENCE
1908
1909 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1910 unsigned NumFoldable = 0;
1911
1912 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1913 MachineOperand &RegOp = RegSeq->getOperand(I);
1914 unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
1915
1916 if (RegOp.getSubReg()) {
1917 // TODO: Handle subregister compose
1918 NewDefs.emplace_back(&RegOp, SubRegIdx);
1919 continue;
1920 }
1921
1922 MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
1923 if (!Lookup)
1924 Lookup = &RegOp;
1925
1926 if (Lookup->isImm()) {
1927 // Check if this is an agpr_32 subregister.
1928 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1929 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1930 if (DestSuperRC &&
1931 TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1932 ++NumFoldable;
1933 NewDefs.emplace_back(Lookup, SubRegIdx);
1934 continue;
1935 }
1936 }
1937
1938 const TargetRegisterClass *InputRC =
1939 Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
1940 : MRI->getRegClass(RegOp.getReg());
1941
1942 // TODO: Account for Lookup->getSubReg()
1943
1944 // If we can't find a matching super class, this is an SGPR->AGPR or
1945 // VGPR->AGPR subreg copy (or something constant-like we have to materialize
1946 // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
1947 // want to rewrite to copy to an intermediate VGPR class.
1948 const TargetRegisterClass *MatchRC =
1949 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1950 if (!MatchRC) {
1951 ++NumFoldable;
1952 NewDefs.emplace_back(&RegOp, SubRegIdx);
1953 continue;
1954 }
1955
1956 NewDefs.emplace_back(&RegOp, SubRegIdx);
1957 }
1958
1959 // Do not clone a reg_sequence and merely change the result register class.
1960 if (NumFoldable == 0)
1961 return false;
1962
1963 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1964 for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
1965 CopyMI->removeOperand(I);
1966
1967 for (auto [Def, DestSubIdx] : NewDefs) {
1968 if (!Def->isReg()) {
1969 // TODO: Should we use single write for each repeated value like in
1970 // register case?
1971 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1972 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1973 .add(*Def);
1974 B.addReg(Tmp);
1975 } else {
1976 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1977 Def->setIsKill(false);
1978
1979 Register &VGPRCopy = VGPRCopies[Src];
1980 if (!VGPRCopy) {
1981 const TargetRegisterClass *VGPRUseSubRC =
1982 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1983
1984 // We cannot build a reg_sequence out of the same registers, they
1985 // must be copied. Better do it here before copyPhysReg() created
1986 // several reads to do the AGPR->VGPR->AGPR copy.
1987
1988 // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
1989 // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
1990 // later, create a copy here and track if we already have such a copy.
1991 const TargetRegisterClass *SubRC =
1992 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1993 if (!VGPRUseSubRC->hasSubClassEq(SubRC)) {
1994 // TODO: Try to reconstrain class
1995 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1996 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
1997 B.addReg(VGPRCopy);
1998 } else {
1999 // If it is already a VGPR, do not copy the register.
2000 B.add(*Def);
2001 }
2002 } else {
2003 B.addReg(VGPRCopy);
2004 }
2005 }
2006
2007 B.addImm(DestSubIdx);
2008 }
2009
2010 LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
2011 return true;
2012}
2013
2014bool SIFoldOperandsImpl::tryFoldFoldableCopy(
2015 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
2016 Register DstReg = MI.getOperand(0).getReg();
2017 // Specially track simple redefs of m0 to the same value in a block, so we
2018 // can erase the later ones.
2019 if (DstReg == AMDGPU::M0) {
2020 MachineOperand &NewM0Val = MI.getOperand(1);
2021 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
2022 MI.eraseFromParent();
2023 return true;
2024 }
2025
2026 // We aren't tracking other physical registers
2027 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
2028 ? nullptr
2029 : &NewM0Val;
2030 return false;
2031 }
2032
2033 MachineOperand *OpToFoldPtr;
2034 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
2035 // Folding when any src_modifiers are non-zero is unsupported
2036 if (TII->hasAnyModifiersSet(MI))
2037 return false;
2038 OpToFoldPtr = &MI.getOperand(2);
2039 } else
2040 OpToFoldPtr = &MI.getOperand(1);
2041 MachineOperand &OpToFold = *OpToFoldPtr;
2042 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
2043
2044 // FIXME: We could also be folding things like TargetIndexes.
2045 if (!FoldingImm && !OpToFold.isReg())
2046 return false;
2047
2048 // Fold virtual registers and constant physical registers.
2049 if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
2050 !TRI->isConstantPhysReg(OpToFold.getReg()))
2051 return false;
2052
2053 // Prevent folding operands backwards in the function. For example,
2054 // the COPY opcode must not be replaced by 1 in this example:
2055 //
2056 // %3 = COPY %vgpr0; VGPR_32:%3
2057 // ...
2058 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
2059 if (!DstReg.isVirtual())
2060 return false;
2061
2062 const TargetRegisterClass *DstRC =
2063 MRI->getRegClass(MI.getOperand(0).getReg());
2064
2065 // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
2066 // Can remove this code if proper 16-bit SGPRs are implemented
2067 // Example: Pre-peephole-opt
2068 // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
2069 // %32:sreg_32 = COPY %29:sgpr_lo16
2070 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2071 // Post-peephole-opt and DCE
2072 // %32:sreg_32 = COPY %16.lo16:sreg_32
2073 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2074 // After this transform
2075 // %32:sreg_32 = COPY %16:sreg_32
2076 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
2077 // After the fold operands pass
2078 // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
2079 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2080 OpToFold.getSubReg()) {
2081 if (DstRC == &AMDGPU::SReg_32RegClass &&
2082 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2083 assert(OpToFold.getSubReg() == AMDGPU::lo16);
2084 OpToFold.setSubReg(0);
2085 }
2086 }
2087
2088 // Fold copy to AGPR through reg_sequence
2089 // TODO: Handle with subregister extract
2090 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2091 if (foldCopyToAGPRRegSequence(&MI))
2092 return true;
2093 }
2094
2095 FoldableDef Def(OpToFold, DstRC);
2096 bool Changed = foldInstOperand(MI, Def);
2097
2098 // If we managed to fold all uses of this copy then we might as well
2099 // delete it now.
2100 // The only reason we need to follow chains of copies here is that
2101 // tryFoldRegSequence looks forward through copies before folding a
2102 // REG_SEQUENCE into its eventual users.
2103 auto *InstToErase = &MI;
2104 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2105 auto &SrcOp = InstToErase->getOperand(1);
2106 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
2107 InstToErase->eraseFromParent();
2108 Changed = true;
2109 InstToErase = nullptr;
2110 if (!SrcReg || SrcReg.isPhysical())
2111 break;
2112 InstToErase = MRI->getVRegDef(SrcReg);
2113 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2114 break;
2115 }
2116
2117 if (InstToErase && InstToErase->isRegSequence() &&
2118 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2119 InstToErase->eraseFromParent();
2120 Changed = true;
2121 }
2122
2123 if (Changed)
2124 return true;
2125
2126 // Run this after foldInstOperand to avoid turning scalar additions into
2127 // vector additions when the result scalar result could just be folded into
2128 // the user(s).
2129 return OpToFold.isReg() &&
2130 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2131}
2132
2133// Clamp patterns are canonically selected to v_max_* instructions, so only
2134// handle them.
2135const MachineOperand *
2136SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2137 unsigned Op = MI.getOpcode();
2138 switch (Op) {
2139 case AMDGPU::V_MAX_F32_e64:
2140 case AMDGPU::V_MAX_F16_e64:
2141 case AMDGPU::V_MAX_F16_t16_e64:
2142 case AMDGPU::V_MAX_F16_fake16_e64:
2143 case AMDGPU::V_MAX_F64_e64:
2144 case AMDGPU::V_MAX_NUM_F64_e64:
2145 case AMDGPU::V_PK_MAX_F16:
2146 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2147 case AMDGPU::V_PK_MAX_NUM_BF16: {
2148 if (MI.mayRaiseFPException())
2149 return nullptr;
2150
2151 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2152 return nullptr;
2153
2154 // Make sure sources are identical.
2155 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2156 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2157 if (!Src0->isReg() || !Src1->isReg() ||
2158 Src0->getReg() != Src1->getReg() ||
2159 Src0->getSubReg() != Src1->getSubReg() ||
2160 Src0->getSubReg() != AMDGPU::NoSubRegister)
2161 return nullptr;
2162
2163 // Can't fold up if we have modifiers.
2164 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2165 return nullptr;
2166
2167 unsigned Src0Mods
2168 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2169 unsigned Src1Mods
2170 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2171
2172 // Having a 0 op_sel_hi would require swizzling the output in the source
2173 // instruction, which we can't do.
2174 unsigned UnsetMods =
2175 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2177 : 0u;
2178 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2179 return nullptr;
2180 return Src0;
2181 }
2182 default:
2183 return nullptr;
2184 }
2185}
2186
2187// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
2188bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2189 const MachineOperand *ClampSrc = isClamp(MI);
2190 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2191 return false;
2192
2193 if (!ClampSrc->getReg().isVirtual())
2194 return false;
2195
2196 // Look through COPY. COPY only observed with True16.
2197 Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
2198 MachineInstr *Def =
2199 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2200
2201 // The type of clamp must be compatible.
2202 if (!SIInstrInfo::hasSameClamp(*Def, MI))
2203 return false;
2204
2205 if (Def->mayRaiseFPException())
2206 return false;
2207
2208 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2209 if (!DefClamp)
2210 return false;
2211
2212 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2213
2214 // Clamp is applied after omod, so it is OK if omod is set.
2215 DefClamp->setImm(1);
2216
2217 Register DefReg = Def->getOperand(0).getReg();
2218 Register MIDstReg = MI.getOperand(0).getReg();
2219 if (TRI->isSGPRReg(*MRI, DefReg)) {
2220 // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
2221 // instruction with a VGPR dst.
2222 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
2223 MIDstReg)
2224 .addReg(DefReg);
2225 } else {
2226 MRI->replaceRegWith(MIDstReg, DefReg);
2227 }
2228 MI.eraseFromParent();
2229
2230 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2231 // instruction, so we might as well convert it to the more flexible VOP3-only
2232 // mad/fma form.
2233 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2234 Def->eraseFromParent();
2235
2236 return true;
2237}
2238
2239static int getOModValue(unsigned Opc, int64_t Val) {
2240 switch (Opc) {
2241 case AMDGPU::V_MUL_F64_e64:
2242 case AMDGPU::V_MUL_F64_pseudo_e64: {
2243 switch (Val) {
2244 case 0x3fe0000000000000: // 0.5
2245 return SIOutMods::DIV2;
2246 case 0x4000000000000000: // 2.0
2247 return SIOutMods::MUL2;
2248 case 0x4010000000000000: // 4.0
2249 return SIOutMods::MUL4;
2250 default:
2251 return SIOutMods::NONE;
2252 }
2253 }
2254 case AMDGPU::V_MUL_F32_e64: {
2255 switch (static_cast<uint32_t>(Val)) {
2256 case 0x3f000000: // 0.5
2257 return SIOutMods::DIV2;
2258 case 0x40000000: // 2.0
2259 return SIOutMods::MUL2;
2260 case 0x40800000: // 4.0
2261 return SIOutMods::MUL4;
2262 default:
2263 return SIOutMods::NONE;
2264 }
2265 }
2266 case AMDGPU::V_MUL_F16_e64:
2267 case AMDGPU::V_MUL_F16_t16_e64:
2268 case AMDGPU::V_MUL_F16_fake16_e64: {
2269 switch (static_cast<uint16_t>(Val)) {
2270 case 0x3800: // 0.5
2271 return SIOutMods::DIV2;
2272 case 0x4000: // 2.0
2273 return SIOutMods::MUL2;
2274 case 0x4400: // 4.0
2275 return SIOutMods::MUL4;
2276 default:
2277 return SIOutMods::NONE;
2278 }
2279 }
2280 default:
2281 llvm_unreachable("invalid mul opcode");
2282 }
2283}
2284
2285// FIXME: Does this really not support denormals with f16?
2286// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
2287// handled, so will anything other than that break?
2288std::pair<const MachineOperand *, int>
2289SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2290 unsigned Op = MI.getOpcode();
2291 switch (Op) {
2292 case AMDGPU::V_MUL_F64_e64:
2293 case AMDGPU::V_MUL_F64_pseudo_e64:
2294 case AMDGPU::V_MUL_F32_e64:
2295 case AMDGPU::V_MUL_F16_t16_e64:
2296 case AMDGPU::V_MUL_F16_fake16_e64:
2297 case AMDGPU::V_MUL_F16_e64: {
2298 // If output denormals are enabled, omod is ignored.
2299 if ((Op == AMDGPU::V_MUL_F32_e64 &&
2301 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2302 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2303 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2306 MI.mayRaiseFPException())
2307 return std::pair(nullptr, SIOutMods::NONE);
2308
2309 const MachineOperand *RegOp = nullptr;
2310 const MachineOperand *ImmOp = nullptr;
2311 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2312 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2313 if (Src0->isImm()) {
2314 ImmOp = Src0;
2315 RegOp = Src1;
2316 } else if (Src1->isImm()) {
2317 ImmOp = Src1;
2318 RegOp = Src0;
2319 } else
2320 return std::pair(nullptr, SIOutMods::NONE);
2321
2322 int OMod = getOModValue(Op, ImmOp->getImm());
2323 if (OMod == SIOutMods::NONE ||
2324 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2325 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2326 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2327 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2328 return std::pair(nullptr, SIOutMods::NONE);
2329
2330 return std::pair(RegOp, OMod);
2331 }
2332 case AMDGPU::V_ADD_F64_e64:
2333 case AMDGPU::V_ADD_F64_pseudo_e64:
2334 case AMDGPU::V_ADD_F32_e64:
2335 case AMDGPU::V_ADD_F16_e64:
2336 case AMDGPU::V_ADD_F16_t16_e64:
2337 case AMDGPU::V_ADD_F16_fake16_e64: {
2338 // If output denormals are enabled, omod is ignored.
2339 if ((Op == AMDGPU::V_ADD_F32_e64 &&
2341 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2342 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2343 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2345 return std::pair(nullptr, SIOutMods::NONE);
2346
2347 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
2348 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2349 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2350
2351 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
2352 Src0->getSubReg() == Src1->getSubReg() &&
2353 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2354 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2355 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2356 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2357 return std::pair(Src0, SIOutMods::MUL2);
2358
2359 return std::pair(nullptr, SIOutMods::NONE);
2360 }
2361 default:
2362 return std::pair(nullptr, SIOutMods::NONE);
2363 }
2364}
2365
2366// FIXME: Does this need to check IEEE bit on function?
2367bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2368 const MachineOperand *RegOp;
2369 int OMod;
2370 std::tie(RegOp, OMod) = isOMod(MI);
2371 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
2372 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2373 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2374 return false;
2375
2376 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2377 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2378 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
2379 return false;
2380
2381 if (Def->mayRaiseFPException())
2382 return false;
2383
2384 // Clamp is applied after omod. If the source already has clamp set, don't
2385 // fold it.
2386 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2387 return false;
2388
2389 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2390
2391 DefOMod->setImm(OMod);
2392 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2393 // Kill flags can be wrong if we replaced a def inside a loop with a def
2394 // outside the loop.
2395 MRI->clearKillFlags(Def->getOperand(0).getReg());
2396 MI.eraseFromParent();
2397
2398 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
2399 // instruction, so we might as well convert it to the more flexible VOP3-only
2400 // mad/fma form.
2401 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
2402 Def->eraseFromParent();
2403
2404 return true;
2405}
2406
2407// Try to fold a reg_sequence with vgpr output and agpr inputs into an
2408// instruction which can take an agpr. So far that means a store.
2409bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2410 assert(MI.isRegSequence());
2411 auto Reg = MI.getOperand(0).getReg();
2412
2413 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
2414 !MRI->hasOneNonDBGUse(Reg))
2415 return false;
2416
2418 if (!getRegSeqInit(Defs, Reg))
2419 return false;
2420
2421 for (auto &[Op, SubIdx] : Defs) {
2422 if (!Op->isReg())
2423 return false;
2424 if (TRI->isAGPR(*MRI, Op->getReg()))
2425 continue;
2426 // Maybe this is a COPY from AREG
2427 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2428 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
2429 return false;
2430 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
2431 return false;
2432 }
2433
2434 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2435 MachineInstr *UseMI = Op->getParent();
2436 while (UseMI->isCopy() && !Op->getSubReg()) {
2437 Reg = UseMI->getOperand(0).getReg();
2438 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2439 return false;
2440 Op = &*MRI->use_nodbg_begin(Reg);
2441 UseMI = Op->getParent();
2442 }
2443
2444 if (Op->getSubReg())
2445 return false;
2446
2447 unsigned OpIdx = Op - &UseMI->getOperand(0);
2448 const MCInstrDesc &InstDesc = UseMI->getDesc();
2449 const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx);
2450 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2451 return false;
2452
2453 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2454 auto Dst = MRI->createVirtualRegister(NewDstRC);
2455 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2456 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2457
2458 for (auto &[Def, SubIdx] : Defs) {
2459 Def->setIsKill(false);
2460 if (TRI->isAGPR(*MRI, Def->getReg())) {
2461 RS.add(*Def);
2462 } else { // This is a copy
2463 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2464 SubDef->getOperand(1).setIsKill(false);
2465 RS.addReg(SubDef->getOperand(1).getReg(), {}, Def->getSubReg());
2466 }
2467 RS.addImm(SubIdx);
2468 }
2469
2470 Op->setReg(Dst);
2471 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
2472 Op->setReg(Reg);
2473 RS->eraseFromParent();
2474 return false;
2475 }
2476
2477 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
2478
2479 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
2480 // in which case we can erase them all later in runOnMachineFunction.
2481 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2482 MI.eraseFromParent();
2483 return true;
2484}
2485
2486/// Checks whether \p Copy is a AGPR -> VGPR copy. Returns `true` on success and
2487/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
2488static bool isAGPRCopy(const SIRegisterInfo &TRI,
2489 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
2490 Register &OutReg, unsigned &OutSubReg) {
2491 assert(Copy.isCopy());
2492
2493 const MachineOperand &CopySrc = Copy.getOperand(1);
2494 Register CopySrcReg = CopySrc.getReg();
2495 if (!CopySrcReg.isVirtual())
2496 return false;
2497
2498 // Common case: copy from AGPR directly, e.g.
2499 // %1:vgpr_32 = COPY %0:agpr_32
2500 if (TRI.isAGPR(MRI, CopySrcReg)) {
2501 OutReg = CopySrcReg;
2502 OutSubReg = CopySrc.getSubReg();
2503 return true;
2504 }
2505
2506 // Sometimes it can also involve two copies, e.g.
2507 // %1:vgpr_256 = COPY %0:agpr_256
2508 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2509 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2510 if (!CopySrcDef || !CopySrcDef->isCopy())
2511 return false;
2512
2513 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2514 Register OtherCopySrcReg = OtherCopySrc.getReg();
2515 if (!OtherCopySrcReg.isVirtual() ||
2516 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2517 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2518 !TRI.isAGPR(MRI, OtherCopySrcReg))
2519 return false;
2520
2521 OutReg = OtherCopySrcReg;
2522 OutSubReg = CopySrc.getSubReg();
2523 return true;
2524}
2525
2526// Try to hoist an AGPR to VGPR copy across a PHI.
2527// This should allow folding of an AGPR into a consumer which may support it.
2528//
2529// Example 1: LCSSA PHI
2530// loop:
2531// %1:vreg = COPY %0:areg
2532// exit:
2533// %2:vreg = PHI %1:vreg, %loop
2534// =>
2535// loop:
2536// exit:
2537// %1:areg = PHI %0:areg, %loop
2538// %2:vreg = COPY %1:areg
2539//
2540// Example 2: PHI with multiple incoming values:
2541// entry:
2542// %1:vreg = GLOBAL_LOAD(..)
2543// loop:
2544// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2545// %3:areg = COPY %2:vreg
2546// %4:areg = (instr using %3:areg)
2547// %5:vreg = COPY %4:areg
2548// =>
2549// entry:
2550// %1:vreg = GLOBAL_LOAD(..)
2551// %2:areg = COPY %1:vreg
2552// loop:
2553// %3:areg = PHI %2:areg, %entry, %X:areg,
2554// %4:areg = (instr using %3:areg)
2555bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2556 assert(PHI.isPHI());
2557
2558 Register PhiOut = PHI.getOperand(0).getReg();
2559 if (!TRI->isVGPR(*MRI, PhiOut))
2560 return false;
2561
2562 // Iterate once over all incoming values of the PHI to check if this PHI is
2563 // eligible, and determine the exact AGPR RC we'll target.
2564 const TargetRegisterClass *ARC = nullptr;
2565 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2566 MachineOperand &MO = PHI.getOperand(K);
2567 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2568 if (!Copy || !Copy->isCopy())
2569 continue;
2570
2571 Register AGPRSrc;
2572 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2573 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2574 continue;
2575
2576 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2577 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2578 CopyInRC = SubRC;
2579
2580 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2581 return false;
2582 ARC = CopyInRC;
2583 }
2584
2585 if (!ARC)
2586 return false;
2587
2588 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2589
2590 // Rewrite the PHI's incoming values to ARC.
2591 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2592 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2593 MachineOperand &MO = PHI.getOperand(K);
2594 Register Reg = MO.getReg();
2595
2597 MachineBasicBlock *InsertMBB = nullptr;
2598
2599 // Look at the def of Reg, ignoring all copies.
2600 unsigned CopyOpc = AMDGPU::COPY;
2601 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2602
2603 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2604 // the copy was single-use, it will be removed by DCE later.
2605 if (Def->isCopy()) {
2606 Register AGPRSrc;
2607 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2608 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2609 MO.setReg(AGPRSrc);
2610 MO.setSubReg(AGPRSubReg);
2611 continue;
2612 }
2613
2614 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2615 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2616 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2617 // is unlikely to be profitable.
2618 //
2619 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2620 MachineOperand &CopyIn = Def->getOperand(1);
2621 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2622 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2623 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2624 }
2625
2626 InsertMBB = Def->getParent();
2627 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2628 } else {
2629 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2630 InsertPt = InsertMBB->getFirstTerminator();
2631 }
2632
2633 Register NewReg = MRI->createVirtualRegister(ARC);
2634 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2635 TII->get(CopyOpc), NewReg)
2636 .addReg(Reg);
2637 MO.setReg(NewReg);
2638
2639 (void)MI;
2640 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2641 }
2642
2643 // Replace the PHI's result with a new register.
2644 Register NewReg = MRI->createVirtualRegister(ARC);
2645 PHI.getOperand(0).setReg(NewReg);
2646
2647 // COPY that new register back to the original PhiOut register. This COPY will
2648 // usually be folded out later.
2649 MachineBasicBlock *MBB = PHI.getParent();
2650 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2651 TII->get(AMDGPU::COPY), PhiOut)
2652 .addReg(NewReg);
2653
2654 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2655 return true;
2656}
2657
2658// Attempt to convert VGPR load to an AGPR load.
2659bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2660 assert(MI.mayLoad());
2661 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2662 return false;
2663
2664 MachineOperand &Def = MI.getOperand(0);
2665 if (!Def.isDef())
2666 return false;
2667
2668 Register DefReg = Def.getReg();
2669
2670 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2671 return false;
2672
2675 SmallVector<Register, 8> MoveRegs;
2676
2677 if (Users.empty())
2678 return false;
2679
2680 // Check that all uses a copy to an agpr or a reg_sequence producing an agpr.
2681 while (!Users.empty()) {
2682 const MachineInstr *I = Users.pop_back_val();
2683 if (!I->isCopy() && !I->isRegSequence())
2684 return false;
2685 Register DstReg = I->getOperand(0).getReg();
2686 // Physical registers may have more than one instruction definitions
2687 if (DstReg.isPhysical())
2688 return false;
2689 if (TRI->isAGPR(*MRI, DstReg))
2690 continue;
2691 MoveRegs.push_back(DstReg);
2692 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2693 Users.push_back(&U);
2694 }
2695
2696 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2697 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2698 if (!TII->isOperandLegal(MI, 0, &Def)) {
2699 MRI->setRegClass(DefReg, RC);
2700 return false;
2701 }
2702
2703 while (!MoveRegs.empty()) {
2704 Register Reg = MoveRegs.pop_back_val();
2705 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2706 }
2707
2708 LLVM_DEBUG(dbgs() << "Folded " << MI);
2709
2710 return true;
2711}
2712
2713// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2714// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2715// there's cases where it can create a lot more AGPR-AGPR copies, which are
2716// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2717//
2718// This function looks at all AGPR PHIs in a basic block and collects their
2719// operands. Then, it checks for register that are used more than once across
2720// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2721// having to create one VGPR temporary per use, which can get very messy if
2722// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2723// element).
2724//
2725// Example
2726// a:
2727// %in:agpr_256 = COPY %foo:vgpr_256
2728// c:
2729// %x:agpr_32 = ..
2730// b:
2731// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2732// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2733// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2734// =>
2735// a:
2736// %in:agpr_256 = COPY %foo:vgpr_256
2737// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2738// %tmp_agpr:agpr_32 = COPY %tmp
2739// c:
2740// %x:agpr_32 = ..
2741// b:
2742// %0:areg = PHI %tmp_agpr, %a, %x, %c
2743// %1:areg = PHI %tmp_agpr, %a, %y, %c
2744// %2:areg = PHI %tmp_agpr, %a, %z, %c
2745bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2746 // This is only really needed on GFX908 where AGPR-AGPR copies are
2747 // unreasonably difficult.
2748 if (ST->hasGFX90AInsts())
2749 return false;
2750
2751 // Look at all AGPR Phis and collect the register + subregister used.
2752 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2753 RegToMO;
2754
2755 for (auto &MI : MBB) {
2756 if (!MI.isPHI())
2757 break;
2758
2759 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2760 continue;
2761
2762 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2763 MachineOperand &PhiMO = MI.getOperand(K);
2764 if (!PhiMO.getSubReg())
2765 continue;
2766 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2767 }
2768 }
2769
2770 // For all (Reg, SubReg) pair that are used more than once, cache the value in
2771 // a VGPR.
2772 bool Changed = false;
2773 for (const auto &[Entry, MOs] : RegToMO) {
2774 if (MOs.size() == 1)
2775 continue;
2776
2777 const auto [Reg, SubReg] = Entry;
2778 MachineInstr *Def = MRI->getVRegDef(Reg);
2779 MachineBasicBlock *DefMBB = Def->getParent();
2780
2781 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2782 // out.
2783 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2784 Register TempVGPR =
2785 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2786 MachineInstr *VGPRCopy =
2787 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2788 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2789 .addReg(Reg, /* flags */ {}, SubReg);
2790
2791 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2792 Register TempAGPR = MRI->createVirtualRegister(ARC);
2793 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2794 TII->get(AMDGPU::COPY), TempAGPR)
2795 .addReg(TempVGPR);
2796
2797 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2798 for (MachineOperand *MO : MOs) {
2799 MO->setReg(TempAGPR);
2800 MO->setSubReg(AMDGPU::NoSubRegister);
2801 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2802 }
2803
2804 Changed = true;
2805 }
2806
2807 return Changed;
2808}
2809
2810bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2811 this->MF = &MF;
2812 MRI = &MF.getRegInfo();
2813 ST = &MF.getSubtarget<GCNSubtarget>();
2814 TII = ST->getInstrInfo();
2815 TRI = &TII->getRegisterInfo();
2816 MFI = MF.getInfo<SIMachineFunctionInfo>();
2817
2818 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2819 // correctly handle signed zeros.
2820 //
2821 // FIXME: Also need to check strictfp
2822 bool IsIEEEMode = MFI->getMode().IEEE;
2823
2824 bool Changed = false;
2825 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2826 MachineOperand *CurrentKnownM0Val = nullptr;
2827 for (auto &MI : make_early_inc_range(*MBB)) {
2828 Changed |= tryFoldCndMask(MI);
2829
2830 if (tryFoldZeroHighBits(MI)) {
2831 Changed = true;
2832 continue;
2833 }
2834
2835 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2836 Changed = true;
2837 continue;
2838 }
2839
2840 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2841 Changed = true;
2842 continue;
2843 }
2844
2845 if (MI.mayLoad() && tryFoldLoad(MI)) {
2846 Changed = true;
2847 continue;
2848 }
2849
2850 if (TII->isFoldableCopy(MI)) {
2851 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2852 continue;
2853 }
2854
2855 // Saw an unknown clobber of m0, so we no longer know what it is.
2856 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2857 CurrentKnownM0Val = nullptr;
2858
2859 // TODO: Omod might be OK if there is NSZ only on the source
2860 // instruction, and not the omod multiply.
2861 if (IsIEEEMode || !MI.getFlag(MachineInstr::FmNsz) || !tryFoldOMod(MI))
2862 Changed |= tryFoldClamp(MI);
2863 }
2864
2865 Changed |= tryOptimizeAGPRPhis(*MBB);
2866 }
2867
2868 return Changed;
2869}
2870
2873 MFPropsModifier _(*this, MF);
2874
2875 bool Changed = SIFoldOperandsImpl().run(MF);
2876 if (!Changed) {
2877 return PreservedAnalyses::all();
2878 }
2880 PA.preserveSet<CFGAnalyses>();
2881 return PA;
2882}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat)
Updates the operand at Idx in instruction Inst with the result of instruction Mat.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
#define _
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static unsigned macToMad(unsigned Opc)
static bool isAGPRCopy(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, const MachineInstr &Copy, Register &OutReg, unsigned &OutSubReg)
Checks whether Copy is an AGPR -> VGPR copy.
static void appendFoldCandidate(SmallVectorImpl< FoldCandidate > &FoldList, FoldCandidate &&Entry)
static const TargetRegisterClass * getRegOpRC(const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const MachineOperand &MO)
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, uint32_t LHS, uint32_t RHS)
static int getOModValue(unsigned Opc, int64_t Val)
static unsigned getMovOpc(bool IsScalar)
static MachineOperand * lookUpCopyChain(const SIInstrInfo &TII, const MachineRegisterInfo &MRI, Register SrcReg)
static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(const FoldableDef &OpToFold)
static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand(const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo)
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static int Lookup(ArrayRef< TableEntry > Table, unsigned Opcode)
Value * RHS
Value * LHS
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:275
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool hasDOTOpSelHazard() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
const HexagonRegisterInfo & getRegisterInfo() const
ArrayRef< MCOperandInfo > operands() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
bool isVariadic() const
Return true if this instruction can have a variable number of operands.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
uint8_t OperandType
Information about the type of the operand.
Definition MCInstrDesc.h:98
An RAII based helper class to modify MachineFunctionProperties when running pass.
LLVM_ABI iterator SkipPHIsLabelsAndDebug(iterator I, Register Reg=Register(), bool SkipPseudoOp=true)
Return the first instruction in MBB after I that is not a PHI, label or debug.
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
LivenessQueryResult
Possible outcome of a register liveness query to computeRegisterLiveness()
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
void clearFlag(MIFlag Flag)
clearFlag - Clear a MI flag.
bool isRegSequence() const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
LLVM_ABI void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &)
substVirtReg - Substitute the current register with the virtual subregister Reg:SubReg.
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
LLVM_ABI void substPhysReg(MCRegister Reg, const TargetRegisterInfo &)
substPhysReg - Substitute the current register with the physical register Reg, taking any existing Su...
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_GlobalAddress
Address of a global value.
@ MO_FrameIndex
Abstract Stack Frame Index.
@ MO_Register
Register operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
use_nodbg_iterator use_nodbg_begin(Register RegNo) const
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI bool hasOneNonDBGUser(Register RegNo) const
hasOneNonDBGUser - Return true if there is exactly one non-Debug instruction using the specified regis...
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
void setRegAllocationHint(Register VReg, unsigned Type, Register PrefReg)
setRegAllocationHint - Specify a register allocation hint for the specified virtual register.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool hasSameClamp(const MachineInstr &A, const MachineInstr &B)
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
SIModeRegisterDefaults getMode() const
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
Register getReg() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
static const unsigned CommuteAnyOperandIndex
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
IteratorT begin() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType)
LLVM_READONLY int32_t getMFMAEarlyClobberOp(uint32_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
@ OPERAND_REG_IMM_V2FP16
Definition SIDefines.h:423
@ OPERAND_REG_INLINE_C_FP64
Definition SIDefines.h:439
@ OPERAND_REG_INLINE_C_V2BF16
Definition SIDefines.h:441
@ OPERAND_REG_IMM_V2INT16
Definition SIDefines.h:425
@ OPERAND_REG_IMM_V2BF16
Definition SIDefines.h:422
@ OPERAND_REG_INLINE_C_INT64
Definition SIDefines.h:435
@ OPERAND_REG_IMM_NOINLINE_V2FP16
Definition SIDefines.h:427
@ OPERAND_REG_INLINE_C_V2FP16
Definition SIDefines.h:442
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
Definition SIDefines.h:453
@ OPERAND_REG_INLINE_AC_FP32
Definition SIDefines.h:454
@ OPERAND_REG_INLINE_C_FP32
Definition SIDefines.h:438
@ OPERAND_REG_INLINE_C_INT32
Definition SIDefines.h:434
@ OPERAND_REG_INLINE_C_V2INT16
Definition SIDefines.h:440
@ OPERAND_REG_IMM_V2FP32
Definition SIDefines.h:429
@ OPERAND_REG_INLINE_AC_FP64
Definition SIDefines.h:455
LLVM_READONLY int32_t getFlatScratchInstSSfromSV(uint32_t Opcode)
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineBasicBlock::instr_iterator getBundleStart(MachineBasicBlock::instr_iterator I)
Returns an iterator to the first instruction in the bundle containing I.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
FunctionPass * createSIFoldOperandsLegacyPass()
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
char & SIFoldOperandsLegacyID
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
iterator_range< df_iterator< T > > depth_first(const T &G)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.