LLVM 20.0.0git
SILowerI1Copies.cpp
Go to the documentation of this file.
1//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
10// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
11// form and a wave-level control flow graph.
12//
13// Before this pass, values that are semantically i1 and are defined and used
14// within the same basic block are already represented as lane masks in scalar
15// registers. However, values that cross basic blocks are always transferred
16// between basic blocks in vreg_1 virtual registers and are lowered by this
17// pass.
18//
19// The only instructions that use or define vreg_1 virtual registers are COPY,
20// PHI, and IMPLICIT_DEF.
21//
22//===----------------------------------------------------------------------===//
23
24#include "SILowerI1Copies.h"
25#include "AMDGPU.h"
29
30#define DEBUG_TYPE "si-i1-copies"
31
32using namespace llvm;
33
34static Register
36 MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);
37
38namespace {
39
40class Vreg1LoweringHelper : public PhiLoweringHelper {
41public:
42 Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
44
45private:
46 DenseSet<Register> ConstrainRegs;
47
48public:
49 void markAsLaneMask(Register DstReg) const override;
51 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
53 const MachineInstr *MI,
54 SmallVectorImpl<Incoming> &Incomings) const override;
55 void replaceDstReg(Register NewReg, Register OldReg,
56 MachineBasicBlock *MBB) override;
59 Register DstReg, Register PrevReg,
60 Register CurReg) override;
61 void constrainAsLaneMask(Incoming &In) override;
62
63 bool lowerCopiesFromI1();
64 bool lowerCopiesToI1();
65 bool cleanConstrainRegs(bool Changed);
66 bool isVreg1(Register Reg) const {
67 return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
68 }
69};
70
71Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
74 : PhiLoweringHelper(MF, DT, PDT) {}
75
76bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
77 assert(Changed || ConstrainRegs.empty());
78 for (Register Reg : ConstrainRegs)
79 MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
80 ConstrainRegs.clear();
81
82 return Changed;
83}
84
85/// Helper class that determines the relationship between incoming values of a
86/// phi in the control flow graph to determine where an incoming value can
87/// simply be taken as a scalar lane mask as-is, and where it needs to be
88/// merged with another, previously defined lane mask.
89///
90/// The approach is as follows:
91/// - Determine all basic blocks which, starting from the incoming blocks,
92/// a wave may reach before entering the def block (the block containing the
93/// phi).
94/// - If an incoming block has no predecessors in this set, we can take the
95/// incoming value as a scalar lane mask as-is.
96/// -- A special case of this is when the def block has a self-loop.
97/// - Otherwise, the incoming value needs to be merged with a previously
98/// defined lane mask.
99/// - If there is a path into the set of reachable blocks that does _not_ go
100/// through an incoming block where we can take the scalar lane mask as-is,
101/// we need to invent an available value for the SSAUpdater. Choices are
102/// 0 and undef, with differing consequences for how to merge values etc.
103///
104/// TODO: We could use region analysis to quickly skip over SESE regions during
105/// the traversal.
106///
107class PhiIncomingAnalysis {
109 const SIInstrInfo *TII;
110
111 // For each reachable basic block, whether it is a source in the induced
112 // subgraph of the CFG.
117
118public:
119 PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
120 : PDT(PDT), TII(TII) {}
121
122 /// Returns whether \p MBB is a source in the induced subgraph of reachable
123 /// blocks.
124 bool isSource(MachineBasicBlock &MBB) const {
125 return ReachableMap.find(&MBB)->second;
126 }
127
128 ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
129
130 void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
131 assert(Stack.empty());
132 ReachableMap.clear();
133 ReachableOrdered.clear();
134 Predecessors.clear();
135
136 // Insert the def block first, so that it acts as an end point for the
137 // traversal.
138 ReachableMap.try_emplace(&DefBlock, false);
139 ReachableOrdered.push_back(&DefBlock);
140
141 for (auto Incoming : Incomings) {
143 if (MBB == &DefBlock) {
144 ReachableMap[&DefBlock] = true; // self-loop on DefBlock
145 continue;
146 }
147
148 ReachableMap.try_emplace(MBB, false);
149 ReachableOrdered.push_back(MBB);
150
151 // If this block has a divergent terminator and the def block is its
152 // post-dominator, the wave may first visit the other successors.
153 if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
154 append_range(Stack, MBB->successors());
155 }
156
157 while (!Stack.empty()) {
158 MachineBasicBlock *MBB = Stack.pop_back_val();
159 if (!ReachableMap.try_emplace(MBB, false).second)
160 continue;
161 ReachableOrdered.push_back(MBB);
162
163 append_range(Stack, MBB->successors());
164 }
165
166 for (MachineBasicBlock *MBB : ReachableOrdered) {
167 bool HaveReachablePred = false;
168 for (MachineBasicBlock *Pred : MBB->predecessors()) {
169 if (ReachableMap.count(Pred)) {
170 HaveReachablePred = true;
171 } else {
172 Stack.push_back(Pred);
173 }
174 }
175 if (!HaveReachablePred)
176 ReachableMap[MBB] = true;
177 if (HaveReachablePred) {
178 for (MachineBasicBlock *UnreachablePred : Stack) {
179 if (!llvm::is_contained(Predecessors, UnreachablePred))
180 Predecessors.push_back(UnreachablePred);
181 }
182 }
183 Stack.clear();
184 }
185 }
186};
187
188/// Helper class that detects loops which require us to lower an i1 COPY into
189/// bitwise manipulation.
190///
191/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
192/// between loops with the same header. Consider this example:
193///
194/// A-+-+
195/// | | |
196/// B-+ |
197/// | |
198/// C---+
199///
200/// A is the header of a loop containing A, B, and C as far as LoopInfo is
201/// concerned. However, an i1 COPY in B that is used in C must be lowered to
202/// bitwise operations to combine results from different loop iterations when
203/// B has a divergent branch (since by default we will compile this code such
204/// that threads in a wave are merged at the entry of C).
205///
206/// The following rule is implemented to determine whether bitwise operations
207/// are required: use the bitwise lowering for a def in block B if a backward
208/// edge to B is reachable without going through the nearest common
209/// post-dominator of B and all uses of the def.
210///
211/// TODO: This rule is conservative because it does not check whether the
212/// relevant branches are actually divergent.
213///
214/// The class is designed to cache the CFG traversal so that it can be re-used
215/// for multiple defs within the same basic block.
216///
217/// TODO: We could use region analysis to quickly skip over SESE regions during
218/// the traversal.
219///
220class LoopFinder {
223
224 // All visited / reachable block, tagged by level (level 0 is the def block,
225 // level 1 are all blocks reachable including but not going through the def
226 // block's IPDOM, etc.).
228
229 // Nearest common dominator of all visited blocks by level (level 0 is the
230 // def block). Used for seeding the SSAUpdater.
232
233 // Post-dominator of all visited blocks.
234 MachineBasicBlock *VisitedPostDom = nullptr;
235
236 // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
237 // reachable without going through the IPDOM of the def block (if the IPDOM
238 // itself has an edge to the def block, the loop level is 2), etc.
239 unsigned FoundLoopLevel = ~0u;
240
241 MachineBasicBlock *DefBlock = nullptr;
244
245public:
247 : DT(DT), PDT(PDT) {}
248
250 Visited.clear();
251 CommonDominators.clear();
252 Stack.clear();
253 NextLevel.clear();
254 VisitedPostDom = nullptr;
255 FoundLoopLevel = ~0u;
256
257 DefBlock = &MBB;
258 }
259
260 /// Check whether a backward edge can be reached without going through the
261 /// given \p PostDom of the def block.
262 ///
263 /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
264 unsigned findLoop(MachineBasicBlock *PostDom) {
265 MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);
266
267 if (!VisitedPostDom)
268 advanceLevel();
269
270 unsigned Level = 0;
271 while (PDNode->getBlock() != PostDom) {
272 if (PDNode->getBlock() == VisitedPostDom)
273 advanceLevel();
274 PDNode = PDNode->getIDom();
275 Level++;
276 if (FoundLoopLevel == Level)
277 return Level;
278 }
279
280 return 0;
281 }
282
283 /// Add undef values dominating the loop and the optionally given additional
284 /// blocks, so that the SSA updater doesn't have to search all the way to the
285 /// function entry.
286 void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
288 MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
289 ArrayRef<Incoming> Incomings = {}) {
290 assert(LoopLevel < CommonDominators.size());
291
292 MachineBasicBlock *Dom = CommonDominators[LoopLevel];
293 for (auto &Incoming : Incomings)
295
296 if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
298 Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
299 } else {
300 // The dominator is part of the loop or the given blocks, so add the
301 // undef value to unreachable predecessors instead.
302 for (MachineBasicBlock *Pred : Dom->predecessors()) {
303 if (!inLoopLevel(*Pred, LoopLevel, Incomings))
305 Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
306 }
307 }
308 }
309
310private:
311 bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
312 ArrayRef<Incoming> Incomings) const {
313 auto DomIt = Visited.find(&MBB);
314 if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
315 return true;
316
317 for (auto &Incoming : Incomings)
318 if (Incoming.Block == &MBB)
319 return true;
320
321 return false;
322 }
323
324 void advanceLevel() {
325 MachineBasicBlock *VisitedDom;
326
327 if (!VisitedPostDom) {
328 VisitedPostDom = DefBlock;
329 VisitedDom = DefBlock;
330 Stack.push_back(DefBlock);
331 } else {
332 VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
333 VisitedDom = CommonDominators.back();
334
335 for (unsigned i = 0; i < NextLevel.size();) {
336 if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
337 Stack.push_back(NextLevel[i]);
338
339 NextLevel[i] = NextLevel.back();
340 NextLevel.pop_back();
341 } else {
342 i++;
343 }
344 }
345 }
346
347 unsigned Level = CommonDominators.size();
348 while (!Stack.empty()) {
349 MachineBasicBlock *MBB = Stack.pop_back_val();
350 if (!PDT.dominates(VisitedPostDom, MBB))
351 NextLevel.push_back(MBB);
352
353 Visited[MBB] = Level;
354 VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);
355
356 for (MachineBasicBlock *Succ : MBB->successors()) {
357 if (Succ == DefBlock) {
358 if (MBB == VisitedPostDom)
359 FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
360 else
361 FoundLoopLevel = std::min(FoundLoopLevel, Level);
362 continue;
363 }
364
365 if (Visited.try_emplace(Succ, ~0u).second) {
366 if (MBB == VisitedPostDom)
367 NextLevel.push_back(Succ);
368 else
369 Stack.push_back(Succ);
370 }
371 }
372 }
373
374 CommonDominators.push_back(VisitedDom);
375 }
376};
377
378} // End anonymous namespace.
379
382 MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
383 return MRI->createVirtualRegister(LaneMaskRegAttrs);
384}
385
386static Register
388 MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
389 MachineFunction &MF = *MBB->getParent();
390 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
391 const SIInstrInfo *TII = ST.getInstrInfo();
392 Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
393 BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
394 UndefReg);
395 return UndefReg;
396}
397
398#ifndef NDEBUG
401 Register Reg) {
402 unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
403 return Size == 1 || Size == 32;
404}
405#endif
406
407bool Vreg1LoweringHelper::lowerCopiesFromI1() {
408 bool Changed = false;
410
411 for (MachineBasicBlock &MBB : *MF) {
412 for (MachineInstr &MI : MBB) {
413 if (MI.getOpcode() != AMDGPU::COPY)
414 continue;
415
416 Register DstReg = MI.getOperand(0).getReg();
417 Register SrcReg = MI.getOperand(1).getReg();
418 if (!isVreg1(SrcReg))
419 continue;
420
421 if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
422 continue;
423
424 Changed = true;
425
426 // Copy into a 32-bit vector register.
427 LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
428 DebugLoc DL = MI.getDebugLoc();
429
430 assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
431 assert(!MI.getOperand(0).getSubReg());
432
433 ConstrainRegs.insert(SrcReg);
434 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
435 .addImm(0)
436 .addImm(0)
437 .addImm(0)
438 .addImm(-1)
439 .addReg(SrcReg);
440 DeadCopies.push_back(&MI);
441 }
442
443 for (MachineInstr *MI : DeadCopies)
444 MI->eraseFromParent();
445 DeadCopies.clear();
446 }
447 return Changed;
448}
449
453 : MF(MF), DT(DT), PDT(PDT) {
454 MRI = &MF->getRegInfo();
455
457 TII = ST->getInstrInfo();
458 IsWave32 = ST->isWave32();
459
460 if (IsWave32) {
461 ExecReg = AMDGPU::EXEC_LO;
462 MovOp = AMDGPU::S_MOV_B32;
463 AndOp = AMDGPU::S_AND_B32;
464 OrOp = AMDGPU::S_OR_B32;
465 XorOp = AMDGPU::S_XOR_B32;
466 AndN2Op = AMDGPU::S_ANDN2_B32;
467 OrN2Op = AMDGPU::S_ORN2_B32;
468 } else {
469 ExecReg = AMDGPU::EXEC;
470 MovOp = AMDGPU::S_MOV_B64;
471 AndOp = AMDGPU::S_AND_B64;
472 OrOp = AMDGPU::S_OR_B64;
473 XorOp = AMDGPU::S_XOR_B64;
474 AndN2Op = AMDGPU::S_ANDN2_B64;
475 OrN2Op = AMDGPU::S_ORN2_B64;
476 }
477}
478
481 LoopFinder LF(*DT, *PDT);
482 PhiIncomingAnalysis PIA(*PDT, TII);
484 SmallVector<Incoming, 4> Incomings;
485
486 getCandidatesForLowering(Vreg1Phis);
487 if (Vreg1Phis.empty())
488 return false;
489
491 MachineBasicBlock *PrevMBB = nullptr;
492 for (MachineInstr *MI : Vreg1Phis) {
493 MachineBasicBlock &MBB = *MI->getParent();
494 if (&MBB != PrevMBB) {
495 LF.initialize(MBB);
496 PrevMBB = &MBB;
497 }
498
499 LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);
500
501 Register DstReg = MI->getOperand(0).getReg();
502 markAsLaneMask(DstReg);
504
506
507 // Sort the incomings such that incoming values that dominate other incoming
508 // values are sorted earlier. This allows us to do some amount of on-the-fly
509 // constant folding.
510 // Incoming with smaller DFSNumIn goes first, DFSNumIn is 0 for entry block.
511 llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
512 return DT->getNode(LHS.Block)->getDFSNumIn() <
513 DT->getNode(RHS.Block)->getDFSNumIn();
514 });
515
516#ifndef NDEBUG
517 PhiRegisters.insert(DstReg);
518#endif
519
520 // Phis in a loop that are observed outside the loop receive a simple but
521 // conservatively correct treatment.
522 std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
523 for (MachineInstr &Use : MRI->use_instructions(DstReg))
524 DomBlocks.push_back(Use.getParent());
525
526 MachineBasicBlock *PostDomBound =
528
529 // FIXME: This fails to find irreducible cycles. If we have a def (other
530 // than a constant) in a pair of blocks that end up looping back to each
531 // other, it will be mishandle. Due to structurization this shouldn't occur
532 // in practice.
533 unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
534
535 SSAUpdater.Initialize(DstReg);
536
537 if (FoundLoopLevel) {
538 LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
539 Incomings);
540
541 for (auto &Incoming : Incomings) {
544 }
545
546 for (auto &Incoming : Incomings) {
551 }
552 } else {
553 // The phi is not observed from outside a loop. Use a more accurate
554 // lowering.
555 PIA.analyze(MBB, Incomings);
556
557 for (MachineBasicBlock *MBB : PIA.predecessors())
560
561 for (auto &Incoming : Incomings) {
563 if (PIA.isSource(IMBB)) {
566 } else {
569 }
570 }
571
572 for (auto &Incoming : Incomings) {
574 continue;
575
580 }
581 }
582
584 if (NewReg != DstReg) {
585 replaceDstReg(NewReg, DstReg, &MBB);
586 MI->eraseFromParent();
587 }
588
589 Incomings.clear();
590 }
591 return true;
592}
593
594bool Vreg1LoweringHelper::lowerCopiesToI1() {
595 bool Changed = false;
597 LoopFinder LF(*DT, *PDT);
599
600 for (MachineBasicBlock &MBB : *MF) {
601 LF.initialize(MBB);
602
603 for (MachineInstr &MI : MBB) {
604 if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
605 MI.getOpcode() != AMDGPU::COPY)
606 continue;
607
608 Register DstReg = MI.getOperand(0).getReg();
609 if (!isVreg1(DstReg))
610 continue;
611
612 Changed = true;
613
614 if (MRI->use_empty(DstReg)) {
615 DeadCopies.push_back(&MI);
616 continue;
617 }
618
619 LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
620
621 markAsLaneMask(DstReg);
622 initializeLaneMaskRegisterAttributes(DstReg);
623
624 if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
625 continue;
626
627 DebugLoc DL = MI.getDebugLoc();
628 Register SrcReg = MI.getOperand(1).getReg();
629 assert(!MI.getOperand(1).getSubReg());
630
631 if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
632 assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
633 Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
634 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
635 .addReg(SrcReg)
636 .addImm(0);
637 MI.getOperand(1).setReg(TmpReg);
638 SrcReg = TmpReg;
639 } else {
640 // SrcReg needs to be live beyond copy.
641 MI.getOperand(1).setIsKill(false);
642 }
643
644 // Defs in a loop that are observed outside the loop must be transformed
645 // into appropriate bit manipulation.
646 std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
647 for (MachineInstr &Use : MRI->use_instructions(DstReg))
648 DomBlocks.push_back(Use.getParent());
649
650 MachineBasicBlock *PostDomBound =
651 PDT->findNearestCommonDominator(DomBlocks);
652 unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
653 if (FoundLoopLevel) {
654 SSAUpdater.Initialize(DstReg);
656 LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);
657
658 buildMergeLaneMasks(MBB, MI, DL, DstReg,
660 DeadCopies.push_back(&MI);
661 }
662 }
663
664 for (MachineInstr *MI : DeadCopies)
665 MI->eraseFromParent();
666 DeadCopies.clear();
667 }
668 return Changed;
669}
670
672 const MachineInstr *MI;
673 for (;;) {
674 MI = MRI->getUniqueVRegDef(Reg);
675 if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
676 return true;
677
678 if (MI->getOpcode() != AMDGPU::COPY)
679 break;
680
681 Reg = MI->getOperand(1).getReg();
682 if (!Reg.isVirtual())
683 return false;
684 if (!isLaneMaskReg(Reg))
685 return false;
686 }
687
688 if (MI->getOpcode() != MovOp)
689 return false;
690
691 if (!MI->getOperand(1).isImm())
692 return false;
693
694 int64_t Imm = MI->getOperand(1).getImm();
695 if (Imm == 0) {
696 Val = false;
697 return true;
698 }
699 if (Imm == -1) {
700 Val = true;
701 return true;
702 }
703
704 return false;
705}
706
707static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
708 Def = false;
709 Use = false;
710
711 for (const MachineOperand &MO : MI.operands()) {
712 if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
713 if (MO.isUse())
714 Use = true;
715 else
716 Def = true;
717 }
718 }
719}
720
721/// Return a point at the end of the given \p MBB to insert SALU instructions
722/// for lane mask calculation. Take terminators and SCC into account.
725 auto InsertionPt = MBB.getFirstTerminator();
726 bool TerminatorsUseSCC = false;
727 for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
728 bool DefsSCC;
729 instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
730 if (TerminatorsUseSCC || DefsSCC)
731 break;
732 }
733
734 if (!TerminatorsUseSCC)
735 return InsertionPt;
736
737 while (InsertionPt != MBB.begin()) {
738 InsertionPt--;
739
740 bool DefSCC, UseSCC;
741 instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
742 if (DefSCC)
743 return InsertionPt;
744 }
745
746 // We should have at least seen an IMPLICIT_DEF or COPY
747 llvm_unreachable("SCC used by terminator but no def in block");
748}
749
750// VReg_1 -> SReg_32 or SReg_64
751void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
752 MRI->setRegClass(DstReg, ST->getBoolRC());
753}
754
755void Vreg1LoweringHelper::getCandidatesForLowering(
756 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
757 for (MachineBasicBlock &MBB : *MF) {
758 for (MachineInstr &MI : MBB.phis()) {
759 if (isVreg1(MI.getOperand(0).getReg()))
760 Vreg1Phis.push_back(&MI);
761 }
762 }
763}
764
765void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
766 const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
767 for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
768 assert(i + 1 < MI->getNumOperands());
769 Register IncomingReg = MI->getOperand(i).getReg();
770 MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
771 MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
772
773 if (IncomingDef->getOpcode() == AMDGPU::COPY) {
774 IncomingReg = IncomingDef->getOperand(1).getReg();
775 assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
776 assert(!IncomingDef->getOperand(1).getSubReg());
777 } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
778 continue;
779 } else {
780 assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
781 }
782
783 Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
784 }
785}
786
787void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
789 MRI->replaceRegWith(NewReg, OldReg);
790}
791
792void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
794 const DebugLoc &DL,
795 Register DstReg, Register PrevReg,
796 Register CurReg) {
797 bool PrevVal = false;
798 bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
799 bool CurVal = false;
800 bool CurConstant = isConstantLaneMask(CurReg, CurVal);
801
802 if (PrevConstant && CurConstant) {
803 if (PrevVal == CurVal) {
804 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
805 } else if (CurVal) {
806 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
807 } else {
808 BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
809 .addReg(ExecReg)
810 .addImm(-1);
811 }
812 return;
813 }
814
815 Register PrevMaskedReg;
816 Register CurMaskedReg;
817 if (!PrevConstant) {
818 if (CurConstant && CurVal) {
819 PrevMaskedReg = PrevReg;
820 } else {
821 PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
822 BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
823 .addReg(PrevReg)
824 .addReg(ExecReg);
825 }
826 }
827 if (!CurConstant) {
828 // TODO: check whether CurReg is already masked by EXEC
829 if (PrevConstant && PrevVal) {
830 CurMaskedReg = CurReg;
831 } else {
832 CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
833 BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
834 .addReg(CurReg)
835 .addReg(ExecReg);
836 }
837 }
838
839 if (PrevConstant && !PrevVal) {
840 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
841 .addReg(CurMaskedReg);
842 } else if (CurConstant && !CurVal) {
843 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
844 .addReg(PrevMaskedReg);
845 } else if (PrevConstant && PrevVal) {
846 BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
847 .addReg(CurMaskedReg)
848 .addReg(ExecReg);
849 } else {
850 BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
851 .addReg(PrevMaskedReg)
852 .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
853 }
854}
855
856void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}
857
858/// Lower all instructions that def or use vreg_1 registers.
859///
860/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
861/// occur around inline assembly. We do this first, before vreg_1 registers
862/// are changed to scalar mask registers.
863///
864/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
865/// all others, because phi lowering looks through copies and can therefore
866/// often make copy lowering unnecessary.
869 // Only need to run this in SelectionDAG path.
870 if (MF.getProperties().hasProperty(
872 return false;
873
874 Vreg1LoweringHelper Helper(&MF, &MDT, &MPDT);
875 bool Changed = false;
876 Changed |= Helper.lowerCopiesFromI1();
877 Changed |= Helper.lowerPhis();
878 Changed |= Helper.lowerCopiesToI1();
879 return Helper.cleanConstrainRegs(Changed);
880}
881
888 bool Changed = runFixI1Copies(MF, MDT, MPDT);
889 if (!Changed)
890 return PreservedAnalyses::all();
891
892 // TODO: Probably preserves most.
895 return PA;
896}
897
899public:
900 static char ID;
901
904 }
905
906 bool runOnMachineFunction(MachineFunction &MF) override;
907
908 StringRef getPassName() const override { return "SI Lower i1 Copies"; }
909
910 void getAnalysisUsage(AnalysisUsage &AU) const override {
911 AU.setPreservesCFG();
915 }
916};
917
920 getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
922 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
923 return runFixI1Copies(MF, MDT, MPDT);
924}
925
927 false, false)
932
933char SILowerI1CopiesLegacy::ID = 0;
934
936
938 return new SILowerI1CopiesLegacy();
939}
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use)
static Register insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs)
SI Lower i1 Copies
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT, MachinePostDominatorTree &MPDT)
Lower all instructions that def or use vreg_1 registers.
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, Register Reg)
Interface definition of the PhiLoweringHelper class that implements lane mask merging algorithm for d...
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringLiteral > StandardNames)
Initialize the set of available library functions based on the specified target triple.
Value * RHS
Value * LHS
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:256
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:226
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Base class for the actual dominator tree node.
DomTreeNodeBase * getIDom() const
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
NodeT * getBlock() const
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:266
bool isWave32() const
iterator_range< iterator > phis()
Returns a range that iterates over the phis in the basic block.
void push_back(MachineInstr *MI)
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
Analysis pass which computes a MachineDominatorTree.
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineBasicBlock * findNearestCommonDominator(MachineBasicBlock *A, MachineBasicBlock *B)
findNearestCommonDominator - Find nearest common dominator basic block for basic block A and B.
MachineDomTreeNode * getNode(MachineBasicBlock *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
MachineDominatorTree & getBase()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
bool hasProperty(Property P) const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const MachineFunctionProperties & getProperties() const
Get the function properties.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineBasicBlock * findNearestCommonDominator(ArrayRef< MachineBasicBlock * > Blocks) const
Returns the nearest common dominator of the given blocks.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
iterator_range< use_instr_iterator > use_instructions(Register Reg) const
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
MachineSSAUpdater - This class updates SSA form for a set of virtual registers defined in multiple bl...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
PhiLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, MachinePostDominatorTree *PDT)
bool isLaneMaskReg(Register Reg) const
MachineRegisterInfo * MRI
MachineDominatorTree * DT
DenseSet< Register > PhiRegisters
virtual void getCandidatesForLowering(SmallVectorImpl< MachineInstr * > &Vreg1Phis) const =0
MachineFunction * MF
virtual void constrainAsLaneMask(Incoming &In)=0
virtual void collectIncomingValuesFromPhi(const MachineInstr *MI, SmallVectorImpl< Incoming > &Incomings) const =0
virtual void markAsLaneMask(Register DstReg) const =0
MachinePostDominatorTree * PDT
const GCNSubtarget * ST
const SIInstrInfo * TII
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs
MachineBasicBlock::iterator getSaluInsertionAtEnd(MachineBasicBlock &MBB) const
Return a point at the end of the given MBB to insert SALU instructions for lane mask calculation.
void initializeLaneMaskRegisterAttributes(Register LaneMask)
bool isConstantLaneMask(Register Reg, bool &Val) const
virtual void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg)=0
virtual void replaceDstReg(Register NewReg, Register OldReg, MachineBasicBlock *MBB)=0
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
Helper class for SSA formation on a set of values defined in multiple blocks.
Definition: SSAUpdater.h:40
void Initialize(Type *Ty, StringRef Name)
Reset this object to get ready for a new set of SSA updates with type 'Ty'.
Definition: SSAUpdater.cpp:53
Value * GetValueInMiddleOfBlock(BasicBlock *BB)
Construct SSA form, materializing a value that is live in the middle of the specified block.
Definition: SSAUpdater.cpp:98
void AddAvailableValue(BasicBlock *BB, Value *V)
Indicate that a rewritten value is available in the specified block with the specified value.
Definition: SSAUpdater.cpp:70
bool empty() const
Definition: SmallVector.h:95
size_t size() const
Definition: SmallVector.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:587
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:951
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Register createLaneMaskReg(MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs)
void initializeSILowerI1CopiesLegacyPass(PassRegistry &)
FunctionPass * createSILowerI1CopiesLegacyPass()
char & SILowerI1CopiesLegacyID
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
MachineBasicBlock * Block
Register UpdatedReg
All attributes(register class or bank and low-level type) a virtual register can have.