LLVM 23.0.0git
SILowerI1Copies.cpp
Go to the documentation of this file.
1//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
10// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
11// form and a wave-level control flow graph.
12//
13// Before this pass, values that are semantically i1 and are defined and used
14// within the same basic block are already represented as lane masks in scalar
15// registers. However, values that cross basic blocks are always transferred
16// between basic blocks in vreg_1 virtual registers and are lowered by this
17// pass.
18//
19// The only instructions that use or define vreg_1 virtual registers are COPY,
20// PHI, and IMPLICIT_DEF.
21//
22//===----------------------------------------------------------------------===//
23
24#include "SILowerI1Copies.h"
25#include "AMDGPU.h"
28
29#define DEBUG_TYPE "si-i1-copies"
30
31using namespace llvm;
32
33static Register
35 MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);
36
37namespace {
38
39class Vreg1LoweringHelper : public AMDGPU::PhiLoweringHelper {
40public:
41 Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
43
44private:
45 DenseSet<Register> ConstrainRegs;
46
47public:
48 void markAsLaneMask(Register DstReg) const override;
49 void getCandidatesForLowering(
50 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
51 void collectIncomingValuesFromPhi(
52 const MachineInstr *MI,
53 SmallVectorImpl<AMDGPU::Incoming> &Incomings) const override;
54 void replaceDstReg(Register NewReg, Register OldReg,
55 MachineBasicBlock *MBB) override;
56 void buildMergeLaneMasks(MachineBasicBlock &MBB,
58 Register DstReg, Register PrevReg,
59 Register CurReg) override;
60 void constrainAsLaneMask(AMDGPU::Incoming &In) override;
61
62 bool lowerCopiesFromI1();
63 bool lowerCopiesToI1();
64 bool cleanConstrainRegs(bool Changed);
65 bool isVreg1(Register Reg) const {
66 return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
67 }
68};
69
70Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
73 : PhiLoweringHelper(MF, DT, PDT) {}
74
75bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
76 assert(Changed || ConstrainRegs.empty());
77 for (Register Reg : ConstrainRegs)
78 MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass());
79 ConstrainRegs.clear();
80
81 return Changed;
82}
83
84/// Helper class that determines the relationship between incoming values of a
85/// phi in the control flow graph to determine where an incoming value can
86/// simply be taken as a scalar lane mask as-is, and where it needs to be
87/// merged with another, previously defined lane mask.
88///
89/// The approach is as follows:
90/// - Determine all basic blocks which, starting from the incoming blocks,
91/// a wave may reach before entering the def block (the block containing the
92/// phi).
93/// - If an incoming block has no predecessors in this set, we can take the
94/// incoming value as a scalar lane mask as-is.
95/// -- A special case of this is when the def block has a self-loop.
96/// - Otherwise, the incoming value needs to be merged with a previously
97/// defined lane mask.
98/// - If there is a path into the set of reachable blocks that does _not_ go
99/// through an incoming block where we can take the scalar lane mask as-is,
100/// we need to invent an available value for the SSAUpdater. Choices are
101/// 0 and undef, with differing consequences for how to merge values etc.
102///
103/// TODO: We could use region analysis to quickly skip over SESE regions during
104/// the traversal.
105///
106class PhiIncomingAnalysis {
107 MachinePostDominatorTree &PDT;
108 const SIInstrInfo *TII;
109
110 // For each reachable basic block, whether it is a source in the induced
111 // subgraph of the CFG.
112 MapVector<MachineBasicBlock *, bool> ReachableMap;
113 SmallVector<MachineBasicBlock *, 4> Stack;
114 SmallVector<MachineBasicBlock *, 4> Predecessors;
115
116public:
117 PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
118 : PDT(PDT), TII(TII) {}
119
120 /// Returns whether \p MBB is a source in the induced subgraph of reachable
121 /// blocks.
122 bool isSource(MachineBasicBlock &MBB) const {
123 return ReachableMap.find(&MBB)->second;
124 }
125
126 ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
127
128 void analyze(MachineBasicBlock &DefBlock,
129 ArrayRef<AMDGPU::Incoming> Incomings) {
130 assert(Stack.empty());
131 ReachableMap.clear();
132 Predecessors.clear();
133
134 // Insert the def block first, so that it acts as an end point for the
135 // traversal.
136 ReachableMap.try_emplace(&DefBlock, false);
137
138 for (auto Incoming : Incomings) {
139 MachineBasicBlock *MBB = Incoming.Block;
140 if (MBB == &DefBlock) {
141 ReachableMap[&DefBlock] = true; // self-loop on DefBlock
142 continue;
143 }
144
145 ReachableMap.try_emplace(MBB, false);
146
147 // If this block has a divergent terminator and the def block is its
148 // post-dominator, the wave may first visit the other successors.
149 if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
150 append_range(Stack, MBB->successors());
151 }
152
153 while (!Stack.empty()) {
154 MachineBasicBlock *MBB = Stack.pop_back_val();
155 if (ReachableMap.try_emplace(MBB, false).second)
156 append_range(Stack, MBB->successors());
157 }
158
159 for (auto &[MBB, Reachable] : ReachableMap) {
160 bool HaveReachablePred = false;
161 for (MachineBasicBlock *Pred : MBB->predecessors()) {
162 if (ReachableMap.count(Pred)) {
163 HaveReachablePred = true;
164 } else {
165 Stack.push_back(Pred);
166 }
167 }
168 if (!HaveReachablePred)
169 Reachable = true;
170 if (HaveReachablePred) {
171 for (MachineBasicBlock *UnreachablePred : Stack) {
172 if (!llvm::is_contained(Predecessors, UnreachablePred))
173 Predecessors.push_back(UnreachablePred);
174 }
175 }
176 Stack.clear();
177 }
178 }
179};
180
181/// Helper class that detects loops which require us to lower an i1 COPY into
182/// bitwise manipulation.
183///
184/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
185/// between loops with the same header. Consider this example:
186///
187/// A-+-+
188/// | | |
189/// B-+ |
190/// | |
191/// C---+
192///
193/// A is the header of a loop containing A, B, and C as far as LoopInfo is
194/// concerned. However, an i1 COPY in B that is used in C must be lowered to
195/// bitwise operations to combine results from different loop iterations when
196/// B has a divergent branch (since by default we will compile this code such
197/// that threads in a wave are merged at the entry of C).
198///
199/// The following rule is implemented to determine whether bitwise operations
200/// are required: use the bitwise lowering for a def in block B if a backward
201/// edge to B is reachable without going through the nearest common
202/// post-dominator of B and all uses of the def.
203///
204/// TODO: This rule is conservative because it does not check whether the
205/// relevant branches are actually divergent.
206///
207/// The class is designed to cache the CFG traversal so that it can be re-used
208/// for multiple defs within the same basic block.
209///
210/// TODO: We could use region analysis to quickly skip over SESE regions during
211/// the traversal.
212///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable block, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  // Work list of blocks still to visit at the current level.
  SmallVector<MachineBasicBlock *, 4> Stack;
  // Blocks deferred to the next level (not post-dominated by VisitedPostDom).
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  /// Reset all cached traversal state and make \p MBB the new def block.
  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    // Lazily start the traversal on the first query.
    if (!VisitedPostDom)
      advanceLevel();

    unsigned Level = 0;
    // Walk up the post-dominator tree from the def block towards PostDom,
    // expanding the visited region one level at a time as we pass its
    // current boundary.
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      MachineRegisterInfo &MRI,
                      MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
                      ArrayRef<AMDGPU::Incoming> Incomings = {}) {
    assert(LoopLevel < CommonDominators.size());

    // Find a block dominating the loop level and all incoming blocks.
    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (auto &Incoming : Incomings)
      Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);

    if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
      SSAUpdater.AddAvailableValue(
          Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Incomings))
          SSAUpdater.AddAvailableValue(
              Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
      }
    }
  }

private:
  /// Whether \p MBB was visited at level <= \p LoopLevel, or is one of the
  /// explicitly listed \p Incomings blocks.
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<AMDGPU::Incoming> Incomings) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    for (auto &Incoming : Incomings)
      if (Incoming.Block == &MBB)
        return true;

    return false;
  }

  /// Expand the visited region by one post-dominance level, updating
  /// Visited, CommonDominators, VisitedPostDom, and FoundLoopLevel.
  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      // First call: level 0 consists of the def block itself.
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      // Move the boundary up to the next post-dominator and pull in the
      // deferred blocks that are now inside it (swap-with-back removal).
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        // An edge back to the def block is a backward edge; record the level
        // at which it became reachable.
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};
370
371} // End anonymous namespace.
372
375 return MRI->createVirtualRegister(LaneMaskRegAttrs);
376}
377
378static Register
380 MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
381 MachineFunction &MF = *MBB->getParent();
382 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
383 const SIInstrInfo *TII = ST.getInstrInfo();
384 Register UndefReg = AMDGPU::createLaneMaskReg(MRI, LaneMaskRegAttrs);
385 BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
386 UndefReg);
387 return UndefReg;
388}
389
390#ifndef NDEBUG
392 const MachineRegisterInfo &MRI,
393 Register Reg) {
394 unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
395 return Size == 1 || Size == 32;
396}
397#endif
398
399bool Vreg1LoweringHelper::lowerCopiesFromI1() {
400 bool Changed = false;
401 SmallVector<MachineInstr *, 4> DeadCopies;
402
403 for (MachineBasicBlock &MBB : *MF) {
404 for (MachineInstr &MI : MBB) {
405 if (MI.getOpcode() != AMDGPU::COPY)
406 continue;
407
408 Register DstReg = MI.getOperand(0).getReg();
409 Register SrcReg = MI.getOperand(1).getReg();
410 if (!isVreg1(SrcReg))
411 continue;
412
413 if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
414 continue;
415
416 Changed = true;
417
418 // Copy into a 32-bit vector register.
419 LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
420 const DebugLoc &DL = MI.getDebugLoc();
421
423 assert(!MI.getOperand(0).getSubReg());
424
425 ConstrainRegs.insert(SrcReg);
426 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
427 .addImm(0)
428 .addImm(0)
429 .addImm(0)
430 .addImm(-1)
431 .addReg(SrcReg);
432 DeadCopies.push_back(&MI);
433 }
434
435 for (MachineInstr *MI : DeadCopies)
436 MI->eraseFromParent();
437 DeadCopies.clear();
438 }
439 return Changed;
440}
441
445 : MF(MF), DT(DT), PDT(PDT), ST(&MF->getSubtarget<GCNSubtarget>()),
447 MRI = &MF->getRegInfo();
448
449 TII = ST->getInstrInfo();
450}
451
454 LoopFinder LF(*DT, *PDT);
455 PhiIncomingAnalysis PIA(*PDT, TII);
457 SmallVector<Incoming, 4> Incomings;
458
459 getCandidatesForLowering(Vreg1Phis);
460 if (Vreg1Phis.empty())
461 return false;
462
463 DT->updateDFSNumbers();
464 MachineBasicBlock *PrevMBB = nullptr;
465 for (MachineInstr *MI : Vreg1Phis) {
466 MachineBasicBlock &MBB = *MI->getParent();
467 if (&MBB != PrevMBB) {
468 LF.initialize(MBB);
469 PrevMBB = &MBB;
470 }
471
472 LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);
473
474 Register DstReg = MI->getOperand(0).getReg();
475 markAsLaneMask(DstReg);
477
479
480 // Sort the incomings such that incoming values that dominate other incoming
481 // values are sorted earlier. This allows us to do some amount of on-the-fly
482 // constant folding.
483 // Incoming with smaller DFSNumIn goes first, DFSNumIn is 0 for entry block.
484 llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
485 return DT->getNode(LHS.Block)->getDFSNumIn() <
486 DT->getNode(RHS.Block)->getDFSNumIn();
487 });
488
489#ifndef NDEBUG
490 PhiRegisters.insert(DstReg);
491#endif
492
493 // Phis in a loop that are observed outside the loop receive a simple but
494 // conservatively correct treatment.
495 std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
496 for (MachineInstr &Use : MRI->use_instructions(DstReg))
497 DomBlocks.push_back(Use.getParent());
498
499 MachineBasicBlock *PostDomBound =
500 PDT->findNearestCommonDominator(DomBlocks);
501
502 // FIXME: This fails to find irreducible cycles. If we have a def (other
503 // than a constant) in a pair of blocks that end up looping back to each
504 // other, it will be mishandle. Due to structurization this shouldn't occur
505 // in practice.
506 unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
507
508 SSAUpdater.Initialize(DstReg);
509
510 if (FoundLoopLevel) {
511 LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
512 Incomings);
513
514 for (auto &Incoming : Incomings) {
517 }
518
519 for (auto &Incoming : Incomings) {
524 }
525 } else {
526 // The phi is not observed from outside a loop. Use a more accurate
527 // lowering.
528 PIA.analyze(MBB, Incomings);
529
530 for (MachineBasicBlock *MBB : PIA.predecessors())
533
534 for (auto &Incoming : Incomings) {
536 if (PIA.isSource(IMBB)) {
539 } else {
542 }
543 }
544
545 for (auto &Incoming : Incomings) {
547 continue;
548
553 }
554 }
555
557 if (NewReg != DstReg) {
558 replaceDstReg(NewReg, DstReg, &MBB);
559 MI->eraseFromParent();
560 }
561
562 Incomings.clear();
563 }
564 return true;
565}
566
567bool Vreg1LoweringHelper::lowerCopiesToI1() {
568 bool Changed = false;
570 LoopFinder LF(*DT, *PDT);
572
573 for (MachineBasicBlock &MBB : *MF) {
574 LF.initialize(MBB);
575
576 for (MachineInstr &MI : MBB) {
577 if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
578 MI.getOpcode() != AMDGPU::COPY)
579 continue;
580
581 Register DstReg = MI.getOperand(0).getReg();
582 if (!isVreg1(DstReg))
583 continue;
584
585 Changed = true;
586
587 if (MRI->use_empty(DstReg)) {
588 DeadCopies.push_back(&MI);
589 continue;
590 }
591
592 LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
593
594 markAsLaneMask(DstReg);
595 initializeLaneMaskRegisterAttributes(DstReg);
596
597 if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
598 continue;
599
600 const DebugLoc &DL = MI.getDebugLoc();
601 Register SrcReg = MI.getOperand(1).getReg();
602 assert(!MI.getOperand(1).getSubReg());
603
604 if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
605 assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
606 Register TmpReg = AMDGPU::createLaneMaskReg(MRI, LaneMaskRegAttrs);
607 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
608 .addReg(SrcReg)
609 .addImm(0);
610 MI.getOperand(1).setReg(TmpReg);
611 SrcReg = TmpReg;
612 } else {
613 // SrcReg needs to be live beyond copy.
614 MI.getOperand(1).setIsKill(false);
615 }
616
617 // Defs in a loop that are observed outside the loop must be transformed
618 // into appropriate bit manipulation.
619 std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
620 for (MachineInstr &Use : MRI->use_instructions(DstReg))
621 DomBlocks.push_back(Use.getParent());
622
623 MachineBasicBlock *PostDomBound =
624 PDT->findNearestCommonDominator(DomBlocks);
625 unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
626 if (FoundLoopLevel) {
627 SSAUpdater.Initialize(DstReg);
628 SSAUpdater.AddAvailableValue(&MBB, DstReg);
629 LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);
630
631 buildMergeLaneMasks(MBB, MI, DL, DstReg,
632 SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
633 DeadCopies.push_back(&MI);
634 }
635 }
636
637 for (MachineInstr *MI : DeadCopies)
638 MI->eraseFromParent();
639 DeadCopies.clear();
640 }
641 return Changed;
642}
643
645 bool &Val) const {
646 const MachineInstr *MI;
647 for (;;) {
648 MI = MRI->getUniqueVRegDef(Reg);
649 if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
650 return true;
651
652 if (MI->getOpcode() != AMDGPU::COPY)
653 break;
654
655 Reg = MI->getOperand(1).getReg();
656 if (!Reg.isVirtual())
657 return false;
658 if (!isLaneMaskReg(Reg))
659 return false;
660 }
661
662 if (MI->getOpcode() != LMC->MovOpc)
663 return false;
664
665 if (!MI->getOperand(1).isImm())
666 return false;
667
668 int64_t Imm = MI->getOperand(1).getImm();
669 if (Imm == 0) {
670 Val = false;
671 return true;
672 }
673 if (Imm == -1) {
674 Val = true;
675 return true;
676 }
677
678 return false;
679}
680
681static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
682 Def = false;
683 Use = false;
684
685 for (const MachineOperand &MO : MI.operands()) {
686 if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
687 if (MO.isUse())
688 Use = true;
689 else
690 Def = true;
691 }
692 }
693}
694
695/// Return a point at the end of the given \p MBB to insert SALU instructions
696/// for lane mask calculation. Take terminators and SCC into account.
699 auto InsertionPt = MBB.getFirstTerminator();
700 bool TerminatorsUseSCC = false;
701 for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
702 bool DefsSCC;
703 instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
704 if (TerminatorsUseSCC || DefsSCC)
705 break;
706 }
707
708 if (!TerminatorsUseSCC)
709 return InsertionPt;
710
711 while (InsertionPt != MBB.begin()) {
712 InsertionPt--;
713
714 bool DefSCC, UseSCC;
715 instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
716 if (DefSCC)
717 return InsertionPt;
718 }
719
720 // We should have at least seen an IMPLICIT_DEF or COPY
721 llvm_unreachable("SCC used by terminator but no def in block");
722}
723
724// VReg_1 -> SReg_32 or SReg_64
725void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
726 MRI->setRegClass(DstReg, ST->getBoolRC());
727}
728
729void Vreg1LoweringHelper::getCandidatesForLowering(
730 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
731 for (MachineBasicBlock &MBB : *MF) {
732 for (MachineInstr &MI : MBB.phis()) {
733 if (isVreg1(MI.getOperand(0).getReg()))
734 Vreg1Phis.push_back(&MI);
735 }
736 }
737}
738
739void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
740 const MachineInstr *MI,
741 SmallVectorImpl<AMDGPU::Incoming> &Incomings) const {
742 for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
743 assert(i + 1 < MI->getNumOperands());
744 Register IncomingReg = MI->getOperand(i).getReg();
745 MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
746 MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
747
748 if (IncomingDef->getOpcode() == AMDGPU::COPY) {
749 IncomingReg = IncomingDef->getOperand(1).getReg();
750 assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
751 assert(!IncomingDef->getOperand(1).getSubReg());
752 } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
753 continue;
754 } else {
755 assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
756 }
757
758 Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
759 }
760}
761
// Rewrite all occurrences of NewReg to OldReg. Note the argument order:
// replaceRegWith(FromReg, ToReg) replaces FromReg with ToReg, so it is the
// PHI's original destination register (OldReg) that survives.
void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                        MachineBasicBlock *MBB) {
  MRI->replaceRegWith(NewReg, OldReg);
}
766
767void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
769 const DebugLoc &DL,
770 Register DstReg, Register PrevReg,
771 Register CurReg) {
772 bool PrevVal = false;
773 bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
774 bool CurVal = false;
775 bool CurConstant = isConstantLaneMask(CurReg, CurVal);
776
777 if (PrevConstant && CurConstant) {
778 if (PrevVal == CurVal) {
779 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
780 } else if (CurVal) {
781 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(LMC->ExecReg);
782 } else {
783 BuildMI(MBB, I, DL, TII->get(LMC->XorOpc), DstReg)
784 .addReg(LMC->ExecReg)
785 .addImm(-1);
786 }
787 return;
788 }
789
790 Register PrevMaskedReg;
791 Register CurMaskedReg;
792 if (!PrevConstant) {
793 if (CurConstant && CurVal) {
794 PrevMaskedReg = PrevReg;
795 } else {
796 PrevMaskedReg = AMDGPU::createLaneMaskReg(MRI, LaneMaskRegAttrs);
797 BuildMI(MBB, I, DL, TII->get(LMC->AndN2Opc), PrevMaskedReg)
798 .addReg(PrevReg)
799 .addReg(LMC->ExecReg);
800 }
801 }
802 if (!CurConstant) {
803 // TODO: check whether CurReg is already masked by EXEC
804 if (PrevConstant && PrevVal) {
805 CurMaskedReg = CurReg;
806 } else {
807 CurMaskedReg = AMDGPU::createLaneMaskReg(MRI, LaneMaskRegAttrs);
808 BuildMI(MBB, I, DL, TII->get(LMC->AndOpc), CurMaskedReg)
809 .addReg(CurReg)
810 .addReg(LMC->ExecReg);
811 }
812 }
813
814 if (PrevConstant && !PrevVal) {
815 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
816 .addReg(CurMaskedReg);
817 } else if (CurConstant && !CurVal) {
818 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
819 .addReg(PrevMaskedReg);
820 } else if (PrevConstant && PrevVal) {
821 BuildMI(MBB, I, DL, TII->get(LMC->OrN2Opc), DstReg)
822 .addReg(CurMaskedReg)
823 .addReg(LMC->ExecReg);
824 } else {
825 BuildMI(MBB, I, DL, TII->get(LMC->OrOpc), DstReg)
826 .addReg(PrevMaskedReg)
827 .addReg(CurMaskedReg ? CurMaskedReg : LMC->ExecReg);
828 }
829}
830
// Deliberately a no-op in this (SelectionDAG) lowering; the override exists
// only to satisfy the pure-virtual PhiLoweringHelper interface.
void Vreg1LoweringHelper::constrainAsLaneMask(AMDGPU::Incoming &In) {}
832
833/// Lower all instructions that def or use vreg_1 registers.
834///
835/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
836/// occur around inline assembly. We do this first, before vreg_1 registers
837/// are changed to scalar mask registers.
838///
839/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
840/// all others, because phi lowering looks through copies and can therefore
841/// often make copy lowering unnecessary.
844 // Only need to run this in SelectionDAG path.
845 if (MF.getProperties().hasSelected())
846 return false;
847
848 Vreg1LoweringHelper Helper(&MF, &MDT, &MPDT);
849 bool Changed = false;
850 Changed |= Helper.lowerCopiesFromI1();
851 Changed |= Helper.lowerPhis();
852 Changed |= Helper.lowerCopiesToI1();
853 return Helper.cleanConstrainRegs(Changed);
854}
855
856PreservedAnalyses
869
871public:
872 static char ID;
873
875
876 bool runOnMachineFunction(MachineFunction &MF) override;
877
878 StringRef getPassName() const override { return "SI Lower i1 Copies"; }
879
886};
887
895
897 false, false)
902
903char SILowerI1CopiesLegacy::ID = 0;
904
906
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use)
static Register insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs)
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT, MachinePostDominatorTree &MPDT)
Lower all instructions that def or use vreg_1 registers.
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, Register Reg)
Interface definition of the PhiLoweringHelper class that implements lane mask merging algorithm for d...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
bool isLaneMaskReg(Register Reg) const
MachinePostDominatorTree * PDT
virtual void replaceDstReg(Register NewReg, Register OldReg, MachineBasicBlock *MBB)=0
MachineBasicBlock::iterator getSaluInsertionAtEnd(MachineBasicBlock &MBB) const
Return a point at the end of the given MBB to insert SALU instructions for lane mask calculation.
bool isConstantLaneMask(Register Reg, bool &Val) const
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs
void initializeLaneMaskRegisterAttributes(Register LaneMask)
virtual void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg)=0
virtual void getCandidatesForLowering(SmallVectorImpl< MachineInstr * > &Vreg1Phis) const =0
const AMDGPU::LaneMaskConstants * LMC
DenseSet< Register > PhiRegisters
virtual void markAsLaneMask(Register DstReg) const =0
virtual void constrainAsLaneMask(Incoming &In)=0
virtual void collectIncomingValuesFromPhi(const MachineInstr *MI, SmallVectorImpl< Incoming > &Incomings) const =0
PhiLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, MachinePostDominatorTree *PDT)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
A debug info location.
Definition DebugLoc.h:123
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
iterator end()
Definition DenseMap.h:81
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
DomTreeNodeBase * getIDom() const
NodeT * getBlock() const
NodeT * findNearestCommonDominator(NodeT *A, NodeT *B) const
Find nearest common dominator basic block for basic block A and B.
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const HexagonRegisterInfo & getRegisterInfo() const
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
const MachineFunctionProperties & getProperties() const
Get the function properties.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
LLVM_ABI MachineBasicBlock * findNearestCommonDominator(ArrayRef< MachineBasicBlock * > Blocks) const
Returns the nearest common dominator of the given blocks.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
iterator_range< use_instr_iterator > use_instructions(Register Reg) const
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or null if none is found.
MachineSSAUpdater - This class updates SSA form for a set of virtual registers defined in multiple blocks.
void Initialize(Register V)
Initialize - Reset this object to get ready for a new set of SSA updates.
Register GetValueInMiddleOfBlock(MachineBasicBlock *BB, bool ExistingValueOnly=false)
GetValueInMiddleOfBlock - Construct SSA form, materializing a value that is live in the middle of the specified block.
void AddAvailableValue(MachineBasicBlock *BB, Register V)
AddAvailableValue - Indicate that a rewritten value is available at the end of the specified block with the specified value.
iterator find(const KeyT &Key)
Definition MapVector.h:154
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
AnalysisType & getAnalysis() const
getAnalysis&lt;AnalysisType&gt;() - This function is used by subclasses to get to the analysis information that they claim to use by overriding getAnalysisUsage.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
Helper class for SSA formation on a set of values defined in multiple blocks.
Definition SSAUpdater.h:39
void Initialize(Type *Ty, StringRef Name)
Reset this object to get ready for a new set of SSA updates with type 'Ty'.
Value * GetValueInMiddleOfBlock(BasicBlock *BB)
Construct SSA form, materializing a value that is live in the middle of the specified block.
void AddAvailableValue(BasicBlock *BB, Value *V)
Indicate that a rewritten value is available in the specified block with the specified value.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on element size.
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Register createLaneMaskReg(MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
DomTreeNodeBase< MachineBasicBlock > MachineDomTreeNode
ArrayRef(const T &OneElt) -> ArrayRef< T >
FunctionPass * createSILowerI1CopiesLegacyPass()
char & SILowerI1CopiesLegacyID
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are taken from machine instruction.
MachineBasicBlock * Block
All attributes(register class or bank and low-level type) a virtual register can have.