//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
//
//===----------------------------------------------------------------------===//

#include "AMDGPURegBankLegalizeHelper.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"

using namespace llvm;
using namespace AMDGPU;

RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
      MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
      RBLRules(RBLRules), IsWave32(ST.isWave32()),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
  const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
  if (!RuleSet) {
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "No AMDGPU RegBankLegalize rules defined for opcode",
                       MI);
    return false;
  }

  const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
  if (!Mapping) {
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: none of the rules defined with "
                       "'Any' for MI's opcode matched MI",
                       MI);
    return false;
  }

  SmallSet<Register, 4> WaterfallSgprs;
  unsigned OpIdx = 0;
  if (Mapping->DstOpMapping.size() > 0) {
    B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
    if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
      return false;
  }
  if (Mapping->SrcOpMapping.size() > 0) {
    B.setInstr(MI);
    if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WaterfallSgprs))
      return false;
  }

  if (!lower(MI, *Mapping, WaterfallSgprs))
    return false;

  return true;
}

bool RegBankLegalizeHelper::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) {
  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction &MF = B.getMF();

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
  if (IsWave32) {
    MovExecOpc = AMDGPU::S_MOV_B32;
    MovExecTermOpc = AMDGPU::S_MOV_B32_term;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    ExecReg = AMDGPU::EXEC_LO;
  } else {
    MovExecOpc = AMDGPU::S_MOV_B64;
    MovExecTermOpc = AMDGPU::S_MOV_B64_term;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    ExecReg = AMDGPU::EXEC;
  }

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);

  Register SavedExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before
  // this point to a new block, and insert a new empty block before this
  // instruction.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, BodyBB);
  MF.insert(MBBI, RestoreExecBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // +-MBB:------------+
  // | ...             |
  // | %0 = G_INST_1   |
  // | %Dst = MI %Vgpr |
  // | %1 = G_INST_2   |
  // | ...             |
  // +-----------------+
  // ->
  // +-MBB-------------------------------+
  // | ...                               |
  // | %0 = G_INST_1                     |
  // | %SaveExecReg = S_MOV_B32 $exec_lo |
  // +----------------|------------------+
  //                  | /------------------------------|
  //                  V V                              |
  // +-LoopBB---------------------------------------------------------------+ |
  // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr                       | |
  // |   instead of executing for each lane, see if other lanes had         | |
  // |   same value for %Vgpr and execute for them also.                    | |
  // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr                   | |
  // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask  | |
  // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM                            | |
  // |   exec is active for lanes with the same "CurrentLane value" in Vgpr  | |
  // +----------------|-----------------------------------------------------+ |
  //                  V                                                       |
  // +-BodyBB------------------------------------------------------------+   |
  // | %Dst = MI %CurrentLaneReg:sgpr(s32)                                |   |
  // |   executed only for active lanes and written to Dst                |   |
  // | $exec = S_XOR_B32 $exec, %SavedExec                                |   |
  // |   set active lanes to 0 in SavedExec, lanes that did not write to  |   |
  // |   Dst yet, and set this as new exec (for READFIRSTLANE and ICMP)   |   |
  // | SI_WATERFALL_LOOP LoopBB                                           |---|
  // +----------------|--------------------------------------------------+
  //                  V
  // +-RestoreExecBB--------------------------+
  // | $exec_lo = S_MOV_B32_term %SaveExecReg |
  // +----------------|-----------------------+
  //                  V
  // +-RemainderBB:----------------------+
  // | %1 = G_INST_2                     |
  // | ...                               |
  // +-----------------------------------+

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
  auto NewEnd = BodyBB->end();
  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  B.setMBB(*LoopBB);
  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      // TODO: support for agpr
      assert(MRI.getRegBank(OpReg) == VgprRB);
      Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
      buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);

      // Build the comparison(s), CurrentLaneReg == OpReg.
      unsigned OpSize = OpTy.getSizeInBits();
      unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
        auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
        B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);

        if (!CondReg)
          CondReg = CmpReg;
        else
          CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
  Register CondRegLM =
      MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
  B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);

  // Update EXEC, save the original EXEC value to SavedExec.
  B.buildInstr(AndSaveExecOpc)
      .addDef(SavedExec)
      .addReg(CondRegLM, RegState::Kill);
  MRI.setSimpleHint(SavedExec, CondRegLM);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  B.setInsertPt(MBB, MBB.end());
  B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
  B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset =
          B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }

  if (!MergeTy.isValid()) {
    // Loads are all of the same size; concat or merge them together.
    B.buildMergeLikeInstr(Dst, LoadPartRegs);
  } else {
    // Loads are not all of the same size; unmerge them to smaller pieces of
    // MergeTy type, then merge the pieces to Dst.
    SmallVector<Register, 4> MergeTyParts;
    for (Register Reg : LoadPartRegs) {
      if (MRI.getType(Reg) == MergeTy) {
        MergeTyParts.push_back(Reg);
      } else {
        auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
        for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
          MergeTyParts.push_back(Unmerge.getReg(i));
      }
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
  return true;
}
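
// Illustrative sketch, not part of the original source (register names are
// hypothetical): splitLoad with LLTBreakdown = {s128, s128} and an invalid
// MergeTy rewrites a 256-bit load roughly as
//
//   %v:sgpr(s256) = G_LOAD %base(p4) :: (load (s256))
//     -->
//   %lo:sgpr(s128)   = G_LOAD %base(p4) :: (load (s128))
//   %off:sgpr(s64)   = G_CONSTANT i64 16
//   %base16:sgpr(p4) = G_PTR_ADD %base, %off
//   %hi:sgpr(s128)   = G_LOAD %base16(p4) :: (load (s128))
//   %v:sgpr(s256)    = G_MERGE_VALUES %lo, %hi
//
// When the pieces differ in size (e.g. {s64, s32} with MergeTy = s32), each
// piece is first unmerged to MergeTy before the final merge into Dst.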

bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();

  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
  return true;
}
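
// Illustrative sketch, not part of the original source (register names are
// hypothetical): widenLoad(MI, V8S16, V2S16) on a <6 x s16> load performs one
// wider load and keeps only the first NumElts = 3 unmerged pieces:
//
//   %v:sgpr(<6 x s16>) = G_LOAD %base(p4) :: (load (<6 x s16>))
//     -->
//   %w:sgpr(<8 x s16>) = G_LOAD %base(p4) :: (load (<8 x s16>))
//   %p0, %p1, %p2, %p3 = G_UNMERGE_VALUES %w    ; <2 x s16> each
//   %v:sgpr(<6 x s16>) = G_CONCAT_VECTORS %p0, %p1, %p2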

bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
  Register Dst = MI.getDstReg();
  Register Ptr = MI.getPointerReg();
  MachineMemOperand &MMO = MI.getMMO();
  unsigned MemSize = 8 * MMO.getSize().getValue();

  MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);

  if (MI.getOpcode() == G_LOAD) {
    B.buildLoad(Dst, Ptr, *WideMMO);
  } else {
    auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);

    if (MI.getOpcode() == G_ZEXTLOAD) {
      APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
      auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
      B.buildAnd(Dst, Load, MaskCst);
    } else {
      assert(MI.getOpcode() == G_SEXTLOAD);
      B.buildSExtInReg(Dst, Load, MemSize);
    }
  }

  MI.eraseFromParent();
  return true;
}
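
// Illustrative sketch, not part of the original source (register names are
// hypothetical): widening a uniform extending load whose memory size is
// below 32 bits.
//
//   %d:sgpr(s32) = G_ZEXTLOAD %p(p4) :: (load (s8))
//     -->
//   %w:sgpr(s32) = G_LOAD %p(p4) :: (load (s32))
//   %m:sgpr(s32) = G_CONSTANT i32 255
//   %d:sgpr(s32) = G_AND %w, %m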

bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    switch (Opc) {
    case G_SEXT:
      Hi = Lo;
      break;
    case G_ZEXT:
      Hi = False;
      break;
    case G_ANYEXT:
      Hi = B.buildUndef({VgprRB_S32});
      break;
    default:
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
      return false;
    }

    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  } else {
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
    return false;
  }

  MI.eraseFromParent();
  return true;
}
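
// Illustrative sketch, not part of the original source (register names are
// hypothetical): a divergent s1 in vcc extended to s32 becomes a select
// between the extension constants.
//
//   %d:vgpr(s32) = G_SEXT %c:vcc(s1)
//     -->
//   %t:vgpr(s32) = G_CONSTANT i32 -1
//   %f:vgpr(s32) = G_CONSTANT i32 0
//   %d:vgpr(s32) = G_SELECT %c:vcc(s1), %t, %f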

std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register>
RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
  auto [Lo32, Hi32] = unpackAExt(Reg);
  return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
          B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
}
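
// Illustrative sketch, not part of the original source (register names are
// hypothetical): unpacking a uniform <2 x s16> into two s32 halves for
// scalarization. unpackZExt(%v:sgpr(<2 x s16>)) yields roughly:
//
//   %p:sgpr(s32)  = G_BITCAST %v(<2 x s16>)
//   %m:sgpr(s32)  = G_CONSTANT i32 65535
//   %lo:sgpr(s32) = G_AND %p, %m
//   %c:sgpr(s32)  = G_CONSTANT i32 16
//   %hi:sgpr(s32) = G_LSHR %p, %c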

bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
        MI);
    return false;
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX: {
    // For signed operations, use sign extension.
    auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    // For unsigned operations, use zero extension.
    auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  default:
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
    return false;
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
                          {ResLo.getReg(0), ResHi.getReg(0)});
  MI.eraseFromParent();
  return true;
}

static bool isSignedBFE(MachineInstr &MI) {
  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
    return (GI->is(Intrinsic::amdgcn_sbfe));

  return MI.getOpcode() == AMDGPU::G_SBFX;
}

bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract a bitfield from Src. LSBit is the least-significant bit of the
  // extraction (field offset) and Width is the size of the bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments below are for signed bitfield extract; unsigned is similar.
  // 'x' is the sign bit of Src, 's' is the sign of the bitfield, 'l' is its
  // LSB and 'y' are the remaining bits of the bitfield to extract.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return true;
  }

  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
  return true;
}
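
// Illustrative sketch, not part of the original source (register names are
// hypothetical): a divergent G_SBFX %dst(s64), %src, 4, 8 with constant
// width <= 32 takes the path above:
//
//   %shr:vgpr(s64) = G_ASHR %src, 4        ; field now starts at bit 0
//   %lo, %hi       = G_UNMERGE_VALUES %shr ; vgpr(s32) halves
//   %l:vgpr(s32)   = G_SBFX %lo, 0, 8      ; sign-extended low half
//   %h:vgpr(s32)   = G_ASHR %l, 31         ; replicate sign into high half
//   %dst:vgpr(s64) = G_MERGE_VALUES %l, %h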

bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // For uniform bitfield extract there are 4 available instructions, but
  // LSBit (field offset) and Width (size of bitfield) need to be packed into
  // an S32, with the field offset in the low and the size in the high 16 bits.

  // Src1 Hi16|Lo16 = Size|FieldOffset
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select the machine instruction; because of reg class constraining, insert
  // copies from reg class to reg bank.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                        *ST.getRegisterInfo(), RBI)) {
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerS_BFE, failed to constrain BFE", MI);
    return false;
  }

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}
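
// Usage note, not part of the original source: the scalar BFE instructions
// take a single packed control operand, so the code above materializes
// src1 = (LSBit & 0x3f) | (Width << 16), i.e. the field offset in the low
// bits and the field size starting at bit 16, then constrains the selected
// machine instruction and bridges reg class and reg bank with copies.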

bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == S64);
  auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());

  // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
  // match GlobalISel with old regbankselect.
  auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
  auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
  auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
  auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
  auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
  auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
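
// Worked decomposition, not part of the original source: with
// A = AHi * 2^32 + ALo and B = BHi * 2^32 + BLo,
//
//   A * B mod 2^64 = (ALo * BLo)
//                  + 2^32 * ((umulh(ALo, BLo) + ALo * BHi + AHi * BLo)
//                            mod 2^32)
//
// which matches the instructions above: one low mul, one umulh for the carry
// into the high half, two cross muls, and two adds.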

bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == V2S16);
  unsigned Opc = MI.getOpcode();
  unsigned NumOps = MI.getNumOperands();
  auto Flags = MI.getFlags();

  auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());

  if (NumOps == 2) {
    auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
    auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
    MI.eraseFromParent();
    return true;
  }

  auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());

  if (NumOps == 3) {
    auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
    auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
    MI.eraseFromParent();
    return true;
  }

  assert(NumOps == 4);
  auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
  auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
  auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register Src1 = MI.getOperand(3).getReg();
  Register Src2 = MI.getOperand(4).getReg();

  const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();

  // Keep the multiplication on the SALU.
  Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
  Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
  if (ST.hasScalarMulHiInsts()) {
    B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
  } else {
    auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
    auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
    auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
    buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
  }

  // Accumulate and produce the "carry-out" bit.

  // The "carry-out" is defined as bit 64 of the result when computed as a
  // big integer. For unsigned multiply-add, this matches the usual
  // definition of carry-out.
  if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
    // No accumulate: result is just the multiplication, carry is 0.
    B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
    B.buildConstant(Dst1, 0);
  } else {
    // Accumulate: add Src2 to the multiplication result with carry chain.
    Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
    Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
    B.buildUnmerge({Src2Lo, Src2Hi}, Src2);

    auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
    auto AddHi =
        B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
    B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
    B.buildCopy(Dst1, AddHi.getReg(1));
  }

  MI.eraseFromParent();
  return true;
}
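
// Usage note, not part of the original source: when the addend of
// G_AMDGPU_MAD_U64_U32 is matched as zero, e.g.
//
//   %r:sgpr(s64), %c = G_AMDGPU_MAD_U64_U32 %a:sgpr(s32), %b:sgpr(s32), 0
//
// the lowering keeps only the multiply (a scalar mul plus a scalar mulh, or
// a read-any-laned VALU mulh for the high half) and a constant-0 carry-out,
// skipping the uaddo/uadde chain entirely.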

bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
  if (Amt <= 32) {
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  SmallSet<Register, 4> &WaterfallSgprs) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1; the most common case is a
    // compare. We build a select here. The S1 condition was already
    // 'any-extended to S32' + 'ANDed with 1 to clean the high bits' by
    // Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return true;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  case ScalarizeToS16:
    return lowerSplitTo16(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: Ext32To64, unsupported opcode",
                         MI);
      return false;
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return true;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return true;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take the lowest bit from each lane and put it in a lane mask. Lower via
    // a compare, but clean the high bits first since the compare looks at all
    // bits in the register.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return true;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case UniMAD64:
    return lowerUniMAD64(MI);
  case UniMul64: {
    B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
    MI.eraseFromParent();
    return true;
  }
  case DivSMulToMAD: {
    auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
    auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
    auto Zero = B.buildConstant({VgprRB, S64}, 0);

    unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;

    B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
                 {Op1, Op2, Zero});
    MI.eraseFromParent();
    return true;
  }
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Mul:
    return lowerSplitTo32Mul(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads.
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                           "AMDGPU RegBankLegalize: SplitLoad, unsupported type",
                           MI);
        return false;
      }
    }
    // 64- and 32-bit loads.
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: SplitLoad, unsupported type",
                         MI);
      return false;
    }
    return true;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: WidenLoad, unsupported type",
                         MI);
      return false;
    }
    return true;
  }
  case UnpackAExt:
    return lowerUnpackAExt(MI);
  case WidenMMOToS32:
    return widenMMOToS32(cast<GAnyLoad>(MI));
  }

  if (!WaterfallSgprs.empty()) {
    MachineBasicBlock::iterator I = MI.getIterator();
    if (!executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs))
      return false;
  }
  return true;
}

LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Vgpr32:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
  case UniInVgprS64:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case SgprP0:
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP2:
  case VgprP2:
    return LLT::pointer(2, 32);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprP8:
    return LLT::pointer(8, 128);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
  case UniInVgprV2S32:
    return LLT::fixed_vector(2, 32);
  case VgprV3S32:
    return LLT::fixed_vector(3, 32);
  case SgprV4S32:
  case SgprV4S32_WF:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  case VgprV2S64:
  case UniInVgprV2S64:
    return LLT::fixed_vector(2, 64);
  default:
    return LLT();
  }
}

LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
        isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  default:
    return LLT();
  }
}

const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP0:
  case SgprP1:
  case SgprP2:
  case SgprP3:
  case SgprP4:
  case SgprP5:
  case SgprP8:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprS64:
  case UniInVgprV2S16:
  case UniInVgprV2S32:
  case UniInVgprV4S32:
  case UniInVgprV2S64:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP2:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV2S64:
  case VgprV3S32:
  case VgprV4S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB256:
  case VgprB512:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return VgprRB;
  default:
    return nullptr;
  }
}

bool RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0.
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg)) {
        auto CopyS32_Vcc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
        B.buildTrunc(Reg, CopyS32_Vcc);
      }
      break;
    }
    case UniInVgprS16: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
      buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
      B.buildTrunc(Reg, NewSgprDstS32);
      break;
    }
    case UniInVgprS32:
    case UniInVgprS64:
    case UniInVgprV2S16:
    case UniInVgprV2S32:
    case UniInVgprV4S32:
    case UniInVgprV2S64: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB256:
    case UniInVgprB512: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg))
        B.buildTrunc(Reg, NewDst);
      break;
    }
    case InvalidMapping: {
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
      return false;
    }
    default:
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
      return false;
    }
  }

  return true;
}

bool RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // sgpr waterfall, scalars and vectors
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB)
        SgprWaterfallOperandRegs.insert(Reg);
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // Zext of SgprS1 is not legal; use an AND with 1 instead. This
      // instruction is most of the time meant to be combined away in
      // AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    case Sgpr32SExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    case Vgpr32AExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Vgpr32SExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Vgpr32ZExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    default:
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
      return false;
    }
  }
  return true;
}

bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());

    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }

    return true;
  }

  // ALL divergent i1 phis should have been lowered and inst-selected into PHI
  // with sgpr reg class and S1 LLT in the AMDGPUGlobalISelDivergenceLowering
  // pass. Note: this includes divergent phis that don't require lowering.
  if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI",
                       MI);
    return false;
  }

  // We accept all types that can fit in some register class.
  // Uniform G_PHIs have all sgpr registers.
  // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
  if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
      Ty == LLT::pointer(4, 64)) {
    return true;
  }

  reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                     "AMDGPU RegBankLegalize: type not supported for G_PHI",
                     MI);
  return false;
}
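
// Illustrative sketch, not part of the original source (register names are
// hypothetical): a uniform s1 G_PHI after the rewrite above.
//
//   %d:sgpr(s1) = G_PHI %a(s1), %bb.1, %b(s1), %bb.2
//     -->
//   (in %bb.1)  %a32:sgpr(s32) = G_ANYEXT %a(s1)
//   (in %bb.2)  %b32:sgpr(s32) = G_ANYEXT %b(s1)
//   %d32:sgpr(s32) = G_PHI %a32, %bb.1, %b32, %bb.2
//   %d:sgpr(s1)    = G_TRUNC %d32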

[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
                                                     const RegisterBank *RB,
                                                     MachineRegisterInfo &MRI,
                                                     unsigned StartOpIdx,
                                                     unsigned EndOpIdx) {
  for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
    if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
      return false;
  }
  return true;
}

void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
  const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
  // Put RB on all registers.
  unsigned NumDefs = MI.getNumDefs();
  unsigned NumOperands = MI.getNumOperands();

  assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
  if (RB == SgprRB)
    assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));

  if (RB == VgprRB) {
    B.setInstr(MI);
    for (unsigned i = NumDefs; i < NumOperands; ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != RB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        MI.getOperand(i).setReg(Copy.getReg(0));
      }
    }
  }
}