LLVM 23.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
20#include "GCNSubtarget.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28
29#define DEBUG_TYPE "amdgpu-regbanklegalize"
30
31using namespace llvm;
32using namespace AMDGPU;
33
// Tail of a constructor definition: its last two parameters, the
// member-initializer list and an empty body.
// NOTE(review): the line that opens this definition (name and leading
// parameters) is not part of this listing; presumably this is the
// RegBankLegalizeHelper constructor -- confirm against the header.
36 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
// MF comes from the builder B; ST is MF's GCNSubtarget.
37 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
// MRI is the builder's MachineRegisterInfo; MORE is constructed from MF and
// a null second argument.
38 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
// Wave size is queried once from the subtarget and cached.
39 RBLRules(RBLRules), IsWave32(ST.isWave32()),
// Cache pointers to the SGPR, VGPR, AGPR and VCC register banks.
40 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
41 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
42 AgprRB(&RBI.getRegBank(AMDGPU::AGPRRegBankID)),
43 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
44
// Body of the routine that legalizes one instruction MI: look up the rule set
// for MI's opcode, pick the mapping that matches MI, apply the destination and
// source operand mappings, run the lowering, and finally wrap the result in a
// waterfall loop if any operand requested one. Returns false as soon as a step
// fails, true otherwise.
// NOTE(review): the signature line of this definition is not part of this
// listing.
46 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
47 if (!RuleSet) {
48 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
49 "No AMDGPU RegBankLegalize rules defined for opcode",
50 MI);
51 return false;
52 }
53
54 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
55 if (!Mapping) {
56 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
57 "AMDGPU RegBankLegalize: none of the rules defined with "
58 "'Any' for MI's opcode matched MI",
59 MI);
60 return false;
61 }
62
// WFI is filled in by applyMappingSrc and consumed by lower() and the
// waterfall step below. OpIdx is shared by the dst and src mapping calls.
63 WaterfallInfo WFI;
64 unsigned OpIdx = 0;
65 if (!Mapping->DstOpMapping.empty()) {
// Instructions built while mapping destinations are inserted right after MI.
66 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
67 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
68 return false;
69 }
70 if (!Mapping->SrcOpMapping.empty()) {
// Instructions built while mapping sources are inserted before MI.
71 B.setInstr(MI);
72 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
73 return false;
74 }
75
76 if (!lower(MI, *Mapping, WFI))
77 return false;
78
// Only build the waterfall loop when some operand was recorded for it.
79 if (!WFI.SgprWaterfallOperandRegs.empty()) {
80 if (!executeInWaterfallLoop(B, WFI))
81 return false;
82 }
83
84 return true;
85}
86
/// Wrap the instruction range described by \p WFI in a waterfall loop.
///
/// The current block is split into MBB -> LoopBB -> BodyBB -> RestoreExecBB ->
/// RemainderBB (BodyBB also branches back to LoopBB). The range is moved into
/// BodyBB. For every use operand in the range whose register is listed in
/// WFI.SgprWaterfallOperandRegs, LoopBB gets a readfirstlane of that register
/// plus compares of the result against the original register; the operand is
/// rewritten to use the readfirstlane result. EXEC is saved before the loop,
/// narrowed in LoopBB, updated at the end of BodyBB and restored in
/// RestoreExecBB. On return the builder's insert point is the start of
/// RemainderBB.
///
/// \returns true (no failure path is visible in this listing).
///
/// NOTE(review): several lines of this definition are absent from this
/// listing, including the declarations of BeginIt, EndIt, LMC, LoopBB, BodyBB,
/// MBBI and OpParts.
87bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
88 const WaterfallInfo &WFI) {
89 assert(WFI.Start.isValid() && WFI.End.isValid() &&
90 "Waterfall range not initialized");
91
92 // Track use registers which have already been expanded with a readfirstlane
93 // sequence. This may have multiple uses if moving a sequence.
94 DenseMap<Register, Register> WaterfalledRegMap;
95
96 MachineBasicBlock &MBB = B.getMBB();
97 MachineFunction &MF = B.getMF();
98
101
102 const SIRegisterInfo *TRI = ST.getRegisterInfo();
103 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
105
// Debug-only: remember the range length so it can be checked after splicing.
106#ifndef NDEBUG
107 const int OrigRangeSize = std::distance(BeginIt, EndIt);
108#endif
109
110 MachineRegisterInfo &MRI = *B.getMRI();
111 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
112 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
113
114 // Don't bother using generic instructions/registers for the exec mask.
115 B.setInstr(*WFI.Start);
116 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
117
118 Register SavedExec = MRI.createVirtualRegister(WaveRC);
119
120 // To insert the loop we need to split the block. Move everything before
121 // this point to a new block, and insert a new empty block before this
122 // instruction.
125 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
126 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
128 ++MBBI;
129 MF.insert(MBBI, LoopBB);
130 MF.insert(MBBI, BodyBB);
131 MF.insert(MBBI, RestoreExecBB);
132 MF.insert(MBBI, RemainderBB);
133
134 LoopBB->addSuccessor(BodyBB);
135 BodyBB->addSuccessor(RestoreExecBB);
136 BodyBB->addSuccessor(LoopBB);
137
138 // Move the rest of the block into a new block.
140 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
141
142 MBB.addSuccessor(LoopBB);
143 RestoreExecBB->addSuccessor(RemainderBB);
144
145 B.setInsertPt(*LoopBB, LoopBB->end());
146
147 // +-MBB:------------+
148 // | ... |
149 // | %0 = G_INST_1 |
150 // | %Dst = MI %Vgpr |
151 // | %1 = G_INST_2 |
152 // | ... |
153 // +-----------------+
154 // ->
155 // +-MBB-------------------------------+
156 // | ... |
157 // | %0 = G_INST_1 |
158 // | %SaveExecReg = S_MOV_B32 $exec_lo |
159 // +----------------|------------------+
160 // | /------------------------------|
161 // V V |
162 // +-LoopBB---------------------------------------------------------------+ |
163 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
164 // | instead of executing for each lane, see if other lanes had | |
165 // | same value for %Vgpr and execute for them also. | |
166 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
167 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
168 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
169 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
170 // +----------------|-----------------------------------------------------+ |
171 // V |
172 // +-BodyBB------------------------------------------------------------+ |
173 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
174 // | executed only for active lanes and written to Dst | |
175 // | $exec = S_XOR_B32 $exec, %SavedExec | |
176 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
177 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
178 // | SI_WATERFALL_LOOP LoopBB |-----|
179 // +----------------|--------------------------------------------------+
180 // V
181 // +-RestoreExecBB--------------------------+
182 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
183 // +----------------|-----------------------+
184 // V
185 // +-RemainderBB:----------------------+
186 // | %1 = G_INST_2 |
187 // | ... |
188 // +---------------------------------- +
189
190 // Move the instruction into the loop body. Note we moved everything after
191 // Range.end() already into a new block, so Range.end() is no longer valid.
192 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
193
194 // Figure out the iterator range after splicing the instructions.
195 MachineBasicBlock::iterator NewBegin = BeginIt;
196 auto NewEnd = BodyBB->end();
197 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
198
// From here on new instructions go into LoopBB. CondReg accumulates the AND of
// all per-part compares across all waterfalled operands.
199 B.setMBB(*LoopBB);
200 Register CondReg;
201
202 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
203 for (MachineOperand &Op : MI.all_uses()) {
204 Register OldReg = Op.getReg();
// Only operands recorded in WFI need the readfirstlane treatment.
205 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
206 continue;
207
208 // See if we already processed this register in another instruction in
209 // the sequence.
210 auto OldVal = WaterfalledRegMap.find(OldReg);
211 if (OldVal != WaterfalledRegMap.end()) {
212 Op.setReg(OldVal->second);
213 continue;
214 }
215
216 Register OpReg = Op.getReg();
217 LLT OpTy = MRI.getType(OpReg);
218
219 // TODO: support for agpr
220 assert(MRI.getRegBank(OpReg) == VgprRB);
221 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
222 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
223
224 // Build the comparison(s), CurrentLaneReg == OpReg.
225 unsigned OpSize = OpTy.getSizeInBits();
// Compare in 64-bit parts when the size is a multiple of 64, else 32-bit.
226 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
227 LLT PartTy = LLT::scalar(PartSize);
228 unsigned NumParts = OpSize / PartSize;
230 SmallVector<Register, 8> CurrentLaneParts;
231
232 if (NumParts == 1) {
233 OpParts.push_back(OpReg);
234 CurrentLaneParts.push_back(CurrentLaneReg);
235 } else {
236 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
237 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
238 for (unsigned i = 0; i < NumParts; ++i) {
239 OpParts.push_back(UnmergeOp.getReg(i));
240 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
241 }
242 }
243
244 for (unsigned i = 0; i < NumParts; ++i) {
245 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
246 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
247
// The first compare seeds CondReg; later ones are ANDed in.
248 if (!CondReg)
249 CondReg = CmpReg;
250 else
251 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
252 }
253
254 Op.setReg(CurrentLaneReg);
255
256 // Make sure we don't re-process this register again.
257 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
258 }
259 }
260
261 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
262 Register CondRegLM =
263 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
264 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
265
266 // Update EXEC, save the original EXEC value to SavedExec.
267 B.buildInstr(LMC.AndSaveExecOpc)
268 .addDef(SavedExec)
269 .addReg(CondRegLM, RegState::Kill);
270 MRI.setSimpleHint(SavedExec, CondRegLM);
271
272 B.setInsertPt(*BodyBB, BodyBB->end());
273
274 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
275 B.buildInstr(LMC.XorTermOpc)
276 .addDef(LMC.ExecReg)
277 .addReg(LMC.ExecReg)
278 .addReg(SavedExec);
279
280 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
281 // s_cbranch_scc0?
282
283 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
284 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
285
286 // Save the EXEC mask before the loop.
287 B.setInsertPt(MBB, MBB.end());
288 B.buildInstr(LMC.MovOpc).addDef(SaveExecReg).addReg(LMC.ExecReg);
289
290 // Restore the EXEC mask after the loop.
291 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
292 B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
293
294 // Set the insert point after the original instruction, so any new
295 // instructions will be in the remainder.
296 B.setInsertPt(*RemainderBB, RemainderBB->begin());
297
298 return true;
299}
300
301bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
302 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
303 MachineFunction &MF = B.getMF();
304 assert(MI.getNumMemOperands() == 1);
305 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
306 Register Dst = MI.getOperand(0).getReg();
307 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
308 Register Base = MI.getOperand(1).getReg();
309 LLT PtrTy = MRI.getType(Base);
310 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
311 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
312 SmallVector<Register, 4> LoadPartRegs;
313
314 unsigned ByteOffset = 0;
315 for (LLT PartTy : LLTBreakdown) {
316 Register BasePlusOffset;
317 if (ByteOffset == 0) {
318 BasePlusOffset = Base;
319 } else {
320 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
321 BasePlusOffset =
322 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
323 }
324 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
325 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
326 LoadPartRegs.push_back(LoadPart.getReg(0));
327 ByteOffset += PartTy.getSizeInBytes();
328 }
329
330 if (!MergeTy.isValid()) {
331 // Loads are of same size, concat or merge them together.
332 B.buildMergeLikeInstr(Dst, LoadPartRegs);
333 } else {
334 // Loads are not all of same size, need to unmerge them to smaller pieces
335 // of MergeTy type, then merge pieces to Dst.
336 SmallVector<Register, 4> MergeTyParts;
337 for (Register Reg : LoadPartRegs) {
338 if (MRI.getType(Reg) == MergeTy) {
339 MergeTyParts.push_back(Reg);
340 } else {
341 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
342 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
343 MergeTyParts.push_back(Unmerge.getReg(i));
344 }
345 }
346 B.buildMergeLikeInstr(Dst, MergeTyParts);
347 }
348 MI.eraseFromParent();
349 return true;
350}
351
352bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
353 LLT MergeTy) {
354 MachineFunction &MF = B.getMF();
355 assert(MI.getNumMemOperands() == 1);
356 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
357 Register Dst = MI.getOperand(0).getReg();
358 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
359 Register Base = MI.getOperand(1).getReg();
360
361 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
362 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
363
364 if (WideTy.isScalar()) {
365 B.buildTrunc(Dst, WideLoad);
366 } else {
367 SmallVector<Register, 4> MergeTyParts;
368 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
369
370 LLT DstTy = MRI.getType(Dst);
371 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
372 for (unsigned i = 0; i < NumElts; ++i) {
373 MergeTyParts.push_back(Unmerge.getReg(i));
374 }
375 B.buildMergeLikeInstr(Dst, MergeTyParts);
376 }
377 MI.eraseFromParent();
378 return true;
379}
380
381bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
382 Register Dst = MI.getDstReg();
383 Register Ptr = MI.getPointerReg();
384 MachineMemOperand &MMO = MI.getMMO();
385 unsigned MemSize = 8 * MMO.getSize().getValue();
386
387 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
388
389 if (MI.getOpcode() == G_LOAD) {
390 B.buildLoad(Dst, Ptr, *WideMMO);
391 } else {
392 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
393
394 if (MI.getOpcode() == G_ZEXTLOAD) {
395 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
396 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
397 B.buildAnd(Dst, Load, MaskCst);
398 } else {
399 assert(MI.getOpcode() == G_SEXTLOAD);
400 B.buildSExtInReg(Dst, Load, MemSize);
401 }
402 }
403
404 MI.eraseFromParent();
405 return true;
406}
407
408bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
409 Register Dst = MI.getOperand(0).getReg();
410 LLT Ty = MRI.getType(Dst);
411 Register Src = MI.getOperand(1).getReg();
412 unsigned Opc = MI.getOpcode();
413 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
414 if (Ty == S32 || Ty == S16) {
415 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
416 auto False = B.buildConstant({VgprRB, Ty}, 0);
417 B.buildSelect(Dst, Src, True, False);
418 } else if (Ty == S64) {
419 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
420 auto False = B.buildConstant({VgprRB_S32}, 0);
421 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
422 MachineInstrBuilder Hi;
423 switch (Opc) {
424 case G_SEXT:
425 Hi = Lo;
426 break;
427 case G_ZEXT:
428 Hi = False;
429 break;
430 case G_ANYEXT:
431 Hi = B.buildUndef({VgprRB_S32});
432 break;
433 default:
435 MF, MORE, "amdgpu-regbanklegalize",
436 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
437 return false;
438 }
439
440 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
441 } else {
443 MF, MORE, "amdgpu-regbanklegalize",
444 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
445 return false;
446 }
447
448 MI.eraseFromParent();
449 return true;
450}
451
452std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
453 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
454 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
455 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
456 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
457 return {Lo.getReg(0), Hi.getReg(0)};
458}
459
460std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
461 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
462 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
463 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
464 return {Lo.getReg(0), Hi.getReg(0)};
465}
466
467std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
468 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
469 auto Lo = PackedS32;
470 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
471 return {Lo.getReg(0), Hi.getReg(0)};
472}
473
474std::pair<Register, Register>
475RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
476 auto [Lo32, Hi32] = unpackAExt(Reg);
477 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
478 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
479}
480
481bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
482 Register Lo, Hi;
483 switch (MI.getOpcode()) {
484 case AMDGPU::G_SHL: {
485 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
486 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
487 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
488 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
489 break;
490 }
491 case AMDGPU::G_LSHR: {
492 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
493 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
494 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
495 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
496 break;
497 }
498 case AMDGPU::G_ASHR: {
499 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
500 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
501 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
502 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
503 break;
504 }
505 default:
507 MF, MORE, "amdgpu-regbanklegalize",
508 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
509 MI);
510 return false;
511 }
512 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
513 MI.eraseFromParent();
514 return true;
515}
516
517bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
518 Register Lo, Hi;
519 switch (MI.getOpcode()) {
520 case AMDGPU::G_SMIN:
521 case AMDGPU::G_SMAX: {
522 // For signed operations, use sign extension
523 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
524 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
525 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
526 .getReg(0);
527 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
528 .getReg(0);
529 break;
530 }
531 case AMDGPU::G_UMIN:
532 case AMDGPU::G_UMAX: {
533 // For unsigned operations, use zero extension
534 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
535 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
536 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
537 .getReg(0);
538 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
539 .getReg(0);
540 break;
541 }
542 default:
544 MF, MORE, "amdgpu-regbanklegalize",
545 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
546 return false;
547 }
548 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
549 MI.eraseFromParent();
550 return true;
551}
552
553bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
554 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
555 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
556 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
557 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
558 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
559 {ResLo.getReg(0), ResHi.getReg(0)});
560 MI.eraseFromParent();
561 return true;
562}
563
// Tail of the helper that the BFE lowerings call as isSignedBFE(MI).
// NOTE(review): the signature and the condition guarding the first return are
// not part of this listing; presumably GI is MI viewed as an intrinsic call --
// confirm against the full file.
// First return: true when GI is the amdgcn_sbfe intrinsic.
566 return (GI->is(Intrinsic::amdgcn_sbfe));
567
// Second return: true when MI's opcode is G_SBFX.
568 return MI.getOpcode() == AMDGPU::G_SBFX;
569}
570
571bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
572 Register Dst = MI.getOperand(0).getReg();
573 assert(MRI.getType(Dst) == LLT::scalar(64));
574 bool Signed = isSignedBFE(MI);
575 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
576 // Extract bitfield from Src, LSBit is the least-significant bit for the
577 // extraction (field offset) and Width is size of bitfield.
578 Register Src = MI.getOperand(FirstOpnd).getReg();
579 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
580 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
581 // Comments are for signed bitfield extract, similar for unsigned. x is sign
582 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
583
584 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
585 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
586 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
587
588 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
589
590 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
591 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
592 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
593 if (!ConstWidth) {
594 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
595 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
596 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
597 MI.eraseFromParent();
598 return true;
599 }
600
601 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
602 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
603 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
604 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
605 auto Zero = B.buildConstant({VgprRB, S32}, 0);
606 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
607
608 if (WidthImm <= 32) {
609 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
610 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
611 MachineInstrBuilder Hi;
612 if (Signed) {
613 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
614 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
615 } else {
616 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
617 Hi = Zero;
618 }
619 B.buildMergeLikeInstr(Dst, {Lo, Hi});
620 } else {
621 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
622 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
623 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
624 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
625 }
626
627 MI.eraseFromParent();
628 return true;
629}
630
631bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
632 Register DstReg = MI.getOperand(0).getReg();
633 LLT Ty = MRI.getType(DstReg);
634 bool Signed = isSignedBFE(MI);
635 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
636 Register Src = MI.getOperand(FirstOpnd).getReg();
637 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
638 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
639 // For uniform bit field extract there are 4 available instructions, but
640 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
641 // field offset in low and size in high 16 bits.
642
643 // Src1 Hi16|Lo16 = Size|FieldOffset
644 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
645 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
646 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
647 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
648 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
649 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
650 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
651
652 // Select machine instruction, because of reg class constraining, insert
653 // copies from reg class to reg bank.
654 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
655 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
656 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
657 *ST.getRegisterInfo(), RBI);
658
659 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
660 MI.eraseFromParent();
661 return true;
662}
663
664bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
665 Register Dst = MI.getOperand(0).getReg();
666 LLT DstTy = MRI.getType(Dst);
667 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
668 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
669 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
670 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
671 unsigned Opc = MI.getOpcode();
672 auto Flags = MI.getFlags();
673 auto Lo =
674 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
675 auto Hi =
676 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
677 B.buildMergeLikeInstr(Dst, {Lo, Hi});
678 MI.eraseFromParent();
679 return true;
680}
681
682bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
683 Register Dst = MI.getOperand(0).getReg();
684 assert(MRI.getType(Dst) == S64);
685 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
686 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
687
688 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
689 // match GlobalISel with old regbankselect.
690 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
691 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
692 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
693 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
694 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
695 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
696
697 B.buildMergeLikeInstr(Dst, {Lo, Hi});
698 MI.eraseFromParent();
699 return true;
700}
701
702bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
703 Register Dst = MI.getOperand(0).getReg();
704 assert(MRI.getType(Dst) == V2S16);
705 unsigned Opc = MI.getOpcode();
706 unsigned NumOps = MI.getNumOperands();
707 auto Flags = MI.getFlags();
708
709 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
710
711 if (NumOps == 2) {
712 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
713 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
714 B.buildMergeLikeInstr(Dst, {Lo, Hi});
715 MI.eraseFromParent();
716 return true;
717 }
718
719 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
720
721 if (NumOps == 3) {
722 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
723 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
724 B.buildMergeLikeInstr(Dst, {Lo, Hi});
725 MI.eraseFromParent();
726 return true;
727 }
728
729 assert(NumOps == 4);
730 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
731 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
732 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
733 B.buildMergeLikeInstr(Dst, {Lo, Hi});
734 MI.eraseFromParent();
735 return true;
736}
737
738bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
739 Register Dst0 = MI.getOperand(0).getReg();
740 Register Dst1 = MI.getOperand(1).getReg();
741 Register Src0 = MI.getOperand(2).getReg();
742 Register Src1 = MI.getOperand(3).getReg();
743 Register Src2 = MI.getOperand(4).getReg();
744
745 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
746
747 // Keep the multiplication on the SALU.
748 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
749 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
750 if (ST.hasScalarMulHiInsts()) {
751 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
752 } else {
753 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
754 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
755 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
756 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
757 }
758
759 // Accumulate and produce the "carry-out" bit.
760
761 // The "carry-out" is defined as bit 64 of the result when computed as a
762 // big integer. For unsigned multiply-add, this matches the usual
763 // definition of carry-out.
764 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
765 // No accumulate: result is just the multiplication, carry is 0.
766 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
767 B.buildConstant(Dst1, 0);
768 } else {
769 // Accumulate: add Src2 to the multiplication result with carry chain.
770 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
771 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
772 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
773
774 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
775 auto AddHi =
776 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
777 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
778 B.buildCopy(Dst1, AddHi.getReg(1));
779 }
780
781 MI.eraseFromParent();
782 return true;
783}
784
785bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
786 Register Dst = MI.getOperand(0).getReg();
787 LLT DstTy = MRI.getType(Dst);
788 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
789 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
790 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
791 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
792 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
793 Register Cond = MI.getOperand(1).getReg();
794 auto Flags = MI.getFlags();
795 auto Lo =
796 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
797 auto Hi =
798 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
799
800 B.buildMergeLikeInstr(Dst, {Lo, Hi});
801 MI.eraseFromParent();
802 return true;
803}
804
805bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
806 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
807 int Amt = MI.getOperand(2).getImm();
808 Register Lo, Hi;
809 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
810 if (Amt <= 32) {
811 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
812 if (Amt == 32) {
813 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
814 Lo = Freeze.getReg(0);
815 } else {
816 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
817 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
818 }
819
820 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
821 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
822 } else {
823 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
824 Lo = Op1.getReg(0);
825 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
826 }
827
828 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
829 MI.eraseFromParent();
830 return true;
831}
832
833bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
834 // Split 64-bit find-first-bit operations into 32-bit halves:
835 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
836 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
837 // (ctlz_zero_poison hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
838 // (cttz_zero_poison hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
839 unsigned Opc = MI.getOpcode();
840
841 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
842 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_poison), so plain add
843 // is fine.
844 unsigned FFBOpc;
845 unsigned AddOpc;
846 bool SearchFromMSB;
847 switch (Opc) {
848 case AMDGPU::G_AMDGPU_FFBH_U32:
849 FFBOpc = Opc;
850 AddOpc = AMDGPU::G_UADDSAT;
851 SearchFromMSB = true;
852 break;
853 case AMDGPU::G_AMDGPU_FFBL_B32:
854 FFBOpc = Opc;
855 AddOpc = AMDGPU::G_UADDSAT;
856 SearchFromMSB = false;
857 break;
858 case AMDGPU::G_CTLZ_ZERO_POISON:
859 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
860 AddOpc = AMDGPU::G_ADD;
861 SearchFromMSB = true;
862 break;
863 case AMDGPU::G_CTTZ_ZERO_POISON:
864 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
865 AddOpc = AMDGPU::G_ADD;
866 SearchFromMSB = false;
867 break;
868 default:
869 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
870 }
871
872 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
873 Register Lo = Unmerge.getReg(0);
874 Register Hi = Unmerge.getReg(1);
875
876 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
877 // lo first. The secondary half adds 32 to account for the primary half's
878 // width.
879 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
880 auto Secondary =
881 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
882
883 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
884 {Secondary, B.buildConstant(VgprRB_S32, 32)});
885 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
886
887 MI.eraseFromParent();
888 return true;
889}
890
/// Lowers an extract-vector-element instruction (operand 0 = destination,
/// operand 1 = source vector, operand 2 = index) into a chain of
/// compare+select instructions over the unmerged vector elements.
/// 32-bit elements are selected directly; 64-bit elements are split into
/// 32-bit lo/hi halves that are selected separately and merged again. Any
/// other element size takes the failure path and returns false. On success
/// MI is erased and true is returned.
891bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
892 // Lower extract vector element to a compare-select chain:
893 // result = elt[0]
894 // for i in 1..N-1:
895 // result = (idx == i) ? elt[i] : result
896 //
897 // When the index is divergent, each lane may want a different element, so
898 // we must check every element per lane.
899 Register Dst = MI.getOperand(0).getReg();
900 Register Src = MI.getOperand(1).getReg();
901 Register Idx = MI.getOperand(2).getReg();
902
903 LLT VecTy = MRI.getType(Src);
904 LLT ScalarTy = VecTy.getScalarType();
905 unsigned NumElts = VecTy.getNumElements();
906 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
907
// Split the source vector into one VGPR value per element.
908 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
909
910 if (ScalarTy.getSizeInBits() == 32) {
// Element 0 is the initial result; each later element replaces it when the
// index compares equal to that element's position.
911 Register PrevSelect = Unmerge.getReg(0);
912 for (unsigned I = 1; I < NumElts; ++I) {
913 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
914 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
915 PrevSelect =
916 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(I), PrevSelect)
917 .getReg(0);
918 }
919 B.buildCopy(Dst, PrevSelect);
920 } else if (ScalarTy.getSizeInBits() == 64) {
// Same chain as above, but carried out on the 32-bit lo and hi halves of
// each element, sharing one compare per element.
921 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
922 Register PrevLo = InitUnmerge.getReg(0);
923 Register PrevHi = InitUnmerge.getReg(1);
924 for (unsigned I = 1; I < NumElts; ++I) {
925 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
926 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
927 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(I));
928 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
929 .getReg(0);
930 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
931 .getReg(0);
932 }
933 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
934 } else {
// NOTE(review): the head of the call that takes the arguments below is not
// present in this listing (embedded line 935 is absent); presumably
// `reportGISelFailure(` as used elsewhere in this file - confirm against the
// original source.
936 MF, MORE, "amdgpu-regbanklegalize",
937 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
938 return false;
939 }
940
941 MI.eraseFromParent();
942 return true;
943}
944
945bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
946 // Reduce a 64-bit element extract to two 32-bit extracts:
947 // vec32 = bitcast <N x s64> to <2N x s32>
948 // lo = vec32[idx * 2]
949 // hi = vec32[idx * 2 + 1]
950 // result = merge(lo, hi)
951 //
952 // When the index is uniform, all lanes extract the same element, so we can
953 // just split the s64 extract into two s32 extracts which lower to MOVREL.
954 Register Dst = MI.getOperand(0).getReg();
955 Register Src = MI.getOperand(1).getReg();
956 Register Idx = MI.getOperand(2).getReg();
957
958 LLT SrcTy = MRI.getType(Src);
959 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
960
961 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
962 "expected VGPR src and SGPR idx");
963
964 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
965
966 // Calculate new Lo and Hi indices
967 auto One = B.buildConstant(SgprRB_S32, 1);
968 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
969 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
970
971 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
972 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
973
974 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
975
976 MI.eraseFromParent();
977 return true;
978}
979
/// Lowers an insert-vector-element instruction (operand 0 = destination,
/// operand 1 = source vector, operand 2 = element, operand 3 = index) into
/// one compare+select per vector element, then rebuilds the vector.
/// A non-SGPR source with 64-bit elements is processed as 32-bit lo/hi
/// halves; 32-bit elements and SGPR 64-bit elements are selected at their
/// native width. Any other element size takes the failure path and returns
/// false. On success MI is erased and true is returned.
980bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &MI) {
981 // Lower insert vector element to a compare-select chain:
982 // for i in 0..N-1:
983 // result[i] = (idx == i) ? elt : srcVec[i]
984 // dst = merge(result[0..N-1])
985 //
986 // VGPR B64 requires splitting to lo/hi s32 pairs since there is no
987 // v_cndmask_b64. SGPR B64/B32 and VGPR B32 can be handled natively.
988 Register Dst = MI.getOperand(0).getReg();
989 Register Src = MI.getOperand(1).getReg();
990 Register Elt = MI.getOperand(2).getReg();
991 Register Idx = MI.getOperand(3).getReg();
992
993 LLT VecTy = MRI.getType(Src);
994 LLT ScalarTy = VecTy.getScalarType();
995 unsigned NumElts = VecTy.getNumElements();
996 const RegisterBank *SrcRB = MRI.getRegBank(Src);
997 bool IsSGPR = (SrcRB == SgprRB);
// Collects the per-element (or per-half) select results in element order.
998 SmallVector<Register, 16> Selects;
999
1000 if (!IsSGPR && ScalarTy.getSizeInBits() == 64) {
1001 // VGPR B64: split to 32-bit lo/hi since there is no v_cndmask_b64.
// Unmerging the source to s32 yields 2 * NumElts pieces; pieces 2*I and
// 2*I+1 are the lo and hi halves of element I.
1002 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1003 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1004 Register EltLo = EltUnmerge.getReg(0);
1005 Register EltHi = EltUnmerge.getReg(1);
1006 for (unsigned I = 0; I < NumElts; ++I) {
// NOTE(review): the index constant is created in the VGPR bank here, while
// the native-width branch below creates it in the SGPR bank - confirm this
// difference is intentional.
1007 auto IdxConst = B.buildConstant(VgprRB_S32, I);
1008 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1009 Selects.push_back(
1010 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 * I))
1011 .getReg(0));
1012 Selects.push_back(
1013 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 * I + 1))
1014 .getReg(0));
1015 }
// Reassemble as <2N x s32> and bitcast back to the destination type.
1016 LLT Vec32Ty = LLT::fixed_vector(2 * NumElts, 32);
1017 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1018 B.buildBitcast(Dst, Vec32);
1019 } else if (ScalarTy.getSizeInBits() == 32 || ScalarTy.getSizeInBits() == 64) {
1020 // B32 (any bank) and SGPR B64: element-wise select at native width.
1021 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
// The compare result is an s32 in the SGPR bank for an SGPR source and an
// s1 in the VCC bank otherwise.
1022 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1023 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1024 for (unsigned I = 0; I < NumElts; ++I) {
1025 auto IdxConst = B.buildConstant(SgprRB_S32, I);
1026 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CmpTy, Idx, IdxConst);
1027 Selects.push_back(
1028 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(I)).getReg(0));
1029 }
1030 B.buildMergeLikeInstr(Dst, Selects);
1031 } else {
// NOTE(review): the head of the call that takes the arguments below is not
// present in this listing (embedded line 1032 is absent); presumably
// `reportGISelFailure(` as used elsewhere in this file - confirm against the
// original source.
1033 MF, MORE, "amdgpu-regbanklegalize",
1034 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type", MI);
1035 return false;
1036 }
1037
1038 MI.eraseFromParent();
1039 return true;
1040}
1041
1042bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &MI) {
1043 // Reduce a 64-bit element insert to two 32-bit inserts:
1044 // vec32 = bitcast <N x s64> to <2N x s32>
1045 // lo, hi = unmerge elt
1046 // vec32[idx * 2] = lo
1047 // vec32[idx * 2 + 1] = hi
1048 // dst = bitcast <2N x s32> to <N x s64>
1049 //
1050 // When the index is uniform, all lanes insert at the same position, so we
1051 // can split the s64 insert into two s32 inserts which lower to MOVREL/GPRIDX.
1052 Register Dst = MI.getOperand(0).getReg();
1053 Register Src = MI.getOperand(1).getReg();
1054 Register Elt = MI.getOperand(2).getReg();
1055 Register Idx = MI.getOperand(3).getReg();
1056
1057 LLT SrcTy = MRI.getType(Src);
1058 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
1059
1060 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1061 "expected VGPR src and SGPR idx");
1062
1063 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1064
1065 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1066 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1067
1068 // Calculate new Lo and Hi indices
1069 auto One = B.buildConstant(SgprRB_S32, 1);
1070 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1071 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1072
1073 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1074 EltUnmerge.getReg(0), IdxLo);
1075 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1076 EltUnmerge.getReg(1), IdxHi);
1077
1078 B.buildBitcast(Dst, InsHi);
1079
1080 MI.eraseFromParent();
1081 return true;
1082}
1083
1084bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &MI) {
1085 // Lower divergent G_ABS to smax(x, 0 - x) in the VGPR bank:
1086 // zero = 0
1087 // neg = G_SUB zero, x
1088 // dst = G_SMAX x, neg
1089 //
1090 // There is no integer v_abs instruction on AMDGPU, so divergent G_ABS is
1091 // expanded to this sub/smax pair.
1092 Register DstReg = MI.getOperand(0).getReg();
1093 Register SrcReg = MI.getOperand(1).getReg();
1094 LLT Ty = MRI.getType(DstReg);
1095
1096 Register Zero;
1097 if (Ty == V2S16) {
1098 // buildConstant cannot produce a V2S16 directly; pack two S16 zeros.
1099 Register Zero16 = B.buildConstant({VgprRB, S16}, 0).getReg(0);
1100 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).getReg(0);
1101 } else {
1102 assert((Ty == S32 || Ty == S16) && "unexpected type for AbsToNegMax");
1103 Zero = B.buildConstant({VgprRB, Ty}, 0).getReg(0);
1104 }
1105
1106 auto Neg = B.buildSub({VgprRB, Ty}, Zero, SrcReg);
1107 B.buildSMax(DstReg, SrcReg, Neg);
1108 MI.eraseFromParent();
1109 return true;
1110}
1111
1112bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &MI) {
1113 // Lower uniform V2S16 abs by unpacking the values to two separate SGPR
1114 // registers and re-emitting G_ABS on each:
1115 // packed = bitcast <2 x s16> src to s32
1116 // lo = sext_inreg packed, 16
1117 // hi = ashr packed, 16
1118 // dst = build_vector_trunc G_ABS(lo), G_ABS(hi)
1119 //
1120 // SALU only has s_abs_i32, with no direct uniform V2S16 abs. The
1121 // re-emitted G_ABS(SgprRB, S32) selects to s_abs_i32 on each value.
1122 auto Bitcast = B.buildBitcast({SgprRB_S32}, MI.getOperand(1).getReg());
1123 auto SextInReg = B.buildSExtInReg({SgprRB_S32}, Bitcast, 16);
1124 auto ShiftHi =
1125 B.buildAShr({SgprRB_S32}, Bitcast, B.buildConstant({SgprRB_S32}, 16));
1126
1127 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1128 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1129 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
1130 {AbsLo.getReg(0), AbsHi.getReg(0)});
1131
1132 MI.eraseFromParent();
1133 return true;
1134}
1135
/// Applies the lowering selected by Mapping.LoweringMethod to MI.
/// Most cases either delegate to a dedicated lower*/splitLoad/widenLoad
/// helper or build the replacement inline and erase MI. Returns false after
/// reporting a failure for an unsupported opcode or type, and true otherwise
/// (cases that `break` out of the switch reach the trailing `return true`).
/// WFI is not referenced by any of the cases visible in this body.
1136bool RegBankLegalizeHelper::lower(MachineInstr &MI,
1137 const RegBankLLTMapping &Mapping,
1138 WaterfallInfo &WFI) {
1139
1140 switch (Mapping.LoweringMethod) {
1141 case DoNotLower:
1142 break;
1143 case VccExtToSel:
1144 return lowerVccExtToSel(MI);
1145 case UniExtToSel: {
1146 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
// The "true" value is all-ones for G_SEXT and 1 for any other opcode.
1147 auto True = B.buildConstant({SgprRB, Ty},
1148 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1149 auto False = B.buildConstant({SgprRB, Ty}, 0);
1150 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
1151 // We are making select here. S1 cond was already 'any-extended to S32' +
1152 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
1153 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
1154 False);
1155 MI.eraseFromParent();
1156 return true;
1157 }
1158 case UnpackBitShift:
1159 return lowerUnpackBitShift(MI);
1160 case UnpackMinMax:
1161 return lowerUnpackMinMax(MI);
1162 case ScalarizeToS16:
1163 return lowerSplitTo16(MI);
1164 case Ext32To64: {
// Build only the high 32 bits; the low half is the original source operand,
// and both are merged into the destination below.
1165 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1166 MachineInstrBuilder Hi;
1167 switch (MI.getOpcode()) {
1168 case AMDGPU::G_ZEXT: {
1169 Hi = B.buildConstant({RB, S32}, 0);
1170 break;
1171 }
1172 case AMDGPU::G_SEXT: {
1173 // Replicate sign bit from 32-bit extended part.
1174 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1175 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
1176 break;
1177 }
1178 case AMDGPU::G_ANYEXT: {
1179 Hi = B.buildUndef({RB, S32});
1180 break;
1181 }
1182 default:
1183 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1184 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1185 MI);
1186 return false;
1187 }
1188
1189 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
1190 {MI.getOperand(1).getReg(), Hi});
1191 MI.eraseFromParent();
1192 return true;
1193 }
1194 case UniCstExt: {
// Re-emit the constant into the destination register using the
// zero-extended value of the original immediate.
1195 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
1196 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
1197
1198 MI.eraseFromParent();
1199 return true;
1200 }
1201 case VgprToVccCopy: {
1202 Register Src = MI.getOperand(1).getReg();
1203 LLT Ty = MRI.getType(Src);
1204 // Take lowest bit from each lane and put it in lane mask.
1205 // Lowering via compare, but we need to clean high bits first as compare
1206 // compares all bits in register.
1207 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1208 if (Ty == S64) {
// Mask the two 32-bit halves separately: lo & 1, hi & 0.
1209 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1210 auto One = B.buildConstant(VgprRB_S32, 1);
1211 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1212 auto Zero = B.buildConstant(VgprRB_S32, 0);
1213 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1214 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1215 } else {
1216 assert(Ty == S32 || Ty == S16);
1217 auto One = B.buildConstant({VgprRB, Ty}, 1);
1218 B.buildAnd(BoolSrc, Src, One);
1219 }
1220 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1221 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
1222 MI.eraseFromParent();
1223 return true;
1224 }
1225 case V_BFE:
1226 return lowerV_BFE(MI);
1227 case S_BFE:
1228 return lowerS_BFE(MI);
1229 case UniMAD64:
1230 return lowerUniMAD64(MI);
1231 case UniMul64: {
1232 B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
1233 MI.eraseFromParent();
1234 return true;
1235 }
1236 case DivSMulToMAD: {
// Truncate both multiplicands to 32 bits and emit a 64-bit multiply-add
// with a zero addend; the unsigned or signed MAD form is chosen from MI's
// opcode.
1237 auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
1238 auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
1239 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1240
1241 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1242 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1243 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1244
1245 B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
1246 {Op1, Op2, Zero});
1247 MI.eraseFromParent();
1248 return true;
1249 }
1250 case SplitTo32:
1251 return lowerSplitTo32(MI);
1252 case SplitTo32Mul:
1253 return lowerSplitTo32Mul(MI);
1254 case SplitTo32Select:
1255 return lowerSplitTo32Select(MI);
1256 case SplitTo32SExtInReg:
1257 return lowerSplitTo32SExtInReg(MI);
1258 case CtPop64To32: {
// NOTE(review): the remaining argument(s) and the closing of the buildAdd
// call in this case are not present in this listing (embedded line 1264 is
// absent) - restore them from the original source.
1259 auto Unmerge = B.buildUnmerge({VgprRB, S32}, MI.getOperand(1).getReg());
1260 auto LoPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(0));
1261 auto HiPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(1));
1262 // Max popcount of two 32-bit values is 64, so this add cannot overflow.
1263 B.buildAdd(MI.getOperand(0).getReg(), LoPopCnt, HiPopCnt,
1265
1266 MI.eraseFromParent();
// Leaves the switch; success is reported by the trailing `return true`.
1267 break;
1268 }
1269 case SplitLoad: {
1270 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1271 unsigned Size = DstTy.getSizeInBits();
1272 // Even split to 128-bit loads
1273 if (Size > 128) {
// Pick a 128-bit piece type: a vector of the same element type for vector
// destinations, otherwise a 128-bit scalar.
1274 LLT B128;
1275 if (DstTy.isVector()) {
1276 LLT EltTy = DstTy.getElementType();
1277 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1278 } else {
1279 B128 = LLT::scalar(128);
1280 }
1281 if (Size / 128 == 2)
1282 splitLoad(MI, {B128, B128});
1283 else if (Size / 128 == 4)
1284 splitLoad(MI, {B128, B128, B128, B128});
1285 else {
1286 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1287 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1288 MI);
1289 return false;
1290 }
1291 }
1292 // 64 and 32 bit load
1293 else if (DstTy == S96)
1294 splitLoad(MI, {S64, S32}, S32);
1295 else if (DstTy == V3S32)
1296 splitLoad(MI, {V2S32, S32}, S32);
1297 else if (DstTy == V6S16)
1298 splitLoad(MI, {V4S16, V2S16}, V2S16);
1299 else {
1300 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1301 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1302 MI);
1303 return false;
1304 }
1305 return true;
1306 }
1307 case WidenLoad: {
// Only the three 96-bit destination types are widened (to 128 bits).
1308 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1309 if (DstTy == S96)
1310 widenLoad(MI, S128);
1311 else if (DstTy == V3S32)
1312 widenLoad(MI, V4S32, S32);
1313 else if (DstTy == V6S16)
1314 widenLoad(MI, V8S16, V2S16);
1315 else {
1316 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1317 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1318 MI);
1319 return false;
1320 }
1321 return true;
1322 }
1323 case UnpackAExt:
1324 return lowerUnpackAExt(MI);
1325 case WidenMMOToS32:
1326 return widenMMOToS32(cast<GAnyLoad>(MI));
1327 case VerifyAllSgpr: {
// Verification only: nothing is rewritten, the assert checks every operand's
// bank.
1328 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1329 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1330 }));
1331 return true;
1332 }
1333 case ApplyAllVgpr: {
1334 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1335 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1336 }));
1337 B.setInstrAndDebugLoc(MI);
// Walk the operands after the defs and route every register operand that is
// not already in the VGPR bank through a copy into a new VGPR register.
1338 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1339 MachineOperand &Op = MI.getOperand(i);
1340 if (!Op.isReg())
1341 continue;
1342 Register Reg = Op.getReg();
1343 if (MRI.getRegBank(Reg) != VgprRB) {
1344 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1345 Op.setReg(Copy.getReg(0));
1346 }
1347 }
1348 return true;
1349 }
1350 case UnmergeToShiftTrunc: {
// NOTE(review): the dyn_cast result is dereferenced without a null check;
// presumably this method is only attached to unmerge instructions - confirm,
// or consider cast<> which asserts on mismatch.
1351 GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
1352 LLT Ty = MRI.getType(Unmerge->getSourceReg());
1353 if (Ty.getSizeInBits() % 32 != 0) {
1354 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1355 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1356 MI);
1357 return false;
1358 }
1359
1360 B.setInstrAndDebugLoc(MI);
1361 if (Ty.getSizeInBits() > 32) {
// Wider source: first unmerge into V2S16 pieces, then unpack each piece
// into two S32 values and truncate those into consecutive pairs of MI's
// original destinations.
1362 auto UnmergeV2S16 =
1363 B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
1364 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1365 auto [Dst0S32, Dst1S32] =
1366 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1367 B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
1368 B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1369 }
1370 } else {
// 32-bit source: operands 0 and 1 are the two destinations and operand 2 is
// the source.
1371 auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
1372 B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
1373 B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
1374 }
1375
1376 MI.eraseFromParent();
1377 return true;
1378 }
// NOTE(review): the `case ...: {` label that opens the following block is not
// present in this listing (embedded line 1379 is absent) - restore it from
// the original source.
// The block gives MI a new S32 SGPR destination, truncates it back into the
// original destination after the PHIs of MI's block, and any-extends each
// incoming value to S32 right after its definition.
1380 Register Dst = MI.getOperand(0).getReg();
1381 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1382 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1383 MI.getOperand(0).setReg(NewDst);
1384 B.buildTrunc(Dst, NewDst);
1385
// Visits operands 1, 3, 5, ...; the operands in between are skipped.
1386 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1387 Register UseReg = MI.getOperand(i).getReg();
1388
1389 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1390 MachineBasicBlock *DefMBB = DefMI->getParent();
1391
// Insert directly after the defining instruction, but past any PHIs and
// labels that follow it.
1392 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1393
1394 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1395 MI.getOperand(i).setReg(NewUse.getReg(0));
1396 }
1397 break;
1398 }
1399 case VerifyAllSgprGPHI: {
// Verification only: basic-block operands are accepted, every other operand
// must be in the SGPR bank.
1400 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1401 if (Op.isMBB())
1402 return true;
1403 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1404 }));
1405 return true;
1406 }
// NOTE(review): the `case ...: {` label that opens the following block is not
// present in this listing (embedded line 1407 is absent) - restore it from
// the original source.
// Verification only: the destination must be VGPR; other non-block operands
// may be VGPR or SGPR.
1408 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1409 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1410 if (Op.isMBB())
1411 return true;
1412 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1413 return RB == VgprRB || RB == SgprRB;
1414 }));
1415 return true;
1416 }
1417 case ApplyINTRIN_IMAGE: {
// NOTE(review): the initializer expression of RSrcIntrin is not present in
// this listing (embedded line 1419 is absent) - restore it from the original
// source.
1418 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1420 assert(RSrcIntrin && RSrcIntrin->IsImage);
1421 // The reported argument index is relative to the IR intrinsic call
1422 // arguments, so shift by the number of defs and the intrinsic ID.
1423 unsigned RsrcIdx = RSrcIntrin->RsrcArg + MI.getNumExplicitDefs() + 1;
1424 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1425 }
// NOTE(review): the `case ...: {` label that opens the following block is not
// present in this listing (embedded line 1426 is absent) - restore it from
// the original source.
1427 // Rsrc is the last register operand. Base BVH trails an A16 immediate
1428 // after rsrc; dual/BVH8 do not. Scan backwards for the last virtual
1429 // register.
1430 unsigned RsrcIdx = MI.getNumOperands();
1431 while (RsrcIdx-- > MI.getNumExplicitDefs()) {
1432 const MachineOperand &Op = MI.getOperand(RsrcIdx);
1433 if (Op.isReg() && Op.getReg().isVirtual())
1434 break;
1435 }
1436 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1437 }
// NOTE(review): the `case ...:` label for the following return is not present
// in this listing (embedded line 1438 is absent) - restore it from the
// original source.
1439 return lowerSplitBitCount64To32(MI);
1440 case ExtrVecEltToSel:
1441 return lowerExtrVecEltToSel(MI);
1442 case ExtrVecEltTo32:
1443 return lowerExtrVecEltTo32(MI);
1444 case InsVecEltToSel:
1445 return lowerInsVecEltToSel(MI);
1446 case InsVecEltTo32:
1447 return lowerInsVecEltTo32(MI);
1448 case AbsToNegMax:
1449 return lowerAbsToNegMax(MI);
1450 case AbsToS32:
1451 return lowerAbsToS32(MI);
1452 }
1453
// Reached by the cases that `break` out of the switch.
1454 return true;
1455}
1456
/// Maps a mapping-apply ID that names one exact type to that LLT: a scalar
/// of 1/16/32/64/128 bits, a pointer with a fixed address space and width,
/// or a fixed vector. IDs without a case here return a default-constructed
/// LLT.
1457LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1458 switch (ID) {
1459 case Vcc:
1460 case UniInVcc:
1461 return LLT::scalar(1);
1462 case Sgpr16:
1463 case Vgpr16:
1464 case UniInVgprS16:
1465 return LLT::scalar(16);
1466 case Sgpr32:
1467 case Sgpr32_WF:
1468 case Sgpr32Trunc:
1469 case Sgpr32AExt:
// NOTE(review): embedded line 1470 is absent from this listing; a case label
// may be missing here - confirm against the original source.
1471 case Sgpr32SExt:
1472 case Sgpr32ZExt:
1473 case UniInVgprS32:
1474 case Sgpr32ToVgprDst:
1475 case Vgpr32:
1476 case Vgpr32AExt:
1477 case Vgpr32SExt:
1478 case Vgpr32ZExt:
1479 return LLT::scalar(32);
1480 case Sgpr64:
1481 case Vgpr64:
1482 case UniInVgprS64:
1483 case Sgpr64ToVgprDst:
1484 return LLT::scalar(64);
1485 case Sgpr128:
1486 case Vgpr128:
1487 return LLT::scalar(128);
// Pointer IDs: the digit in the name is the address space passed to
// LLT::pointer, followed by the pointer width in bits.
1488 case SgprP0:
1489 case SgprP0Call_WF:
1490 case VgprP0:
1491 return LLT::pointer(0, 64);
1492 case SgprP1:
1493 case VgprP1:
1494 return LLT::pointer(1, 64);
1495 case SgprP2:
1496 case VgprP2:
1497 return LLT::pointer(2, 32);
1498 case SgprP3:
1499 case VgprP3:
1500 return LLT::pointer(3, 32);
1501 case SgprP4:
1502 case SgprP4Call_WF:
1503 case VgprP4:
1504 return LLT::pointer(4, 64);
1505 case SgprP5:
1506 case VgprP5:
1507 return LLT::pointer(5, 32);
1508 case SgprP8:
1509 return LLT::pointer(8, 128);
1510 case SgprV2S16:
1511 case VgprV2S16:
1512 case UniInVgprV2S16:
1513 return LLT::fixed_vector(2, 16);
1514 case SgprV2S32:
1515 case VgprV2S32:
1516 case UniInVgprV2S32:
1517 return LLT::fixed_vector(2, 32);
1518 case VgprV3S32:
1519 return LLT::fixed_vector(3, 32);
1520 case VgprV4S16:
1521 return LLT::fixed_vector(4, 16);
1522 case SgprV4S32:
1523 case SgprV4S32_WF:
// NOTE(review): embedded line 1524 is absent from this listing; a case label
// may be missing here - confirm against the original source.
1525 case VgprV4S32:
1526 case UniInVgprV4S32:
1527 return LLT::fixed_vector(4, 32);
1528 case VgprV8S32:
1529 return LLT::fixed_vector(8, 32);
1530 case VgprV2S64:
1531 case UniInVgprV2S64:
1532 return LLT::fixed_vector(2, 64);
1533 case VgprV6S32:
1534 return LLT::fixed_vector(6, 32);
1535 case VgprV32S16:
1536 return LLT::fixed_vector(32, 16);
1537 case VgprV32S32:
1538 return LLT::fixed_vector(32, 32);
1539 default:
// No exact type is associated with this ID.
1540 return LLT();
1541 }
1542}
1543
/// Checks Ty against the set of types accepted by a size-based ("B"),
/// pointer-size ("Ptr") or register-class ("BRC") mapping-apply ID.
/// Returns Ty itself when it is accepted and a default-constructed LLT when
/// it is not, or when ID has no case here.
1544LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1545 switch (ID) {
1546 case SgprB32:
1547 case VgprB32:
1548 case SgprB32_M0:
// NOTE(review): embedded line 1549 is absent from this listing; a case label
// may be missing here - confirm against the original source.
1550 case UniInVgprB32:
1551 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
1552 isAnyPtr(Ty, 32))
1553 return Ty;
1554 return LLT();
1555 case SgprPtr32:
1556 case VgprPtr32:
1557 return isAnyPtr(Ty, 32) ? Ty : LLT();
1558 case SgprPtr64:
1559 case VgprPtr64:
1560 return isAnyPtr(Ty, 64) ? Ty : LLT();
1561 case SgprPtr128:
1562 case VgprPtr128:
1563 return isAnyPtr(Ty, 128) ? Ty : LLT();
1564 case SgprB64:
1565 case VgprB64:
// NOTE(review): embedded line 1566 is absent from this listing; a case label
// may be missing here - confirm against the original source.
1567 case UniInVgprB64:
1568 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
1569 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
1570 return Ty;
1571 return LLT();
1572 case SgprB96:
1573 case VgprB96:
1574 case UniInVgprB96:
1575 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
1576 Ty == LLT::fixed_vector(6, 16))
1577 return Ty;
1578 return LLT();
1579 case SgprB128:
1580 case VgprB128:
1581 case UniInVgprB128:
1582 if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
1583 Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
1584 isAnyPtr(Ty, 128))
1585 return Ty;
1586 return LLT();
1587 case VgprB160:
1588 case UniInVgprB160:
// 160-bit IDs accept any type of that total size, not a fixed list.
1589 if (Ty.getSizeInBits() == 160)
1590 return Ty;
1591 return LLT();
1592 case SgprB256:
1593 case VgprB256:
1594 case UniInVgprB256:
1595 if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
1596 Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
1597 return Ty;
1598 return LLT();
1599 case SgprB512:
1600 case VgprB512:
1601 case UniInVgprB512:
1602 if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
1603 Ty == LLT::fixed_vector(8, 64))
1604 return Ty;
1605 return LLT();
1606 case SgprBRC: {
// Accept any type of at least 32 bits for which an SGPR register class of
// matching width is reported.
1607 const SIRegisterInfo *TRI =
1608 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1609 unsigned LLTSize = Ty.getSizeInBits();
1610 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
1611 return Ty;
1612 return LLT();
1613 }
1614 case VgprBRC: {
// NOTE(review): this VGPR case queries getSGPRClassForBitWidth, the same
// lookup as SgprBRC above, and has no minimum-size check - confirm whether a
// VGPR class query was intended.
1615 const SIRegisterInfo *TRI =
1616 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1617 if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
1618 return Ty;
1619 return LLT();
1620 }
1621 default:
1622 return LLT();
1623 }
1624}
1625
/// Maps a mapping-apply ID to the register bank it names: VccRB, SgprRB,
/// AgprRB or VgprRB. IDs without a case here return nullptr.
/// The UniInVcc/UniInVgpr* IDs are grouped with the SGPR IDs, and the
/// Sgpr32ToVgprDst/Sgpr64ToVgprDst IDs are grouped with the VGPR IDs.
1626const RegisterBank *
1627RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1628 switch (ID) {
1629 case Vcc:
1630 return VccRB;
1631 case Sgpr16:
1632 case Sgpr32:
1633 case Sgpr32_WF:
1634 case Sgpr64:
1635 case Sgpr128:
1636 case SgprP0:
1637 case SgprP0Call_WF:
1638 case SgprP1:
1639 case SgprP2:
1640 case SgprP3:
1641 case SgprP4:
1642 case SgprP4Call_WF:
1643 case SgprP5:
1644 case SgprP8:
1645 case SgprPtr32:
1646 case SgprPtr64:
1647 case SgprPtr128:
1648 case SgprV2S16:
1649 case SgprV2S32:
1650 case SgprV4S32:
1651 case SgprV4S32_WF:
// NOTE(review): embedded line 1652 is absent from this listing; a case label
// may be missing here - confirm against the original source.
1653 case SgprB32:
1654 case SgprB64:
1655 case SgprB96:
1656 case SgprB128:
1657 case SgprB256:
1658 case SgprB512:
1659 case SgprBRC:
1660 case UniInVcc:
1661 case UniInVgprS16:
1662 case UniInVgprS32:
1663 case UniInVgprS64:
1664 case UniInVgprV2S16:
1665 case UniInVgprV2S32:
1666 case UniInVgprV4S32:
1667 case UniInVgprV2S64:
1668 case UniInVgprB32:
1669 case UniInVgprB64:
1670 case UniInVgprB96:
1671 case UniInVgprB128:
1672 case UniInVgprB160:
1673 case UniInVgprB256:
1674 case UniInVgprB512:
1675 case Sgpr32Trunc:
1676 case Sgpr32AExt:
// NOTE(review): embedded line 1677 is absent from this listing; a case label
// may be missing here - confirm against the original source.
1678 case Sgpr32SExt:
1679 case Sgpr32ZExt:
1680 return SgprRB;
1681 case AgprAnyTy:
1682 return AgprRB;
1683 case Vgpr16:
1684 case Vgpr32:
1685 case Vgpr64:
1686 case Vgpr128:
1687 case VgprP0:
1688 case VgprP1:
1689 case VgprP2:
1690 case VgprP3:
1691 case VgprP4:
1692 case VgprP5:
1693 case VgprPtr32:
1694 case VgprPtr64:
1695 case VgprPtr128:
1696 case VgprV2S16:
1697 case VgprV2S32:
1698 case VgprV2S64:
1699 case VgprV3S32:
1700 case VgprV4S16:
1701 case VgprV4S32:
1702 case VgprV6S32:
1703 case VgprV8S32:
// NOTE(review): VgprV32S32 has a case in getTyFromID but none in this switch,
// so it yields nullptr here - confirm this is intended.
1704 case VgprV32S16:
1705 case VgprB32:
1706 case VgprB64:
1707 case VgprB96:
1708 case VgprB128:
1709 case VgprB160:
1710 case VgprB256:
1711 case VgprB512:
1712 case VgprBRC:
1713 case VgprAnyTy:
1714 case Vgpr32AExt:
1715 case Vgpr32SExt:
1716 case Vgpr32ZExt:
1717 case Sgpr32ToVgprDst:
1718 case Sgpr64ToVgprDst:
1719 return VgprRB;
1720 default:
// No register bank is associated with this ID.
1721 return nullptr;
1722 }
1723}
1724
/// Applies the per-operand mapping IDs in MethodIDs to the destination
/// operands of MI, starting at operand index OpIdx.
/// OpIdx is taken by reference and advanced past the processed operands, so
/// the caller sees the index of the first operand not covered by MethodIDs.
/// For most IDs this only asserts the operand's type and register bank. The
/// AgprAnyTy, UniIn*, Sgpr32Trunc and Sgpr*ToVgprDst IDs replace the
/// destination with a new register and build the instructions that write
/// the original register from it.
/// Returns false for InvalidMapping or an unsupported ID, true otherwise.
1725bool RegBankLegalizeHelper::applyMappingDst(
1726 MachineInstr &MI, unsigned &OpIdx,
1727 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1728 // Defs start from operand 0
1729 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1730 if (MethodIDs[OpIdx] == None)
1731 continue;
1732 MachineOperand &Op = MI.getOperand(OpIdx);
1733 Register Reg = Op.getReg();
1734 LLT Ty = MRI.getType(Reg);
// RB is read only by asserts in several cases, hence [[maybe_unused]].
1735 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1736
1737 switch (MethodIDs[OpIdx]) {
1738 // vcc, sgpr and vgpr scalars, pointers and vectors
1739 case Vcc:
1740 case Sgpr16:
1741 case Sgpr32:
1742 case Sgpr64:
1743 case Sgpr128:
1744 case SgprP0:
1745 case SgprP1:
1746 case SgprP3:
1747 case SgprP4:
1748 case SgprP5:
1749 case SgprP8:
1750 case SgprV2S16:
1751 case SgprV2S32:
1752 case SgprV4S32:
1753 case Vgpr16:
1754 case Vgpr32:
1755 case Vgpr64:
1756 case Vgpr128:
1757 case VgprP0:
1758 case VgprP1:
1759 case VgprP2:
1760 case VgprP3:
1761 case VgprP4:
1762 case VgprP5:
1763 case VgprV2S16:
1764 case VgprV2S32:
1765 case VgprV2S64:
1766 case VgprV3S32:
1767 case VgprV4S16:
1768 case VgprV4S32:
1769 case VgprV6S32:
1770 case VgprV8S32:
1771 case VgprV32S16: {
// Exact-type IDs: nothing to rewrite, only verify type and bank.
1772 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1773 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1774 break;
1775 }
1776 // sgpr and vgpr B-types
1777 case SgprB32:
1778 case SgprB64:
1779 case SgprB96:
1780 case SgprB128:
1781 case SgprB256:
1782 case SgprB512:
1783 case SgprBRC:
1784 case SgprPtr32:
1785 case SgprPtr64:
1786 case SgprPtr128:
1787 case VgprB32:
1788 case VgprB64:
1789 case VgprB96:
1790 case VgprB128:
1791 case VgprB160:
1792 case VgprB256:
1793 case VgprB512:
1794 case VgprBRC:
1795 case VgprPtr32:
1796 case VgprPtr64:
1797 case VgprPtr128: {
1798 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1799 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1800 break;
1801 }
1802 case VgprAnyTy: {
1803 assert(RB == VgprRB);
1804 break;
1805 }
1806 case AgprAnyTy: {
// Already in the AGPR bank: nothing to do. Otherwise define a new AGPR
// register and, if the old register has non-debug uses, copy into it.
1807 if (RB == AgprRB)
1808 break;
1809 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
1810 Op.setReg(NewAgprDst);
1811 if (!MRI.use_nodbg_empty(Reg))
1812 B.buildCopy(Reg, NewAgprDst);
1813 break;
1814 }
1815 // uniform in vcc/vgpr: scalars, vectors and B-types
1816 case UniInVcc: {
1817 assert(Ty == S1);
1818 assert(RB == SgprRB);
// MI now defines a VCC-bank S1; when the original register has uses it is
// recreated from that via G_AMDGPU_COPY_SCC_VCC followed by a truncate.
1819 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1820 Op.setReg(NewDst);
1821 if (!MRI.use_empty(Reg)) {
1822 auto CopyS32_Vcc =
1823 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1824 B.buildTrunc(Reg, CopyS32_Vcc);
1825 }
1826 break;
1827 }
1828 case UniInVgprS16: {
1829 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1830 assert(RB == SgprRB);
// MI now defines a VGPR S16, which is any-extended to S32, read back into
// an SGPR S32 with buildReadAnyLane, and truncated into the original
// register.
1831 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1832 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1833 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1834 Op.setReg(NewVgprDstS16);
1835 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1836 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1837 B.buildTrunc(Reg, NewSgprDstS32);
1838 break;
1839 }
1840 case UniInVgprS32:
1841 case UniInVgprS64:
1842 case UniInVgprV2S16:
1843 case UniInVgprV2S32:
1844 case UniInVgprV4S32:
1845 case UniInVgprV2S64: {
1846 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1847 assert(RB == SgprRB);
// MI now defines a VGPR register of the same type; the original SGPR
// register is written from it with buildReadAnyLane.
1848 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1849 Op.setReg(NewVgprDst);
1850 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1851 break;
1852 }
1853 case UniInVgprB32:
1854 case UniInVgprB64:
1855 case UniInVgprB96:
1856 case UniInVgprB128:
1857 case UniInVgprB160:
1858 case UniInVgprB256:
1859 case UniInVgprB512: {
// Same rewrite as the exact-type UniInVgpr cases, with the type checked
// through getBTyFromID instead.
1860 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1861 assert(RB == SgprRB);
1862 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1863 Op.setReg(NewVgprDst);
1864 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1865 break;
1866 }
1867 // sgpr trunc
1868 case Sgpr32Trunc: {
1869 assert(Ty.getSizeInBits() < 32);
1870 assert(RB == SgprRB);
// MI now defines an S32 SGPR; the narrower original register is produced by
// a truncate, which is only built when that register has uses.
1871 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1872 Op.setReg(NewDst);
1873 if (!MRI.use_empty(Reg))
1874 B.buildTrunc(Reg, NewDst);
1875 break;
1876 }
1877 case Sgpr32ToVgprDst:
1878 case Sgpr64ToVgprDst: {
1879 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1880 assert(RB == VgprRB);
// MI now defines an SGPR register of the same type, which is copied into
// the original VGPR register.
1881 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1882 B.buildCopy(Reg, Op.getReg());
1883 break;
1884 }
1885 case InvalidMapping: {
// NOTE(review): the head of the call that takes the arguments below is not
// present in this listing (embedded line 1886 is absent); presumably
// `reportGISelFailure(` - confirm against the original source.
1887 MF, MORE, "amdgpu-regbanklegalize",
1888 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
1889 return false;
1890 }
1891 default:
// NOTE(review): the head of the call that takes the arguments below is not
// present in this listing (embedded line 1892 is absent); presumably
// `reportGISelFailure(` - confirm against the original source.
1893 MF, MORE, "amdgpu-regbanklegalize",
1894 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
1895 return false;
1896 }
1897 }
1898
1899 return true;
1900}
1901
/// Applies the per-operand source mapping in \p MethodIDs to \p MI.
/// MethodIDs[i] describes the operand at index \p OpIdx; both advance together
/// and \p OpIdx is taken by reference, so the caller observes the final index.
/// Depending on the ID, an operand is only checked with asserts, rewritten
/// through a copy / extension / read-first-lane, or recorded in \p WFI so it
/// can be handled by a waterfall loop.
/// \returns true when every ID was handled, false when an ID is not supported.
1902bool RegBankLegalizeHelper::applyMappingSrc(
1903 MachineInstr &MI, unsigned &OpIdx,
1904 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1905 WaterfallInfo &WFI) {
1906 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
// These IDs need no rewriting; Op.getReg() is never called for them.
1907 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1908 continue;
1909
1910 MachineOperand &Op = MI.getOperand(OpIdx);
1911 Register Reg = Op.getReg();
1912 LLT Ty = MRI.getType(Reg);
1913 const RegisterBank *RB = MRI.getRegBank(Reg);
1914
1915 switch (MethodIDs[i]) {
1916 case Vcc: {
1917 assert(Ty == S1);
1918 assert(RB == VccRB || RB == SgprRB);
// An S1 held in an sgpr is any-extended to S32 and converted to a vcc value
// with G_AMDGPU_COPY_VCC_SCC; an operand already in vcc is left untouched.
1919 if (RB == SgprRB) {
1920 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1921 auto CopyVcc_Scc =
1922 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1923 Op.setReg(CopyVcc_Scc.getReg(0));
1924 }
1925 break;
1926 }
1927 // sgpr scalars, pointers and vectors
1928 case Sgpr16:
1929 case Sgpr32:
1930 case Sgpr64:
1931 case Sgpr128:
1932 case SgprP0:
1933 case SgprP1:
1934 case SgprP3:
1935 case SgprP4:
1936 case SgprP5:
1937 case SgprP8:
1938 case SgprV2S16:
1939 case SgprV2S32:
1940 case SgprV4S32: {
// Nothing is rewritten here; type and bank are only asserted.
1941 assert(Ty == getTyFromID(MethodIDs[i]));
1942 assert(RB == getRegBankFromID(MethodIDs[i]));
1943 break;
1944 }
1945 // sgpr B-types
1946 case SgprB32:
1947 case SgprB64:
1948 case SgprB96:
1949 case SgprB128:
1950 case SgprB256:
1951 case SgprB512:
1952 case SgprBRC:
1953 case SgprPtr32:
1954 case SgprPtr64:
1955 case SgprPtr128: {
// Same as above, but the expected type comes from getBTyFromID.
1956 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1957 assert(RB == getRegBankFromID(MethodIDs[i]));
1958 break;
1959 }
1960 // vgpr scalars, pointers and vectors
1961 case Vgpr16:
1962 case Vgpr32:
1963 case Vgpr64:
1964 case Vgpr128:
1965 case VgprP0:
1966 case VgprP1:
1967 case VgprP2:
1968 case VgprP3:
1969 case VgprP4:
1970 case VgprP5:
1971 case VgprV2S16:
1972 case VgprV2S32:
1973 case VgprV2S64:
1974 case VgprV3S32:
1975 case VgprV4S16:
1976 case VgprV4S32:
1977 case VgprV6S32:
1978 case VgprV8S32:
1979 case VgprV32S16:
1980 case VgprV32S32: {
1981 assert(Ty == getTyFromID(MethodIDs[i]));
// A source that is not yet in a vgpr is copied into a new vgpr of the same
// type and the operand is redirected to that copy.
1982 if (RB != VgprRB) {
1983 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1984 Op.setReg(CopyToVgpr.getReg(0));
1985 }
1986 break;
1987 }
1988 // vgpr B-types
1989 case VgprB32:
1990 case VgprB64:
1991 case VgprB96:
1992 case VgprB128:
1993 case VgprB160:
1994 case VgprB256:
1995 case VgprB512:
1996 case VgprBRC:
1997 case VgprPtr32:
1998 case VgprPtr64:
1999 case VgprPtr128: {
2000 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2001 if (RB != VgprRB) {
2002 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2003 Op.setReg(CopyToVgpr.getReg(0));
2004 }
2005 break;
2006 }
// The two *AnyTy cases perform the copy without asserting anything about Ty.
2007 case VgprAnyTy: {
2008 if (RB != VgprRB) {
2009 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2010 Op.setReg(CopyToVgpr.getReg(0));
2011 }
2012 break;
2013 }
2014 case AgprAnyTy: {
2015 if (RB != AgprRB) {
2016 auto CopyToAgpr = B.buildCopy({AgprRB, Ty}, Reg);
2017 Op.setReg(CopyToAgpr.getReg(0));
2018 }
2019 break;
2020 }
2021 // sgpr waterfall, scalars, and vectors
2022 case Sgpr32_WF:
2023 case SgprV4S32_WF: {
2024 assert(Ty == getTyFromID(MethodIDs[i]));
// The operand itself is not rewritten here. A non-sgpr register is recorded in
// WFI, and the range [Start, End) is set to just MI unless Start is already
// valid.
2025 if (RB != SgprRB) {
2026 WFI.SgprWaterfallOperandRegs.insert(Reg);
2027 if (!WFI.Start.isValid()) {
2028 WFI.Start = MI.getIterator();
2029 WFI.End = std::next(MI.getIterator());
2030 }
2031 }
2032 break;
2033 }
2034 case SgprP0Call_WF:
2035 case SgprP4Call_WF: {
2036 assert(Ty == getTyFromID(MethodIDs[i]));
2037 if (RB != SgprRB) {
2038 WFI.SgprWaterfallOperandRegs.insert(Reg);
2039
// NOTE(review): neither scan below checks for the block begin/end; they assume
// MI is bracketed by ADJCALLSTACKUP / ADJCALLSTACKDOWN in the same block.
// Confirm that this always holds for the call opcodes using these IDs.
2040 // Find the ADJCALLSTACKUP before the call.
2041 MachineBasicBlock::iterator Start = MI.getIterator();
2042 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2043 --Start;
2044
2045 // Find the ADJCALLSTACKDOWN after the call (include it in range).
2046 MachineBasicBlock::iterator End = MI.getIterator();
2047 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2048 ++End;
2049 ++End;
2050
// Unlike the Sgpr32_WF case, Start/End are overwritten unconditionally.
2051 WFI.Start = Start;
2052 WFI.End = End;
2053 }
2054 break;
2055 }
2056 case SgprB32_M0:
// NOTE(review): listing line 2057 is absent from this copy; presumably one
// more case label belongs between these two. Confirm against upstream.
2058 case SgprB64_ReadFirstLane: {
2059 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
// Already in an sgpr: nothing to do. Otherwise the value must be in a vgpr and
// is moved into a new sgpr with buildReadFirstLane.
2060 if (RB == SgprRB)
2061 break;
2062 assert(RB == VgprRB);
2063 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2064 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2065 Op.setReg(NewSGPR);
2066 break;
2067 }
// NOTE(review): listing line 2068 is absent from this copy; the statements
// below are presumably introduced by a `case ...: {` label for the IDs whose
// type is checked with getTyFromID. Confirm against upstream.
2069 assert(Ty == getTyFromID(MethodIDs[i]));
2070 if (RB == SgprRB)
2071 break;
2072 assert(RB == VgprRB);
2073 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2074 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2075 Op.setReg(NewSGPR);
2076 break;
2077 }
2078 // sgpr and vgpr scalars with extend
2079 case Sgpr32AExt: {
2080 // Note: this ext allows S1, and it is meant to be combined away.
2081 assert(Ty.getSizeInBits() < 32);
2082 assert(RB == SgprRB);
2083 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2084 Op.setReg(Aext.getReg(0));
2085 break;
2086 }
2087 case Sgpr32AExtBoolInReg: {
2088 // Note: this ext allows S1, and it is meant to be combined away.
2089 assert(Ty.getSizeInBits() == 1);
2090 assert(RB == SgprRB);
2091 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2092 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
2093 // most of times meant to be combined away in AMDGPURegBankCombiner.
2094 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2095 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2096 Op.setReg(BoolInReg.getReg(0));
2097 break;
2098 }
// The sgpr sign/zero extends below reject S1 (size must be strictly between 1
// and 32), whereas the vgpr variants further down accept any size below 32.
2099 case Sgpr32SExt: {
2100 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2101 assert(RB == SgprRB);
2102 auto Sext = B.buildSExt(SgprRB_S32, Reg);
2103 Op.setReg(Sext.getReg(0));
2104 break;
2105 }
2106 case Sgpr32ZExt: {
2107 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2108 assert(RB == SgprRB);
2109 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
2110 Op.setReg(Zext.getReg(0));
2111 break;
2112 }
2113 case Vgpr32AExt: {
2114 assert(Ty.getSizeInBits() < 32);
2115 assert(RB == VgprRB);
2116 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
2117 Op.setReg(Aext.getReg(0));
2118 break;
2119 }
2120 case Vgpr32SExt: {
2121 // Note this ext allows S1, and it is meant to be combined away.
2122 assert(Ty.getSizeInBits() < 32);
2123 assert(RB == VgprRB);
2124 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
2125 Op.setReg(Sext.getReg(0));
2126 break;
2127 }
2128 case Vgpr32ZExt: {
2129 // Note this ext allows S1, and it is meant to be combined away.
2130 assert(Ty.getSizeInBits() < 32);
2131 assert(RB == VgprRB);
2132 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
2133 Op.setReg(Zext.getReg(0));
2134 break;
2135 }
2136 default:
// NOTE(review): listing line 2137 is absent from this copy; the arguments below
// presumably belong to a reportGISelFailure( call, as at the other failure
// sites in this file. Confirm against upstream.
2138 MF, MORE, "amdgpu-regbanklegalize",
2139 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
2140 return false;
2141 }
2142 }
2143 return true;
2144}
2145
2146[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
2147 const RegisterBank *RB,
2149 unsigned StartOpIdx,
2150 unsigned EndOpIdx) {
2151 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2152 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
2153 return false;
2154 }
2155 return true;
2156}
2157
2158bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2159 MachineInstr &MI, unsigned RsrcIdx) {
2160 const unsigned NumDefs = MI.getNumExplicitDefs();
2161
2162 MachineBasicBlock *MBB = MI.getParent();
2163 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
2164
2165 // Defs are vgpr.
2166 for (unsigned i = 0; i < NumDefs; ++i) {
2167 Register Reg = MI.getOperand(i).getReg();
2168 if (MRI.getRegBank(Reg) == VgprRB)
2169 continue;
2170
2171 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
2172 MI.getOperand(i).setReg(NewVgprDst);
2173 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2174 }
2175
2176 B.setInstrAndDebugLoc(MI);
2177
2178 // Register uses before RsrcIdx are vgpr.
2179 for (unsigned i = NumDefs; i < RsrcIdx; ++i) {
2180 MachineOperand &Op = MI.getOperand(i);
2181 if (!Op.isReg())
2182 continue;
2183
2184 Register Reg = Op.getReg();
2185 if (!Reg.isVirtual())
2186 continue;
2187
2188 if (MRI.getRegBank(Reg) == VgprRB)
2189 continue;
2190
2191 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
2192 Op.setReg(Copy.getReg(0));
2193 }
2194
2195 SmallSet<Register, 4> OpsToWaterfall;
2196
2197 // Register use RsrcIdx (and later register operands) is sgpr.
2198 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
2199 MachineOperand &Op = MI.getOperand(i);
2200 if (!Op.isReg())
2201 continue;
2202
2203 Register Reg = Op.getReg();
2204 if (MRI.getRegBank(Reg) != SgprRB)
2205 OpsToWaterfall.insert(Reg);
2206 }
2207
2208 if (!OpsToWaterfall.empty()) {
2209 MachineBasicBlock::iterator MII = MI.getIterator();
2210 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
2211 }
2212
2213 return true;
2214}
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
@ ICMP_NE
not equal
Definition InstrTypes.h:762
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:85
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:239
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:558
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:156
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
Definition Utils.cpp:258
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs