LLVM 23.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
20#include "GCNSubtarget.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28
29#define DEBUG_TYPE "amdgpu-regbanklegalize"
30
31using namespace llvm;
32using namespace AMDGPU;
33
36 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
// NOTE(review): the first line(s) of this constructor signature are not
// visible in this listing (the embedded numbering skips 34-35).
// Member initializers: the function comes from the builder B, the subtarget
// from that function, and MRI from B as well.
37 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
38 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
39 RBLRules(RBLRules), IsWave32(ST.isWave32()),
// Cache pointers to the SGPR, VGPR, AGPR and VCC register banks looked up in
// RBI so the lowering code can refer to them directly.
40 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
41 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
42 AgprRB(&RBI.getRegBank(AMDGPU::AGPRRegBankID)),
43 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
44
// NOTE(review): the signature line of this function is not visible in this
// listing (the embedded numbering skips 45); the body below operates on a
// MachineInstr named MI and returns false on every failure path.
// Step 1: fetch the rule set registered for MI's opcode. Without one the
// failure is reported and legalization of MI is abandoned.
46 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
47 if (!RuleSet) {
48 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
49 "No AMDGPU RegBankLegalize rules defined for opcode",
50 MI);
51 return false;
52 }
53
// Step 2: pick the mapping within the rule set that matches MI.
54 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
55 if (!Mapping) {
56 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
57 "AMDGPU RegBankLegalize: none of the rules defined with "
58 "'Any' for MI's opcode matched MI",
59 MI);
60 return false;
61 }
62
// Step 3: apply the operand mappings. OpIdx is shared by both calls, so the
// source mapping continues from wherever the destination mapping left it.
// Destination fix-ups are inserted right after MI; source fix-ups are built
// with the builder positioned at MI.
63 WaterfallInfo WFI;
64 unsigned OpIdx = 0;
65 if (!Mapping->DstOpMapping.empty()) {
66 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
67 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
68 return false;
69 }
70 if (!Mapping->SrcOpMapping.empty()) {
71 B.setInstr(MI);
72 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
73 return false;
74 }
75
// Step 4: run the lowering selected by the mapping.
76 if (!lower(MI, *Mapping, WFI))
77 return false;
78
// Step 5: if any operands were recorded for waterfalling, wrap the recorded
// instruction range in a waterfall loop.
79 if (!WFI.SgprWaterfallOperandRegs.empty()) {
80 if (!executeInWaterfallLoop(B, WFI))
81 return false;
82 }
83
84 return true;
85}
86
/// Wrap a range of instructions in a waterfall loop.
///
/// The current block is split into MBB -> LoopBB -> BodyBB -> RestoreExecBB
/// -> RemainderBB (see the diagram below). LoopBB reads the first active
/// lane's value of every register listed in WFI.SgprWaterfallOperandRegs,
/// compares it against the per-lane value and restricts exec to the matching
/// lanes; BodyBB runs the moved instructions with the uniform values and loops
/// back until all lanes are done. On return the builder's insert point is at
/// the start of RemainderBB. Always returns true.
///
/// NOTE(review): BeginIt, EndIt, LMC, LoopBB, BodyBB, MBBI and OpParts are
/// used below but their declarations are not visible in this listing (the
/// embedded numbering skips 99-100, 104, 123-124, 127, 139 and 229).
87bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
88 const WaterfallInfo &WFI) {
89 assert(WFI.Start.isValid() && WFI.End.isValid() &&
90 "Waterfall range not initialized");
91
92 // Track use registers which have already been expanded with a readfirstlane
93 // sequence. This may have multiple uses if moving a sequence.
94 DenseMap<Register, Register> WaterfalledRegMap;
95
96 MachineBasicBlock &MBB = B.getMBB();
97 MachineFunction &MF = B.getMF();
98
101
102 const SIRegisterInfo *TRI = ST.getRegisterInfo();
103 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
105
// Only used by the assertion after the splice, hence guarded by NDEBUG.
106#ifndef NDEBUG
107 const int OrigRangeSize = std::distance(BeginIt, EndIt);
108#endif
109
110 MachineRegisterInfo &MRI = *B.getMRI();
111 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
112 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
113
114 // Don't bother using generic instructions/registers for the exec mask.
115 B.setInstr(*WFI.Start);
116 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
117
118 Register SavedExec = MRI.createVirtualRegister(WaveRC);
119
120 // To insert the loop we need to split the block. Move everything before
121 // this point to a new block, and insert a new empty block before this
122 // instruction.
125 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
126 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
128 ++MBBI;
129 MF.insert(MBBI, LoopBB);
130 MF.insert(MBBI, BodyBB);
131 MF.insert(MBBI, RestoreExecBB);
132 MF.insert(MBBI, RemainderBB);
133
// BodyBB either falls out to RestoreExecBB or branches back to LoopBB.
134 LoopBB->addSuccessor(BodyBB);
135 BodyBB->addSuccessor(RestoreExecBB);
136 BodyBB->addSuccessor(LoopBB);
137
138 // Move the rest of the block into a new block.
140 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
141
142 MBB.addSuccessor(LoopBB);
143 RestoreExecBB->addSuccessor(RemainderBB);
144
145 B.setInsertPt(*LoopBB, LoopBB->end());
146
147 // +-MBB:------------+
148 // | ... |
149 // | %0 = G_INST_1 |
150 // | %Dst = MI %Vgpr |
151 // | %1 = G_INST_2 |
152 // | ... |
153 // +-----------------+
154 // ->
155 // +-MBB-------------------------------+
156 // | ... |
157 // | %0 = G_INST_1 |
158 // | %SaveExecReg = S_MOV_B32 $exec_lo |
159 // +----------------|------------------+
160 // | /------------------------------|
161 // V V |
162 // +-LoopBB---------------------------------------------------------------+ |
163 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
164 // | instead of executing for each lane, see if other lanes had | |
165 // | same value for %Vgpr and execute for them also. | |
166 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
167 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
168 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
169 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
170 // +----------------|-----------------------------------------------------+ |
171 // V |
172 // +-BodyBB------------------------------------------------------------+ |
173 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
174 // | executed only for active lanes and written to Dst | |
175 // | $exec = S_XOR_B32 $exec, %SavedExec | |
176 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
177 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
178 // | SI_WATERFALL_LOOP LoopBB |-----|
179 // +----------------|--------------------------------------------------+
180 // V
181 // +-RestoreExecBB--------------------------+
182 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
183 // +----------------|-----------------------+
184 // V
185 // +-RemainderBB:----------------------+
186 // | %1 = G_INST_2 |
187 // | ... |
188 // +---------------------------------- +
189
190 // Move the instruction into the loop body. Note we moved everything after
191 // Range.end() already into a new block, so Range.end() is no longer valid.
192 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
193
194 // Figure out the iterator range after splicing the instructions.
195 MachineBasicBlock::iterator NewBegin = BeginIt;
196 auto NewEnd = BodyBB->end();
197 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
198
// The readfirstlane/compare code is emitted into LoopBB while the operands of
// the instructions now living in BodyBB are rewritten. CondReg is declared
// outside both loops, so it accumulates the AND of every comparison emitted
// for every waterfalled operand.
199 B.setMBB(*LoopBB);
200 Register CondReg;
201
202 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
203 for (MachineOperand &Op : MI.all_uses()) {
204 Register OldReg = Op.getReg();
205 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
206 continue;
207
208 // See if we already processed this register in another instruction in
209 // the sequence.
210 auto OldVal = WaterfalledRegMap.find(OldReg);
211 if (OldVal != WaterfalledRegMap.end()) {
212 Op.setReg(OldVal->second);
213 continue;
214 }
215
216 Register OpReg = Op.getReg();
217 LLT OpTy = MRI.getType(OpReg);
218
219 // TODO: support for agpr
220 assert(MRI.getRegBank(OpReg) == VgprRB);
221 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
222 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
223
224 // Build the comparison(s), CurrentLaneReg == OpReg.
// Compare in 64-bit parts when the operand size is a multiple of 64 bits,
// otherwise in 32-bit parts.
225 unsigned OpSize = OpTy.getSizeInBits();
226 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
227 LLT PartTy = LLT::scalar(PartSize);
228 unsigned NumParts = OpSize / PartSize;
230 SmallVector<Register, 8> CurrentLaneParts;
231
232 if (NumParts == 1) {
233 OpParts.push_back(OpReg);
234 CurrentLaneParts.push_back(CurrentLaneReg);
235 } else {
236 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
237 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
238 for (unsigned i = 0; i < NumParts; ++i) {
239 OpParts.push_back(UnmergeOp.getReg(i));
240 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
241 }
242 }
243
244 for (unsigned i = 0; i < NumParts; ++i) {
245 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
246 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
247
248 if (!CondReg)
249 CondReg = CmpReg;
250 else
251 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
252 }
253
254 Op.setReg(CurrentLaneReg);
255
256 // Make sure we don't re-process this register again.
257 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
258 }
259 }
260
261 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
262 Register CondRegLM =
263 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
264 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
265
266 // Update EXEC, save the original EXEC value to SavedExec.
267 B.buildInstr(LMC.AndSaveExecOpc)
268 .addDef(SavedExec)
269 .addReg(CondRegLM, RegState::Kill);
270 MRI.setSimpleHint(SavedExec, CondRegLM);
271
272 B.setInsertPt(*BodyBB, BodyBB->end());
273
274 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
275 B.buildInstr(LMC.XorTermOpc)
276 .addDef(LMC.ExecReg)
277 .addReg(LMC.ExecReg)
278 .addReg(SavedExec);
279
280 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
281 // s_cbranch_scc0?
282
283 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
284 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
285
286 // Save the EXEC mask before the loop.
287 B.setInsertPt(MBB, MBB.end());
288 B.buildInstr(LMC.MovOpc).addDef(SaveExecReg).addReg(LMC.ExecReg);
289
290 // Restore the EXEC mask after the loop.
291 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
292 B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
293
294 // Set the insert point after the original instruction, so any new
295 // instructions will be in the remainder.
296 B.setInsertPt(*RemainderBB, RemainderBB->begin());
297
298 return true;
299}
300
301bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
302 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
303 MachineFunction &MF = B.getMF();
304 assert(MI.getNumMemOperands() == 1);
305 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
306 Register Dst = MI.getOperand(0).getReg();
307 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
308 Register Base = MI.getOperand(1).getReg();
309 LLT PtrTy = MRI.getType(Base);
310 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
311 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
312 SmallVector<Register, 4> LoadPartRegs;
313
314 unsigned ByteOffset = 0;
315 for (LLT PartTy : LLTBreakdown) {
316 Register BasePlusOffset;
317 if (ByteOffset == 0) {
318 BasePlusOffset = Base;
319 } else {
320 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
321 BasePlusOffset =
322 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
323 }
324 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
325 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
326 LoadPartRegs.push_back(LoadPart.getReg(0));
327 ByteOffset += PartTy.getSizeInBytes();
328 }
329
330 if (!MergeTy.isValid()) {
331 // Loads are of same size, concat or merge them together.
332 B.buildMergeLikeInstr(Dst, LoadPartRegs);
333 } else {
334 // Loads are not all of same size, need to unmerge them to smaller pieces
335 // of MergeTy type, then merge pieces to Dst.
336 SmallVector<Register, 4> MergeTyParts;
337 for (Register Reg : LoadPartRegs) {
338 if (MRI.getType(Reg) == MergeTy) {
339 MergeTyParts.push_back(Reg);
340 } else {
341 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
342 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
343 MergeTyParts.push_back(Unmerge.getReg(i));
344 }
345 }
346 B.buildMergeLikeInstr(Dst, MergeTyParts);
347 }
348 MI.eraseFromParent();
349 return true;
350}
351
352bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
353 LLT MergeTy) {
354 MachineFunction &MF = B.getMF();
355 assert(MI.getNumMemOperands() == 1);
356 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
357 Register Dst = MI.getOperand(0).getReg();
358 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
359 Register Base = MI.getOperand(1).getReg();
360
361 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
362 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
363
364 if (WideTy.isScalar()) {
365 B.buildTrunc(Dst, WideLoad);
366 } else {
367 SmallVector<Register, 4> MergeTyParts;
368 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
369
370 LLT DstTy = MRI.getType(Dst);
371 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
372 for (unsigned i = 0; i < NumElts; ++i) {
373 MergeTyParts.push_back(Unmerge.getReg(i));
374 }
375 B.buildMergeLikeInstr(Dst, MergeTyParts);
376 }
377 MI.eraseFromParent();
378 return true;
379}
380
381bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
382 Register Dst = MI.getDstReg();
383 Register Ptr = MI.getPointerReg();
384 MachineMemOperand &MMO = MI.getMMO();
385 unsigned MemSize = 8 * MMO.getSize().getValue();
386
387 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
388
389 if (MI.getOpcode() == G_LOAD) {
390 B.buildLoad(Dst, Ptr, *WideMMO);
391 } else {
392 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
393
394 if (MI.getOpcode() == G_ZEXTLOAD) {
395 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
396 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
397 B.buildAnd(Dst, Load, MaskCst);
398 } else {
399 assert(MI.getOpcode() == G_SEXTLOAD);
400 B.buildSExtInReg(Dst, Load, MemSize);
401 }
402 }
403
404 MI.eraseFromParent();
405 return true;
406}
407
/// Lower an extension of the condition in operand 1 into a select between
/// two constants in the VGPR bank.
///
/// S16/S32 destinations get a single select. S64 destinations select the low
/// 32 bits and derive the high 32 bits from the opcode, then merge the two
/// halves. Any other destination type (or, for S64, any opcode other than
/// G_SEXT/G_ZEXT/G_ANYEXT) returns false. On success \p MI is erased and true
/// is returned.
408bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
409 Register Dst = MI.getOperand(0).getReg();
410 LLT Ty = MRI.getType(Dst);
411 Register Src = MI.getOperand(1).getReg();
412 unsigned Opc = MI.getOpcode();
// Value selected when the condition is true: -1 for G_SEXT, 1 for every
// other opcode.
413 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
414 if (Ty == S32 || Ty == S16) {
415 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
416 auto False = B.buildConstant({VgprRB, Ty}, 0);
417 B.buildSelect(Dst, Src, True, False);
418 } else if (Ty == S64) {
419 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
420 auto False = B.buildConstant({VgprRB_S32}, 0);
421 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
// High half: G_SEXT reuses the selected low half, G_ZEXT reuses the zero
// constant, G_ANYEXT leaves it undefined.
422 MachineInstrBuilder Hi;
423 switch (Opc) {
424 case G_SEXT:
425 Hi = Lo;
426 break;
427 case G_ZEXT:
428 Hi = False;
429 break;
430 case G_ANYEXT:
431 Hi = B.buildUndef({VgprRB_S32});
432 break;
433 default:
// NOTE(review): the line that opens this call is not visible in this listing
// (numbering skips 434); the arguments match the reportGISelFailure calls
// used elsewhere in this file.
435 MF, MORE, "amdgpu-regbanklegalize",
436 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
437 return false;
438 }
439
440 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
441 } else {
// NOTE(review): the line that opens this call is not visible in this listing
// (numbering skips 442).
443 MF, MORE, "amdgpu-regbanklegalize",
444 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
445 return false;
446 }
447
448 MI.eraseFromParent();
449 return true;
450}
451
452std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
453 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
454 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
455 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
456 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
457 return {Lo.getReg(0), Hi.getReg(0)};
458}
459
460std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
461 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
462 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
463 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
464 return {Lo.getReg(0), Hi.getReg(0)};
465}
466
467std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
468 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
469 auto Lo = PackedS32;
470 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
471 return {Lo.getReg(0), Hi.getReg(0)};
472}
473
474std::pair<Register, Register>
475RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
476 auto [Lo32, Hi32] = unpackAExt(Reg);
477 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
478 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
479}
480
/// Lower a packed shift by unpacking both operands into 32-bit SGPR halves,
/// shifting each half with the original opcode at S32 and rebuilding the
/// destination with a truncating build-vector.
///
/// The unpacking extension is chosen per opcode: G_SHL uses unpackAExt,
/// G_LSHR uses unpackZExt and G_ASHR uses unpackSExt. Other opcodes return
/// false. On success \p MI is erased and true is returned.
481bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
482 Register Lo, Hi;
483 switch (MI.getOpcode()) {
484 case AMDGPU::G_SHL: {
// NOTE(review): the low half returned by unpackAExt is the raw 32-bit
// bitcast, so Amt0 still carries the other half's amount in its upper 16
// bits - confirm this is acceptable for the 32-bit G_SHL.
485 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
486 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
487 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
488 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
489 break;
490 }
491 case AMDGPU::G_LSHR: {
492 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
493 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
494 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
495 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
496 break;
497 }
498 case AMDGPU::G_ASHR: {
499 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
500 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
501 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
502 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
503 break;
504 }
505 default:
// NOTE(review): the line that opens this call is not visible in this listing
// (numbering skips 506).
507 MF, MORE, "amdgpu-regbanklegalize",
508 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
509 MI);
510 return false;
511 }
// Truncate both 32-bit results and pack them into the destination.
512 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
513 MI.eraseFromParent();
514 return true;
515}
516
/// Lower a packed min/max by unpacking both operands into 32-bit SGPR halves,
/// applying the original opcode to each pair of halves at S32 and rebuilding
/// the destination with a truncating build-vector.
///
/// Signed opcodes (G_SMIN/G_SMAX) unpack with sign extension, unsigned ones
/// (G_UMIN/G_UMAX) with zero extension. Other opcodes return false. On
/// success \p MI is erased and true is returned.
517bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
518 Register Lo, Hi;
519 switch (MI.getOpcode()) {
520 case AMDGPU::G_SMIN:
521 case AMDGPU::G_SMAX: {
522 // For signed operations, use sign extension
523 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
524 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
525 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
526 .getReg(0);
527 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
528 .getReg(0);
529 break;
530 }
531 case AMDGPU::G_UMIN:
532 case AMDGPU::G_UMAX: {
533 // For unsigned operations, use zero extension
534 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
535 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
536 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
537 .getReg(0);
538 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
539 .getReg(0);
540 break;
541 }
542 default:
// NOTE(review): the line that opens this call is not visible in this listing
// (numbering skips 543).
544 MF, MORE, "amdgpu-regbanklegalize",
545 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
546 return false;
547 }
// Truncate both 32-bit results and pack them into the destination.
548 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
549 MI.eraseFromParent();
550 return true;
551}
552
553bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
554 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
555 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
556 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
557 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
558 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
559 {ResLo.getReg(0), ResHi.getReg(0)});
560 MI.eraseFromParent();
561 return true;
562}
563
// NOTE(review): the function header and the condition guarding the first
// return are not visible in this listing (numbering skips 564-565).
// With GI in scope, the answer is whether GI is the amdgcn_sbfe intrinsic.
566 return (GI->is(Intrinsic::amdgcn_sbfe));
567
// Otherwise the answer is whether MI's opcode is G_SBFX.
568 return MI.getOpcode() == AMDGPU::G_SBFX;
569}
570
571bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
572 Register Dst = MI.getOperand(0).getReg();
573 assert(MRI.getType(Dst) == LLT::scalar(64));
574 bool Signed = isSignedBFE(MI);
575 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
576 // Extract bitfield from Src, LSBit is the least-significant bit for the
577 // extraction (field offset) and Width is size of bitfield.
578 Register Src = MI.getOperand(FirstOpnd).getReg();
579 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
580 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
581 // Comments are for signed bitfield extract, similar for unsigned. x is sign
582 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
583
584 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
585 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
586 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
587
588 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
589
590 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
591 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
592 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
593 if (!ConstWidth) {
594 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
595 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
596 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
597 MI.eraseFromParent();
598 return true;
599 }
600
601 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
602 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
603 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
604 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
605 auto Zero = B.buildConstant({VgprRB, S32}, 0);
606 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
607
608 if (WidthImm <= 32) {
609 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
610 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
611 MachineInstrBuilder Hi;
612 if (Signed) {
613 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
614 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
615 } else {
616 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
617 Hi = Zero;
618 }
619 B.buildMergeLikeInstr(Dst, {Lo, Hi});
620 } else {
621 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
622 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
623 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
624 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
625 }
626
627 MI.eraseFromParent();
628 return true;
629}
630
631bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
632 Register DstReg = MI.getOperand(0).getReg();
633 LLT Ty = MRI.getType(DstReg);
634 bool Signed = isSignedBFE(MI);
635 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
636 Register Src = MI.getOperand(FirstOpnd).getReg();
637 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
638 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
639 // For uniform bit field extract there are 4 available instructions, but
640 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
641 // field offset in low and size in high 16 bits.
642
643 // Src1 Hi16|Lo16 = Size|FieldOffset
644 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
645 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
646 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
647 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
648 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
649 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
650 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
651
652 // Select machine instruction, because of reg class constraining, insert
653 // copies from reg class to reg bank.
654 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
655 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
656 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
657 *ST.getRegisterInfo(), RBI);
658
659 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
660 MI.eraseFromParent();
661 return true;
662}
663
664bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
665 Register Dst = MI.getOperand(0).getReg();
666 LLT DstTy = MRI.getType(Dst);
667 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
668 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
669 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
670 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
671 unsigned Opc = MI.getOpcode();
672 auto Flags = MI.getFlags();
673 auto Lo =
674 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
675 auto Hi =
676 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
677 B.buildMergeLikeInstr(Dst, {Lo, Hi});
678 MI.eraseFromParent();
679 return true;
680}
681
682bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
683 Register Dst = MI.getOperand(0).getReg();
684 assert(MRI.getType(Dst) == S64);
685 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
686 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
687
688 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
689 // match GlobalISel with old regbankselect.
690 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
691 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
692 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
693 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
694 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
695 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
696
697 B.buildMergeLikeInstr(Dst, {Lo, Hi});
698 MI.eraseFromParent();
699 return true;
700}
701
702bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
703 Register Dst = MI.getOperand(0).getReg();
704 assert(MRI.getType(Dst) == V2S16);
705 unsigned Opc = MI.getOpcode();
706 unsigned NumOps = MI.getNumOperands();
707 auto Flags = MI.getFlags();
708
709 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
710
711 if (NumOps == 2) {
712 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
713 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
714 B.buildMergeLikeInstr(Dst, {Lo, Hi});
715 MI.eraseFromParent();
716 return true;
717 }
718
719 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
720
721 if (NumOps == 3) {
722 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
723 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
724 B.buildMergeLikeInstr(Dst, {Lo, Hi});
725 MI.eraseFromParent();
726 return true;
727 }
728
729 assert(NumOps == 4);
730 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
731 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
732 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
733 B.buildMergeLikeInstr(Dst, {Lo, Hi});
734 MI.eraseFromParent();
735 return true;
736}
737
738bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
739 Register Dst0 = MI.getOperand(0).getReg();
740 Register Dst1 = MI.getOperand(1).getReg();
741 Register Src0 = MI.getOperand(2).getReg();
742 Register Src1 = MI.getOperand(3).getReg();
743 Register Src2 = MI.getOperand(4).getReg();
744
745 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
746
747 // Keep the multiplication on the SALU.
748 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
749 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
750 if (ST.hasScalarMulHiInsts()) {
751 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
752 } else {
753 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
754 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
755 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
756 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
757 }
758
759 // Accumulate and produce the "carry-out" bit.
760
761 // The "carry-out" is defined as bit 64 of the result when computed as a
762 // big integer. For unsigned multiply-add, this matches the usual
763 // definition of carry-out.
764 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
765 // No accumulate: result is just the multiplication, carry is 0.
766 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
767 B.buildConstant(Dst1, 0);
768 } else {
769 // Accumulate: add Src2 to the multiplication result with carry chain.
770 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
771 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
772 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
773
774 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
775 auto AddHi =
776 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
777 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
778 B.buildCopy(Dst1, AddHi.getReg(1));
779 }
780
781 MI.eraseFromParent();
782 return true;
783}
784
785bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
786 Register Dst = MI.getOperand(0).getReg();
787 LLT DstTy = MRI.getType(Dst);
788 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
789 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
790 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
791 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
792 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
793 Register Cond = MI.getOperand(1).getReg();
794 auto Flags = MI.getFlags();
795 auto Lo =
796 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
797 auto Hi =
798 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
799
800 B.buildMergeLikeInstr(Dst, {Lo, Hi});
801 MI.eraseFromParent();
802 return true;
803}
804
805bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
806 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
807 int Amt = MI.getOperand(2).getImm();
808 Register Lo, Hi;
809 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
810 if (Amt <= 32) {
811 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
812 if (Amt == 32) {
813 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
814 Lo = Freeze.getReg(0);
815 } else {
816 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
817 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
818 }
819
820 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
821 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
822 } else {
823 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
824 Lo = Op1.getReg(0);
825 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
826 }
827
828 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
829 MI.eraseFromParent();
830 return true;
831}
832
833bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
834 // Split 64-bit find-first-bit operations into 32-bit halves:
835 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
836 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
837 // (ctlz_zero_poison hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
838 // (cttz_zero_poison hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
839 unsigned Opc = MI.getOpcode();
840
841 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
842 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_poison), so plain add
843 // is fine.
844 unsigned FFBOpc;
845 unsigned AddOpc;
846 bool SearchFromMSB;
847 switch (Opc) {
848 case AMDGPU::G_AMDGPU_FFBH_U32:
849 FFBOpc = Opc;
850 AddOpc = AMDGPU::G_UADDSAT;
851 SearchFromMSB = true;
852 break;
853 case AMDGPU::G_AMDGPU_FFBL_B32:
854 FFBOpc = Opc;
855 AddOpc = AMDGPU::G_UADDSAT;
856 SearchFromMSB = false;
857 break;
858 case AMDGPU::G_CTLZ_ZERO_POISON:
859 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
860 AddOpc = AMDGPU::G_ADD;
861 SearchFromMSB = true;
862 break;
863 case AMDGPU::G_CTTZ_ZERO_POISON:
864 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
865 AddOpc = AMDGPU::G_ADD;
866 SearchFromMSB = false;
867 break;
868 default:
869 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
870 }
871
872 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
873 Register Lo = Unmerge.getReg(0);
874 Register Hi = Unmerge.getReg(1);
875
876 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
877 // lo first. The secondary half adds 32 to account for the primary half's
878 // width.
879 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
880 auto Secondary =
881 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
882
883 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
884 {Secondary, B.buildConstant(VgprRB_S32, 32)});
885 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
886
887 MI.eraseFromParent();
888 return true;
889}
890
891bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
892 // Lower extract vector element to a compare-select chain:
893 // result = elt[0]
894 // for i in 1..N-1:
895 // result = (idx == i) ? elt[i] : result
896 //
897 // When the index is divergent, each lane may want a different element, so
898 // we must check every element per lane.
899 Register Dst = MI.getOperand(0).getReg();
900 Register Src = MI.getOperand(1).getReg();
901 Register Idx = MI.getOperand(2).getReg();
902
903 LLT VecTy = MRI.getType(Src);
904 LLT ScalarTy = VecTy.getScalarType();
905 unsigned NumElts = VecTy.getNumElements();
906 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
907
908 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
909
910 if (ScalarTy.getSizeInBits() == 32) {
911 Register PrevSelect = Unmerge.getReg(0);
912 for (unsigned I = 1; I < NumElts; ++I) {
913 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
914 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
915 PrevSelect =
916 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(I), PrevSelect)
917 .getReg(0);
918 }
919 B.buildCopy(Dst, PrevSelect);
920 } else if (ScalarTy.getSizeInBits() == 64) {
921 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
922 Register PrevLo = InitUnmerge.getReg(0);
923 Register PrevHi = InitUnmerge.getReg(1);
924 for (unsigned I = 1; I < NumElts; ++I) {
925 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
926 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
927 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(I));
928 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
929 .getReg(0);
930 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
931 .getReg(0);
932 }
933 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
934 } else {
936 MF, MORE, "amdgpu-regbanklegalize",
937 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
938 return false;
939 }
940
941 MI.eraseFromParent();
942 return true;
943}
944
945bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
946 // Reduce a 64-bit element extract to two 32-bit extracts:
947 // vec32 = bitcast <N x s64> to <2N x s32>
948 // lo = vec32[idx * 2]
949 // hi = vec32[idx * 2 + 1]
950 // result = merge(lo, hi)
951 //
952 // When the index is uniform, all lanes extract the same element, so we can
953 // just split the s64 extract into two s32 extracts which lower to MOVREL.
954 Register Dst = MI.getOperand(0).getReg();
955 Register Src = MI.getOperand(1).getReg();
956 Register Idx = MI.getOperand(2).getReg();
957
958 LLT SrcTy = MRI.getType(Src);
959 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
960
961 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
962 "expected VGPR src and SGPR idx");
963
964 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
965
966 // Calculate new Lo and Hi indices
967 auto One = B.buildConstant(SgprRB_S32, 1);
968 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
969 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
970
971 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
972 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
973
974 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
975
976 MI.eraseFromParent();
977 return true;
978}
979
980bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &MI) {
981 // Lower insert vector element to a compare-select chain:
982 // for i in 0..N-1:
983 // result[i] = (idx == i) ? elt : srcVec[i]
984 // dst = merge(result[0..N-1])
985 //
986 // VGPR B64 requires splitting to lo/hi s32 pairs since there is no
987 // v_cndmask_b64. SGPR B64/B32 and VGPR B32 can be handled natively.
988 Register Dst = MI.getOperand(0).getReg();
989 Register Src = MI.getOperand(1).getReg();
990 Register Elt = MI.getOperand(2).getReg();
991 Register Idx = MI.getOperand(3).getReg();
992
993 LLT VecTy = MRI.getType(Src);
994 LLT ScalarTy = VecTy.getScalarType();
995 unsigned NumElts = VecTy.getNumElements();
996 const RegisterBank *SrcRB = MRI.getRegBank(Src);
997 bool IsSGPR = (SrcRB == SgprRB);
998 SmallVector<Register, 16> Selects;
999
1000 if (!IsSGPR && ScalarTy.getSizeInBits() == 64) {
1001 // VGPR B64: split to 32-bit lo/hi since there is no v_cndmask_b64.
1002 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1003 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1004 Register EltLo = EltUnmerge.getReg(0);
1005 Register EltHi = EltUnmerge.getReg(1);
1006 for (unsigned I = 0; I < NumElts; ++I) {
1007 auto IdxConst = B.buildConstant(VgprRB_S32, I);
1008 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1009 Selects.push_back(
1010 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 * I))
1011 .getReg(0));
1012 Selects.push_back(
1013 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 * I + 1))
1014 .getReg(0));
1015 }
1016 LLT Vec32Ty = LLT::fixed_vector(2 * NumElts, 32);
1017 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1018 B.buildBitcast(Dst, Vec32);
1019 } else if (ScalarTy.getSizeInBits() == 32 || ScalarTy.getSizeInBits() == 64) {
1020 // B32 (any bank) and SGPR B64: element-wise select at native width.
1021 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1022 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1023 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1024 for (unsigned I = 0; I < NumElts; ++I) {
1025 auto IdxConst = B.buildConstant(SgprRB_S32, I);
1026 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CmpTy, Idx, IdxConst);
1027 Selects.push_back(
1028 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(I)).getReg(0));
1029 }
1030 B.buildMergeLikeInstr(Dst, Selects);
1031 } else {
1033 MF, MORE, "amdgpu-regbanklegalize",
1034 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type", MI);
1035 return false;
1036 }
1037
1038 MI.eraseFromParent();
1039 return true;
1040}
1041
1042bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &MI) {
1043 // Reduce a 64-bit element insert to two 32-bit inserts:
1044 // vec32 = bitcast <N x s64> to <2N x s32>
1045 // lo, hi = unmerge elt
1046 // vec32[idx * 2] = lo
1047 // vec32[idx * 2 + 1] = hi
1048 // dst = bitcast <2N x s32> to <N x s64>
1049 //
1050 // When the index is uniform, all lanes insert at the same position, so we
1051 // can split the s64 insert into two s32 inserts which lower to MOVREL/GPRIDX.
1052 Register Dst = MI.getOperand(0).getReg();
1053 Register Src = MI.getOperand(1).getReg();
1054 Register Elt = MI.getOperand(2).getReg();
1055 Register Idx = MI.getOperand(3).getReg();
1056
1057 LLT SrcTy = MRI.getType(Src);
1058 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
1059
1060 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1061 "expected VGPR src and SGPR idx");
1062
1063 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1064
1065 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1066 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1067
1068 // Calculate new Lo and Hi indices
1069 auto One = B.buildConstant(SgprRB_S32, 1);
1070 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1071 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1072
1073 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1074 EltUnmerge.getReg(0), IdxLo);
1075 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1076 EltUnmerge.getReg(1), IdxHi);
1077
1078 B.buildBitcast(Dst, InsHi);
1079
1080 MI.eraseFromParent();
1081 return true;
1082}
1083
1084bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &MI) {
1085 // Lower divergent G_ABS to smax(x, 0 - x) in the VGPR bank:
1086 // zero = 0
1087 // neg = G_SUB zero, x
1088 // dst = G_SMAX x, neg
1089 //
1090 // There is no integer v_abs instruction on AMDGPU, so divergent G_ABS is
1091 // expanded to this sub/smax pair.
1092 Register DstReg = MI.getOperand(0).getReg();
1093 Register SrcReg = MI.getOperand(1).getReg();
1094 LLT Ty = MRI.getType(DstReg);
1095
1096 Register Zero;
1097 if (Ty == V2S16) {
1098 // buildConstant cannot produce a V2S16 directly; pack two S16 zeros.
1099 Register Zero16 = B.buildConstant({VgprRB, S16}, 0).getReg(0);
1100 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).getReg(0);
1101 } else {
1102 assert((Ty == S32 || Ty == S16) && "unexpected type for AbsToNegMax");
1103 Zero = B.buildConstant({VgprRB, Ty}, 0).getReg(0);
1104 }
1105
1106 auto Neg = B.buildSub({VgprRB, Ty}, Zero, SrcReg);
1107 B.buildSMax(DstReg, SrcReg, Neg);
1108 MI.eraseFromParent();
1109 return true;
1110}
1111
1112bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &MI) {
1113 // Lower uniform V2S16 abs by unpacking the values to two separate SGPR
1114 // registers and re-emitting G_ABS on each:
1115 // packed = bitcast <2 x s16> src to s32
1116 // lo = sext_inreg packed, 16
1117 // hi = ashr packed, 16
1118 // dst = build_vector_trunc G_ABS(lo), G_ABS(hi)
1119 //
1120 // SALU only has s_abs_i32, with no direct uniform V2S16 abs. The
1121 // re-emitted G_ABS(SgprRB, S32) selects to s_abs_i32 on each value.
1122 auto Bitcast = B.buildBitcast({SgprRB_S32}, MI.getOperand(1).getReg());
1123 auto SextInReg = B.buildSExtInReg({SgprRB_S32}, Bitcast, 16);
1124 auto ShiftHi =
1125 B.buildAShr({SgprRB_S32}, Bitcast, B.buildConstant({SgprRB_S32}, 16));
1126
1127 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1128 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1129 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
1130 {AbsLo.getReg(0), AbsHi.getReg(0)});
1131
1132 MI.eraseFromParent();
1133 return true;
1134}
1135
// Applies the lowering selected by Mapping.LoweringMethod to MI.
//
// Each case either delegates to a dedicated lower*/split*/widen* helper or
// expands MI inline through the builder B. Returns false on the paths that
// call reportGISelFailure; cases that end in `break` (and DoNotLower) fall
// through to the final `return true`.
//
// WFI is not referenced by any case visible in this listing.
//
// NOTE(review): this listing has gaps (embedded line numbers skip 1264, 1379,
// 1407, 1419, 1426 and 1438), so a few case labels and call fragments are
// missing below. They are flagged where they occur.
1136bool RegBankLegalizeHelper::lower(MachineInstr &MI,
1137 const RegBankLLTMapping &Mapping,
1138 WaterfallInfo &WFI) {
1139
1140 switch (Mapping.LoweringMethod) {
1141 case DoNotLower:
1142 break;
1143 case VccExtToSel:
1144 return lowerVccExtToSel(MI);
1145 case UniExtToSel: {
 // Replace the extension by a select between an SGPR "true" constant
 // (-1 for G_SEXT, 1 otherwise) and 0, using operand 1 as the condition.
1146 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1147 auto True = B.buildConstant({SgprRB, Ty},
1148 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1149 auto False = B.buildConstant({SgprRB, Ty}, 0);
1150 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
1151 // We are making select here. S1 cond was already 'any-extended to S32' +
1152 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
1153 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
1154 False);
1155 MI.eraseFromParent();
1156 return true;
1157 }
1158 case UnpackBitShift:
1159 return lowerUnpackBitShift(MI);
1160 case UnpackMinMax:
1161 return lowerUnpackMinMax(MI);
1162 case ScalarizeToS16:
1163 return lowerSplitTo16(MI);
1164 case Ext32To64: {
 // Build the 64-bit result as merge(src, Hi), where Hi depends on the kind
 // of extension: zero, replicated sign bit, or undef.
1165 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1166 MachineInstrBuilder Hi;
1167 switch (MI.getOpcode()) {
1168 case AMDGPU::G_ZEXT: {
1169 Hi = B.buildConstant({RB, S32}, 0);
1170 break;
1171 }
1172 case AMDGPU::G_SEXT: {
1173 // Replicate sign bit from 32-bit extended part.
1174 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1175 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
1176 break;
1177 }
1178 case AMDGPU::G_ANYEXT: {
1179 Hi = B.buildUndef({RB, S32});
1180 break;
1181 }
1182 default:
1183 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1184 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1185 MI);
1186 return false;
1187 }
1188
1189 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
1190 {MI.getOperand(1).getReg(), Hi});
1191 MI.eraseFromParent();
1192 return true;
1193 }
1194 case UniCstExt: {
 // Re-emit the constant for the destination register, using the
 // zero-extended value of the original immediate.
1195 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
1196 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
1197
1198 MI.eraseFromParent();
1199 return true;
1200 }
1201 case VgprToVccCopy: {
1202 Register Src = MI.getOperand(1).getReg();
1203 LLT Ty = MRI.getType(Src);
1204 // Take lowest bit from each lane and put it in lane mask.
1205 // Lowering via compare, but we need to clean high bits first as compare
1206 // compares all bits in register.
1207 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1208 if (Ty == S64) {
 // 64-bit source: mask the low half with 1 and the high half with 0.
1209 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1210 auto One = B.buildConstant(VgprRB_S32, 1);
1211 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1212 auto Zero = B.buildConstant(VgprRB_S32, 0);
1213 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1214 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1215 } else {
1216 assert(Ty == S32 || Ty == S16);
1217 auto One = B.buildConstant({VgprRB, Ty}, 1);
1218 B.buildAnd(BoolSrc, Src, One);
1219 }
1220 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1221 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
1222 MI.eraseFromParent();
1223 return true;
1224 }
1225 case V_BFE:
1226 return lowerV_BFE(MI);
1227 case S_BFE:
1228 return lowerS_BFE(MI);
1229 case UniMAD64:
1230 return lowerUniMAD64(MI);
1231 case UniMul64: {
1232 B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
1233 MI.eraseFromParent();
1234 return true;
1235 }
1236 case DivSMulToMAD: {
 // Truncate both multiplicands to 32 bits and emit the unsigned or signed
 // MAD opcode with a zero 64-bit addend. The second def is a fresh SGPR S32.
1237 auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
1238 auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
1239 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1240
1241 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1242 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1243 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1244
1245 B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
1246 {Op1, Op2, Zero});
1247 MI.eraseFromParent();
1248 return true;
1249 }
1250 case SplitTo32:
1251 return lowerSplitTo32(MI);
1252 case SplitTo32Mul:
1253 return lowerSplitTo32Mul(MI);
1254 case SplitTo32Select:
1255 return lowerSplitTo32Select(MI);
1256 case SplitTo32SExtInReg:
1257 return lowerSplitTo32SExtInReg(MI);
1258 case CtPop64To32: {
 // Popcount each 32-bit half and add the two counts.
 // NOTE(review): the final argument line of the buildAdd call below
 // (embedded line 1264) is missing from this listing.
1259 auto Unmerge = B.buildUnmerge({VgprRB, S32}, MI.getOperand(1).getReg());
1260 auto LoPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(0));
1261 auto HiPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(1));
1262 // Max popcount of two 32-bit values is 64, so this add cannot overflow.
1263 B.buildAdd(MI.getOperand(0).getReg(), LoPopCnt, HiPopCnt,
1265
1266 MI.eraseFromParent();
1267 break;
1268 }
1269 case SplitLoad: {
1270 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1271 unsigned Size = DstTy.getSizeInBits();
1272 // Even split to 128-bit loads
1273 if (Size > 128) {
1274 LLT B128;
1275 if (DstTy.isVector()) {
1276 LLT EltTy = DstTy.getElementType();
1277 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1278 } else {
1279 B128 = LLT::scalar(128);
1280 }
 // Only 2-way (256-bit) and 4-way (512-bit) splits are handled here.
1281 if (Size / 128 == 2)
1282 splitLoad(MI, {B128, B128});
1283 else if (Size / 128 == 4)
1284 splitLoad(MI, {B128, B128, B128, B128});
1285 else {
1286 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1287 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1288 MI);
1289 return false;
1290 }
1291 }
1292 // 64 and 32 bit load
1293 else if (DstTy == S96)
1294 splitLoad(MI, {S64, S32}, S32);
1295 else if (DstTy == V3S32)
1296 splitLoad(MI, {V2S32, S32}, S32);
1297 else if (DstTy == V6S16)
1298 splitLoad(MI, {V4S16, V2S16}, V2S16);
1299 else {
1300 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1301 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1302 MI);
1303 return false;
1304 }
1305 return true;
1306 }
1307 case WidenLoad: {
 // Widen the 96-bit destination types to their 128-bit counterparts.
1308 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1309 if (DstTy == S96)
1310 widenLoad(MI, S128);
1311 else if (DstTy == V3S32)
1312 widenLoad(MI, V4S32, S32);
1313 else if (DstTy == V6S16)
1314 widenLoad(MI, V8S16, V2S16);
1315 else {
1316 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1317 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1318 MI);
1319 return false;
1320 }
1321 return true;
1322 }
1323 case UnpackAExt:
1324 return lowerUnpackAExt(MI);
1325 case WidenMMOToS32:
1326 return widenMMOToS32(cast<GAnyLoad>(MI));
1327 case VerifyAllSgpr: {
 // Assertion-only check; MI itself is left untouched.
1328 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1329 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1330 }));
1331 return true;
1332 }
1333 case ApplyAllVgpr: {
 // Defs must already be VGPR; every register use that is not in the VGPR
 // bank is rewritten to a VGPR copy inserted before MI.
1334 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1335 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1336 }));
1337 B.setInstrAndDebugLoc(MI);
1338 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1339 MachineOperand &Op = MI.getOperand(i);
1340 if (!Op.isReg())
1341 continue;
1342 Register Reg = Op.getReg();
1343 if (MRI.getRegBank(Reg) != VgprRB) {
1344 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1345 Op.setReg(Copy.getReg(0));
1346 }
1347 }
1348 return true;
1349 }
1350 case UnmergeToShiftTrunc: {
 // NOTE(review): the dyn_cast result is dereferenced without a null check;
 // if MI is always a G_UNMERGE_VALUES here, cast<> would state that intent.
1351 GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
1352 LLT Ty = MRI.getType(Unmerge->getSourceReg());
1353 if (Ty.getSizeInBits() % 32 != 0) {
1354 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1355 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1356 MI);
1357 return false;
1358 }
1359
1360 B.setInstrAndDebugLoc(MI);
1361 if (Ty.getSizeInBits() > 32) {
 // Wider than 32 bits: first unmerge into V2S16 pieces, then unpack each
 // piece into two S32 values and truncate them into MI's defs 2*i, 2*i+1.
1362 auto UnmergeV2S16 =
1363 B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
1364 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1365 auto [Dst0S32, Dst1S32] =
1366 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1367 B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
1368 B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1369 }
1370 } else {
 // Exactly 32 bits: operand 2 is the source, operands 0 and 1 the defs.
1371 auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
1372 B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
1373 B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
1374 }
1375
1376 MI.eraseFromParent();
1377 return true;
1378 }
 // NOTE(review): the case label opening this block (embedded line 1379) is
 // missing from this listing. The code widens MI's def to SGPR S32 with a
 // trunc placed at the block's first non-PHI position, and any-extends every
 // odd-indexed operand right after its defining instruction (skipping PHIs
 // and labels).
1380 Register Dst = MI.getOperand(0).getReg();
1381 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1382 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1383 MI.getOperand(0).setReg(NewDst);
1384 B.buildTrunc(Dst, NewDst);
1385
1386 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1387 Register UseReg = MI.getOperand(i).getReg();
1388
1389 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1390 MachineBasicBlock *DefMBB = DefMI->getParent();
1391
1392 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1393
1394 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1395 MI.getOperand(i).setReg(NewUse.getReg(0));
1396 }
1397 break;
1398 }
1399 case VerifyAllSgprGPHI: {
 // Assertion-only check; basic-block operands are ignored.
1400 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1401 if (Op.isMBB())
1402 return true;
1403 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1404 }));
1405 return true;
1406 }
 // NOTE(review): the case label opening this block (embedded line 1407) is
 // missing from this listing. The block only asserts: def in VGPR, every
 // register operand in VGPR or SGPR.
1408 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1409 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1410 if (Op.isMBB())
1411 return true;
1412 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1413 return RB == VgprRB || RB == SgprRB;
1414 }));
1415 return true;
1416 }
1417 case ApplyINTRIN_IMAGE: {
 // NOTE(review): the initializer of RSrcIntrin (embedded line 1419) is
 // missing from this listing.
1418 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1420 assert(RSrcIntrin && RSrcIntrin->IsImage);
1421 // The reported argument index is relative to the IR intrinsic call
1422 // arguments, so shift by the number of defs and the intrinsic ID.
1423 unsigned RsrcIdx = RSrcIntrin->RsrcArg + MI.getNumExplicitDefs() + 1;
1424 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1425 }
 // NOTE(review): the case label opening this block (embedded line 1426) is
 // missing from this listing.
1427 // Rsrc is the last register operand. Base BVH trails an A16 immediate
1428 // after rsrc; dual/BVH8 do not. Scan backwards for the last virtual
1429 // register.
1430 unsigned RsrcIdx = MI.getNumOperands();
1431 while (RsrcIdx-- > MI.getNumExplicitDefs()) {
1432 const MachineOperand &Op = MI.getOperand(RsrcIdx);
1433 if (Op.isReg() && Op.getReg().isVirtual())
1434 break;
1435 }
1436 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1437 }
 // NOTE(review): the case label for the next return (embedded line 1438) is
 // missing from this listing.
1439 return lowerSplitBitCount64To32(MI);
1440 case ExtrVecEltToSel:
1441 return lowerExtrVecEltToSel(MI);
1442 case ExtrVecEltTo32:
1443 return lowerExtrVecEltTo32(MI);
1444 case InsVecEltToSel:
1445 return lowerInsVecEltToSel(MI);
1446 case InsVecEltTo32:
1447 return lowerInsVecEltTo32(MI);
1448 case AbsToNegMax:
1449 return lowerAbsToNegMax(MI);
1450 case AbsToS32:
1451 return lowerAbsToS32(MI);
1452 }
1453
 // Reached by DoNotLower and by the cases that end in `break`.
1454 return true;
1455}
1456
// Maps a mapping-apply ID that names one exact type to that LLT.
//
// The register bank part of the ID is ignored here: Sgpr*, Vgpr* and UniInVgpr*
// variants of the same shape return the same LLT. IDs not listed fall into the
// default case and yield a default-constructed LLT().
//
// NOTE(review): the embedded line numbers skip 1470 and 1531, so two case
// labels appear to be missing from this listing.
1457LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1458 switch (ID) {
1459 case Vcc:
1460 case UniInVcc:
1461 return LLT::scalar(1);
1462 case Sgpr16:
1463 case Vgpr16:
1464 case UniInVgprS16:
1465 return LLT::scalar(16);
1466 case Sgpr32:
1467 case Sgpr32_WF:
1468 case Sgpr32Trunc:
1469 case Sgpr32AExt:
1471 case Sgpr32SExt:
1472 case Sgpr32ZExt:
1473 case UniInVgprS32:
1474 case Sgpr32ToVgprDst:
1475 case Vgpr32:
1476 case Vgpr32AExt:
1477 case Vgpr32SExt:
1478 case Vgpr32ZExt:
1479 return LLT::scalar(32);
1480 case Sgpr64:
1481 case Vgpr64:
1482 case UniInVgprS64:
1483 case Sgpr64ToVgprDst:
1484 return LLT::scalar(64);
1485 case Sgpr128:
1486 case Vgpr128:
1487 return LLT::scalar(128);
 // Pointer IDs: the digit is the address space; the pointer width is fixed
 // per address space below.
1488 case SgprP0:
1489 case SgprP0Call_WF:
1490 case VgprP0:
1491 return LLT::pointer(0, 64);
1492 case SgprP1:
1493 case VgprP1:
1494 return LLT::pointer(1, 64);
1495 case SgprP2:
1496 case VgprP2:
1497 return LLT::pointer(2, 32);
1498 case SgprP3:
1499 case VgprP3:
1500 return LLT::pointer(3, 32);
1501 case SgprP4:
1502 case SgprP4Call_WF:
1503 case VgprP4:
1504 return LLT::pointer(4, 64);
1505 case SgprP5:
1506 case VgprP5:
1507 return LLT::pointer(5, 32);
1508 case SgprP8:
1509 return LLT::pointer(8, 128);
 // Fixed vectors: V<N>S<M> is <N x sM>.
1510 case SgprV2S16:
1511 case VgprV2S16:
1512 case UniInVgprV2S16:
1513 return LLT::fixed_vector(2, 16);
1514 case SgprV2S32:
1515 case VgprV2S32:
1516 case UniInVgprV2S32:
1517 return LLT::fixed_vector(2, 32);
1518 case VgprV3S32:
1519 case UniInVgprV3S32:
1520 return LLT::fixed_vector(3, 32);
1521 case VgprV4S16:
1522 return LLT::fixed_vector(4, 16);
1523 case VgprV8S16:
1524 case UniInVgprV8S16:
1525 return LLT::fixed_vector(8, 16);
1526 case VgprV16S16:
1527 case UniInVgprV16S16:
1528 return LLT::fixed_vector(16, 16);
1529 case SgprV4S32:
1530 case SgprV4S32_WF:
1532 case VgprV4S32:
1533 case UniInVgprV4S32:
1534 return LLT::fixed_vector(4, 32);
1535 case VgprV8S32:
1536 case UniInVgprV8S32:
1537 return LLT::fixed_vector(8, 32);
1538 case VgprV2S64:
1539 case UniInVgprV2S64:
1540 return LLT::fixed_vector(2, 64);
1541 case VgprV6S32:
1542 case UniInVgprV6S32:
1543 return LLT::fixed_vector(6, 32);
1544 case VgprV16S32:
1545 case UniInVgprV16S32:
1546 return LLT::fixed_vector(16, 32);
1547 case VgprV32S16:
1548 case UniInVgprV32S16:
1549 return LLT::fixed_vector(32, 16);
1550 case VgprV32S32:
1551 case UniInVgprV32S32:
1552 return LLT::fixed_vector(32, 32);
1553 default:
 // No single type is associated with this ID.
1554 return LLT();
1555 }
1556}
1557
// Checks Ty against the set of types accepted by a size-based ("B-type") or
// pointer-size ID.
//
// Returns Ty unchanged when it is one of the accepted types for ID, and a
// default-constructed LLT() otherwise (including for IDs not handled here).
// Callers in this file compare the result against Ty to test for a match.
//
// NOTE(review): the embedded line numbers skip 1563 and 1580, so two case
// labels appear to be missing from this listing.
1558LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1559 switch (ID) {
1560 case SgprB32:
1561 case VgprB32:
1562 case SgprB32_M0:
1564 case UniInVgprB32:
1565 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
1566 isAnyPtr(Ty, 32))
1567 return Ty;
1568 return LLT();
1569 case SgprPtr32:
1570 case VgprPtr32:
1571 return isAnyPtr(Ty, 32) ? Ty : LLT();
1572 case SgprPtr64:
1573 case VgprPtr64:
1574 return isAnyPtr(Ty, 64) ? Ty : LLT();
1575 case SgprPtr128:
1576 case VgprPtr128:
1577 return isAnyPtr(Ty, 128) ? Ty : LLT();
1578 case SgprB64:
1579 case VgprB64:
1581 case UniInVgprB64:
1582 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
1583 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
1584 return Ty;
1585 return LLT();
1586 case SgprB96:
1587 case VgprB96:
1588 case UniInVgprB96:
1589 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
1590 Ty == LLT::fixed_vector(6, 16))
1591 return Ty;
1592 return LLT();
1593 case SgprB128:
1594 case VgprB128:
1595 case UniInVgprB128:
1596 if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
1597 Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
1598 isAnyPtr(Ty, 128))
1599 return Ty;
1600 return LLT();
1601 case VgprB160:
1602 case UniInVgprB160:
 // Any type of exactly 160 bits is accepted, regardless of shape.
1603 if (Ty.getSizeInBits() == 160)
1604 return Ty;
1605 return LLT();
1606 case SgprB256:
1607 case VgprB256:
1608 case UniInVgprB256:
1609 if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
1610 Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
1611 return Ty;
1612 return LLT();
1613 case SgprB512:
1614 case VgprB512:
1615 case UniInVgprB512:
1616 if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
1617 Ty == LLT::fixed_vector(8, 64))
1618 return Ty;
1619 return LLT();
1620 case SgprBRC: {
 // Accept any type of at least 32 bits for which the target reports an
 // SGPR register class of that bit width.
1621 const SIRegisterInfo *TRI =
1622 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1623 unsigned LLTSize = Ty.getSizeInBits();
1624 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
1625 return Ty;
1626 return LLT();
1627 }
1628 case VgprBRC: {
 // NOTE(review): this VGPR case also queries getSGPRClassForBitWidth and
 // has no ">= 32" guard, unlike SgprBRC above. Confirm that is intended.
1629 const SIRegisterInfo *TRI =
1630 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1631 if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
1632 return Ty;
1633 return LLT();
1634 }
1635 default:
1636 return LLT();
1637 }
1638}
1639
// Maps a mapping-apply ID to the register bank expected by the checks in
// this file: VccRB, SgprRB, AgprRB or VgprRB. IDs not listed return nullptr.
//
// Note the two groups that do not follow their name prefix: the UniIn* IDs
// return SgprRB, and Sgpr32ToVgprDst/Sgpr64ToVgprDst return VgprRB.
//
// NOTE(review): the embedded line numbers skip 1666 and 1699, so two case
// labels appear to be missing from this listing.
1640const RegisterBank *
1641RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1642 switch (ID) {
1643 case Vcc:
1644 return VccRB;
1645 case Sgpr16:
1646 case Sgpr32:
1647 case Sgpr32_WF:
1648 case Sgpr64:
1649 case Sgpr128:
1650 case SgprP0:
1651 case SgprP0Call_WF:
1652 case SgprP1:
1653 case SgprP2:
1654 case SgprP3:
1655 case SgprP4:
1656 case SgprP4Call_WF:
1657 case SgprP5:
1658 case SgprP8:
1659 case SgprPtr32:
1660 case SgprPtr64:
1661 case SgprPtr128:
1662 case SgprV2S16:
1663 case SgprV2S32:
1664 case SgprV4S32:
1665 case SgprV4S32_WF:
1667 case SgprB32:
1668 case SgprB64:
1669 case SgprB96:
1670 case SgprB128:
1671 case SgprB256:
1672 case SgprB512:
1673 case SgprBRC:
1674 case UniInVcc:
1675 case UniInVgprS16:
1676 case UniInVgprS32:
1677 case UniInVgprS64:
1678 case UniInVgprV2S16:
1679 case UniInVgprV2S32:
1680 case UniInVgprV3S32:
1681 case UniInVgprV4S32:
1682 case UniInVgprV2S64:
1683 case UniInVgprV6S32:
1684 case UniInVgprV8S16:
1685 case UniInVgprV8S32:
1686 case UniInVgprV16S16:
1687 case UniInVgprV16S32:
1688 case UniInVgprV32S16:
1689 case UniInVgprV32S32:
1690 case UniInVgprB32:
1691 case UniInVgprB64:
1692 case UniInVgprB96:
1693 case UniInVgprB128:
1694 case UniInVgprB160:
1695 case UniInVgprB256:
1696 case UniInVgprB512:
1697 case Sgpr32Trunc:
1698 case Sgpr32AExt:
1700 case Sgpr32SExt:
1701 case Sgpr32ZExt:
1702 return SgprRB;
1703 case AgprAnyTy:
1704 return AgprRB;
1705 case Vgpr16:
1706 case Vgpr32:
1707 case Vgpr64:
1708 case Vgpr128:
1709 case VgprP0:
1710 case VgprP1:
1711 case VgprP2:
1712 case VgprP3:
1713 case VgprP4:
1714 case VgprP5:
1715 case VgprPtr32:
1716 case VgprPtr64:
1717 case VgprPtr128:
1718 case VgprV2S16:
1719 case VgprV2S32:
1720 case VgprV2S64:
1721 case VgprV3S32:
1722 case VgprV4S16:
1723 case VgprV8S16:
1724 case VgprV16S16:
1725 case VgprV4S32:
1726 case VgprV6S32:
1727 case VgprV8S32:
1728 case VgprV16S32:
1729 case VgprV32S16:
1730 case VgprV32S32:
1731 case VgprB32:
1732 case VgprB64:
1733 case VgprB96:
1734 case VgprB128:
1735 case VgprB160:
1736 case VgprB256:
1737 case VgprB512:
1738 case VgprBRC:
1739 case VgprAnyTy:
1740 case Vgpr32AExt:
1741 case Vgpr32SExt:
1742 case Vgpr32ZExt:
1743 case Sgpr32ToVgprDst:
1744 case Sgpr64ToVgprDst:
1745 return VgprRB;
1746 default:
1747 return nullptr;
1748 }
1749}
1750
// Applies the per-operand mapping IDs in MethodIDs to MI's def operands.
//
// OpIdx is taken by reference and advanced in place; the loop runs from the
// incoming OpIdx up to MethodIDs.size(), so MethodIDs is indexed by operand
// number. Entries equal to None are skipped.
//
// Plain Sgpr*/Vgpr*/Vcc IDs only assert the operand's type and bank. The
// remaining IDs replace the def with a new virtual register and emit
// instructions through B that define the original register from the new one
// (copy, trunc or read-any-lane).
//
// Returns false for InvalidMapping and for IDs not handled by the switch,
// true otherwise.
//
// NOTE(review): the embedded line numbers skip 1924 and 1930; the call lines
// that precede the two failure-report argument lists near the end are
// missing from this listing.
1751bool RegBankLegalizeHelper::applyMappingDst(
1752 MachineInstr &MI, unsigned &OpIdx,
1753 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1754 // Defs start from operand 0
1755 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1756 if (MethodIDs[OpIdx] == None)
1757 continue;
1758 MachineOperand &Op = MI.getOperand(OpIdx);
1759 Register Reg = Op.getReg();
1760 LLT Ty = MRI.getType(Reg);
1761 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1762
1763 switch (MethodIDs[OpIdx]) {
1764 // vcc, sgpr and vgpr scalars, pointers and vectors
1765 case Vcc:
1766 case Sgpr16:
1767 case Sgpr32:
1768 case Sgpr64:
1769 case Sgpr128:
1770 case SgprP0:
1771 case SgprP1:
1772 case SgprP3:
1773 case SgprP4:
1774 case SgprP5:
1775 case SgprP8:
1776 case SgprV2S16:
1777 case SgprV2S32:
1778 case SgprV4S32:
1779 case Vgpr16:
1780 case Vgpr32:
1781 case Vgpr64:
1782 case Vgpr128:
1783 case VgprP0:
1784 case VgprP1:
1785 case VgprP2:
1786 case VgprP3:
1787 case VgprP4:
1788 case VgprP5:
1789 case VgprV2S16:
1790 case VgprV2S32:
1791 case VgprV2S64:
1792 case VgprV3S32:
1793 case VgprV4S16:
1794 case VgprV8S16:
1795 case VgprV16S16:
1796 case VgprV4S32:
1797 case VgprV6S32:
1798 case VgprV8S32:
1799 case VgprV16S32:
1800 case VgprV32S16:
1801 case VgprV32S32: {
 // Verification only: the def already has the expected type and bank.
1802 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1803 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1804 break;
1805 }
1806 // sgpr and vgpr B-types
1807 case SgprB32:
1808 case SgprB64:
1809 case SgprB96:
1810 case SgprB128:
1811 case SgprB256:
1812 case SgprB512:
1813 case SgprBRC:
1814 case SgprPtr32:
1815 case SgprPtr64:
1816 case SgprPtr128:
1817 case VgprB32:
1818 case VgprB64:
1819 case VgprB96:
1820 case VgprB128:
1821 case VgprB160:
1822 case VgprB256:
1823 case VgprB512:
1824 case VgprBRC:
1825 case VgprPtr32:
1826 case VgprPtr64:
1827 case VgprPtr128: {
 // Verification only: getBTyFromID returns Ty itself on a match.
1828 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1829 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1830 break;
1831 }
1832 case VgprAnyTy: {
1833 assert(RB == VgprRB);
1834 break;
1835 }
1836 case AgprAnyTy: {
 // If the def is not already AGPR, define a new AGPR register instead and
 // copy it back to the original register when that register is used.
 // NOTE(review): this case tests use_nodbg_empty while UniInVcc and
 // Sgpr32Trunc below test use_empty. Confirm the difference is intended.
1837 if (RB == AgprRB)
1838 break;
1839 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
1840 Op.setReg(NewAgprDst);
1841 if (!MRI.use_nodbg_empty(Reg))
1842 B.buildCopy(Reg, NewAgprDst);
1843 break;
1844 }
1845 // uniform in vcc/vgpr: scalars, vectors and B-types
1846 case UniInVcc: {
 // MI now defines a VCC S1; the original SGPR S1 is recreated from it via
 // G_AMDGPU_COPY_SCC_VCC (to SGPR S32) followed by a trunc, if it is used.
1847 assert(Ty == S1);
1848 assert(RB == SgprRB);
1849 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1850 Op.setReg(NewDst);
1851 if (!MRI.use_empty(Reg)) {
1852 auto CopyS32_Vcc =
1853 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1854 B.buildTrunc(Reg, CopyS32_Vcc);
1855 }
1856 break;
1857 }
1858 case UniInVgprS16: {
 // S16 goes through S32: any-extend the new VGPR S16 def to S32, read it
 // into an SGPR S32 with buildReadAnyLane, then truncate into Reg.
1859 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1860 assert(RB == SgprRB);
1861 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1862 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1863 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1864 Op.setReg(NewVgprDstS16);
1865 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1866 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1867 B.buildTrunc(Reg, NewSgprDstS32);
1868 break;
1869 }
1870 case UniInVgprS32:
1871 case UniInVgprS64:
1872 case UniInVgprV2S16:
1873 case UniInVgprV2S32:
1874 case UniInVgprV3S32:
1875 case UniInVgprV4S32:
1876 case UniInVgprV2S64:
1877 case UniInVgprV6S32:
1878 case UniInVgprV8S16:
1879 case UniInVgprV8S32:
1880 case UniInVgprV16S16:
1881 case UniInVgprV16S32:
1882 case UniInVgprV32S16:
1883 case UniInVgprV32S32: {
 // MI now defines a VGPR of the same type; the original SGPR register is
 // defined from it with buildReadAnyLane.
1884 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1885 assert(RB == SgprRB);
1886 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1887 Op.setReg(NewVgprDst);
1888 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1889 break;
1890 }
1891 case UniInVgprB32:
1892 case UniInVgprB64:
1893 case UniInVgprB96:
1894 case UniInVgprB128:
1895 case UniInVgprB160:
1896 case UniInVgprB256:
1897 case UniInVgprB512: {
 // Same rewrite as the exact-type UniInVgpr* cases, with the type checked
 // through getBTyFromID instead.
1898 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1899 assert(RB == SgprRB);
1900 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1901 Op.setReg(NewVgprDst);
1902 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1903 break;
1904 }
1905 // sgpr trunc
1906 case Sgpr32Trunc: {
 // MI now defines an SGPR S32; the narrower original register is produced
 // by a trunc, emitted only if the original register has uses.
1907 assert(Ty.getSizeInBits() < 32);
1908 assert(RB == SgprRB);
1909 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1910 Op.setReg(NewDst);
1911 if (!MRI.use_empty(Reg))
1912 B.buildTrunc(Reg, NewDst);
1913 break;
1914 }
1915 case Sgpr32ToVgprDst:
1916 case Sgpr64ToVgprDst: {
 // The original def is VGPR: MI is switched to a new SGPR def of the same
 // type and copied into the original VGPR register.
1917 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1918 assert(RB == VgprRB);
1919 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1920 B.buildCopy(Reg, Op.getReg());
1921 break;
1922 }
1923 case InvalidMapping: {
1925 MF, MORE, "amdgpu-regbanklegalize",
1926 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
1927 return false;
1928 }
1929 default:
1931 MF, MORE, "amdgpu-regbanklegalize",
1932 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
1933 return false;
1934 }
1935 }
1936
1937 return true;
1938}
1939
// Rewrites the register source operands of MI so that each one satisfies the
// apply ID given for it in MethodIDs.
//
// MethodIDs[i] describes operand OpIdx; both indices advance together on every
// iteration (including skipped ones), so on normal completion OpIdx has been
// advanced by MethodIDs.size(). Depending on the ID an operand is either only
// checked with asserts, replaced by the result of a newly built
// copy / extension / read-first-lane, or recorded in WFI so the caller can
// handle it with a waterfall loop.
//
// Returns true when every ID was handled, false when an ID has no case below.
1940bool RegBankLegalizeHelper::applyMappingSrc(
1941 MachineInstr &MI, unsigned &OpIdx,
1942 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1943 WaterfallInfo &WFI) {
1944 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
 // These IDs are skipped before the operand is read as a register.
1945 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1946 continue;
1947
1948 MachineOperand &Op = MI.getOperand(OpIdx);
1949 Register Reg = Op.getReg();
1950 LLT Ty = MRI.getType(Reg);
1951 const RegisterBank *RB = MRI.getRegBank(Reg);
1952
1953 switch (MethodIDs[i]) {
1954 case Vcc: {
1955 assert(Ty == S1);
1956 assert(RB == VccRB || RB == SgprRB);
 // An S1 in the sgpr bank is any-extended to S32 and converted with
 // G_AMDGPU_COPY_VCC_SCC; the operand then uses the resulting vcc-bank S1.
1957 if (RB == SgprRB) {
1958 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1959 auto CopyVcc_Scc =
1960 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1961 Op.setReg(CopyVcc_Scc.getReg(0));
1962 }
1963 break;
1964 }
1965 // sgpr scalars, pointers and vectors
 // Check-only: type and bank are asserted, the operand is never rewritten.
1966 case Sgpr16:
1967 case Sgpr32:
1968 case Sgpr64:
1969 case Sgpr128:
1970 case SgprP0:
1971 case SgprP1:
1972 case SgprP3:
1973 case SgprP4:
1974 case SgprP5:
1975 case SgprP8:
1976 case SgprV2S16:
1977 case SgprV2S32:
1978 case SgprV4S32: {
1979 assert(Ty == getTyFromID(MethodIDs[i]));
1980 assert(RB == getRegBankFromID(MethodIDs[i]));
1981 break;
1982 }
1983 // sgpr B-types
 // Check-only, like the typed sgpr IDs above, but the expected type comes
 // from getBTyFromID, which also takes the operand's current type.
1984 case SgprB32:
1985 case SgprB64:
1986 case SgprB96:
1987 case SgprB128:
1988 case SgprB256:
1989 case SgprB512:
1990 case SgprBRC:
1991 case SgprPtr32:
1992 case SgprPtr64:
1993 case SgprPtr128: {
1994 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1995 assert(RB == getRegBankFromID(MethodIDs[i]));
1996 break;
1997 }
1998 // vgpr scalars, pointers and vectors
 // Any operand not already in the vgpr bank is replaced by a copy into vgpr.
1999 case Vgpr16:
2000 case Vgpr32:
2001 case Vgpr64:
2002 case Vgpr128:
2003 case VgprP0:
2004 case VgprP1:
2005 case VgprP2:
2006 case VgprP3:
2007 case VgprP4:
2008 case VgprP5:
2009 case VgprV2S16:
2010 case VgprV2S32:
2011 case VgprV2S64:
2012 case VgprV3S32:
2013 case VgprV4S16:
2014 case VgprV8S16:
2015 case VgprV16S16:
2016 case VgprV4S32:
2017 case VgprV6S32:
2018 case VgprV8S32:
2019 case VgprV16S32:
2020 case VgprV32S16:
2021 case VgprV32S32: {
2022 assert(Ty == getTyFromID(MethodIDs[i]));
2023 if (RB != VgprRB) {
2024 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2025 Op.setReg(CopyToVgpr.getReg(0));
2026 }
2027 break;
2028 }
2029 // vgpr B-types
2030 case VgprB32:
2031 case VgprB64:
2032 case VgprB96:
2033 case VgprB128:
2034 case VgprB160:
2035 case VgprB256:
2036 case VgprB512:
2037 case VgprBRC:
2038 case VgprPtr32:
2039 case VgprPtr64:
2040 case VgprPtr128: {
2041 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2042 if (RB != VgprRB) {
2043 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2044 Op.setReg(CopyToVgpr.getReg(0));
2045 }
2046 break;
2047 }
 // No type assert for the AnyTy IDs: only the bank is repaired, with a copy
 // that keeps the operand's current type.
2048 case VgprAnyTy: {
2049 if (RB != VgprRB) {
2050 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2051 Op.setReg(CopyToVgpr.getReg(0));
2052 }
2053 break;
2054 }
2055 case AgprAnyTy: {
2056 if (RB != AgprRB) {
2057 auto CopyToAgpr = B.buildCopy({AgprRB, Ty}, Reg);
2058 Op.setReg(CopyToAgpr.getReg(0));
2059 }
2060 break;
2061 }
2062 // sgpr waterfall, scalars, and vectors
 // The operand is not rewritten here; a non-sgpr register is only recorded
 // in WFI. The range [Start, End) is set to cover just MI, and only if no
 // range has been recorded yet.
2063 case Sgpr32_WF:
2064 case SgprV4S32_WF: {
2065 assert(Ty == getTyFromID(MethodIDs[i]));
2066 if (RB != SgprRB) {
2067 WFI.SgprWaterfallOperandRegs.insert(Reg);
2068 if (!WFI.Start.isValid()) {
2069 WFI.Start = MI.getIterator();
2070 WFI.End = std::next(MI.getIterator());
2071 }
2072 }
2073 break;
2074 }
 // Call variants: the recorded range is widened to the whole call sequence
 // and unconditionally overwrites any range already stored in WFI.
2075 case SgprP0Call_WF:
2076 case SgprP4Call_WF: {
2077 assert(Ty == getTyFromID(MethodIDs[i]));
2078 if (RB != SgprRB) {
2079 WFI.SgprWaterfallOperandRegs.insert(Reg);
2080
 // NOTE(review): both scans below have no bound check; they assume an
 // ADJCALLSTACKUP precedes and an ADJCALLSTACKDOWN follows MI in the same
 // block - confirm this always holds for the instructions using these IDs.
2081 // Find the ADJCALLSTACKUP before the call.
2082 MachineBasicBlock::iterator Start = MI.getIterator();
2083 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2084 --Start;
2085
2086 // Find the ADJCALLSTACKDOWN after the call (include it in range).
2087 MachineBasicBlock::iterator End = MI.getIterator();
2088 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2089 ++End;
2090 ++End;
2091
2092 WFI.Start = Start;
2093 WFI.End = End;
2094 }
2095 break;
2096 }
 // B-typed sgpr operands that may arrive in vgpr: a vgpr value is moved into
 // a new sgpr register with buildReadFirstLane.
2097 case SgprB32_M0:
 // NOTE(review): the embedded listing numbering jumps from 2097 to 2099 here;
 // a case label may be missing from this view - confirm against the file.
2099 case SgprB64_ReadFirstLane: {
2100 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2101 if (RB == SgprRB)
2102 break;
2103 assert(RB == VgprRB);
2104 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2105 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2106 Op.setReg(NewSGPR);
2107 break;
2108 }
 // NOTE(review): listing line 2109 is absent; presumably it holds the case
 // label and opening brace for the block below, which has the same body as
 // the block above but checks the type with getTyFromID - confirm.
2110 assert(Ty == getTyFromID(MethodIDs[i]));
2111 if (RB == SgprRB)
2112 break;
2113 assert(RB == VgprRB);
2114 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2115 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2116 Op.setReg(NewSGPR);
2117 break;
2118 }
2119 // sgpr and vgpr scalars with extend
 // Each case below requires the operand to already be in the expected bank
 // and replaces it with a 32-bit extension of the original value.
2120 case Sgpr32AExt: {
2121 // Note: this ext allows S1, and it is meant to be combined away.
2122 assert(Ty.getSizeInBits() < 32);
2123 assert(RB == SgprRB);
2124 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2125 Op.setReg(Aext.getReg(0));
2126 break;
2127 }
2128 case Sgpr32AExtBoolInReg: {
2129 // Note: this ext allows S1, and it is meant to be combined away.
2130 assert(Ty.getSizeInBits() == 1);
2131 assert(RB == SgprRB);
2132 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2133 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
2134 // most of times meant to be combined away in AMDGPURegBankCombiner.
2135 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2136 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2137 Op.setReg(BoolInReg.getReg(0));
2138 break;
2139 }
 // Unlike the any-extend above, the sgpr sign/zero extends reject S1
 // (size must be strictly greater than 1 bit).
2140 case Sgpr32SExt: {
2141 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2142 assert(RB == SgprRB);
2143 auto Sext = B.buildSExt(SgprRB_S32, Reg);
2144 Op.setReg(Sext.getReg(0));
2145 break;
2146 }
2147 case Sgpr32ZExt: {
2148 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2149 assert(RB == SgprRB);
2150 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
2151 Op.setReg(Zext.getReg(0));
2152 break;
2153 }
2154 case Vgpr32AExt: {
2155 assert(Ty.getSizeInBits() < 32);
2156 assert(RB == VgprRB);
2157 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
2158 Op.setReg(Aext.getReg(0));
2159 break;
2160 }
2161 case Vgpr32SExt: {
2162 // Note this ext allows S1, and it is meant to be combined away.
2163 assert(Ty.getSizeInBits() < 32);
2164 assert(RB == VgprRB);
2165 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
2166 Op.setReg(Sext.getReg(0));
2167 break;
2168 }
2169 case Vgpr32ZExt: {
2170 // Note this ext allows S1, and it is meant to be combined away.
2171 assert(Ty.getSizeInBits() < 32);
2172 assert(RB == VgprRB);
2173 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
2174 Op.setReg(Zext.getReg(0));
2175 break;
2176 }
 // Any ID without a case above is treated as a failure for this MI.
2177 default:
 // NOTE(review): listing line 2178 is absent; the arguments below look like
 // a failure-report call (presumably reportGISelFailure) - confirm.
2179 MF, MORE, "amdgpu-regbanklegalize",
2180 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
2181 return false;
2182 }
2183 }
2184 return true;
2185}
2186
2187[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
2188 const RegisterBank *RB,
2190 unsigned StartOpIdx,
2191 unsigned EndOpIdx) {
2192 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2193 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
2194 return false;
2195 }
2196 return true;
2197}
2198
2199bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2200 MachineInstr &MI, unsigned RsrcIdx) {
2201 const unsigned NumDefs = MI.getNumExplicitDefs();
2202
2203 MachineBasicBlock *MBB = MI.getParent();
2204 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
2205
2206 // Defs are vgpr.
2207 for (unsigned i = 0; i < NumDefs; ++i) {
2208 Register Reg = MI.getOperand(i).getReg();
2209 if (MRI.getRegBank(Reg) == VgprRB)
2210 continue;
2211
2212 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
2213 MI.getOperand(i).setReg(NewVgprDst);
2214 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2215 }
2216
2217 B.setInstrAndDebugLoc(MI);
2218
2219 // Register uses before RsrcIdx are vgpr.
2220 for (unsigned i = NumDefs; i < RsrcIdx; ++i) {
2221 MachineOperand &Op = MI.getOperand(i);
2222 if (!Op.isReg())
2223 continue;
2224
2225 Register Reg = Op.getReg();
2226 if (!Reg.isVirtual())
2227 continue;
2228
2229 if (MRI.getRegBank(Reg) == VgprRB)
2230 continue;
2231
2232 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
2233 Op.setReg(Copy.getReg(0));
2234 }
2235
2236 SmallSet<Register, 4> OpsToWaterfall;
2237
2238 // Register use RsrcIdx (and later register operands) is sgpr.
2239 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
2240 MachineOperand &Op = MI.getOperand(i);
2241 if (!Op.isReg())
2242 continue;
2243
2244 Register Reg = Op.getReg();
2245 if (MRI.getRegBank(Reg) != SgprRB)
2246 OpsToWaterfall.insert(Reg);
2247 }
2248
2249 if (!OpsToWaterfall.empty()) {
2250 MachineBasicBlock::iterator MII = MI.getIterator();
2251 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
2252 }
2253
2254 return true;
2255}
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
@ ICMP_NE
not equal
Definition InstrTypes.h:762
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
iterator end()
Definition DenseMap.h:143
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:558
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:156
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
Definition Utils.cpp:258
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs