LLVM 23.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
29
30using namespace llvm;
31using namespace AMDGPU;
32
35 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
// NOTE(review): the constructor header (original lines 33-34) is not visible
// in this excerpt; the initializer list below caches references used by every
// lowering routine.
36 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
// Cache the AMDGPU register banks used throughout the lowering code.
39 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
40 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
41 AgprRB(&RBI.getRegBank(AMDGPU::AGPRRegBankID)),
42 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
43
// Look up the rule set registered for MI's opcode, find the mapping whose
// predicate matches MI, apply the mapping to MI's defs and uses, then run the
// lowering action. Returns false after reporting a GISel failure when no rule
// set or mapping applies, or when any of the apply/lower steps fails.
// NOTE(review): the function signature (original line 44) is not visible in
// this excerpt.
45 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
46 if (!RuleSet) {
47 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
48 "No AMDGPU RegBankLegalize rules defined for opcode",
49 MI);
50 return false;
51 }
52
53 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
54 if (!Mapping) {
55 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
56 "AMDGPU RegBankLegalize: none of the rules defined with "
57 "'Any' for MI's opcode matched MI",
58 MI);
59 return false;
60 }
61
// WFI collects registers that will need a waterfall loop; it is filled in by
// applyMappingSrc and consumed by lower().
62 WaterfallInfo WFI;
63 unsigned OpIdx = 0;
// Defs are rewritten with the insert point after MI, uses with the insert
// point at MI, so repair code lands on the correct side of the instruction.
64 if (!Mapping->DstOpMapping.empty()) {
65 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
66 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
67 return false;
68 }
69 if (!Mapping->SrcOpMapping.empty()) {
70 B.setInstr(MI);
71 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
72 return false;
73 }
74
75 if (!lower(MI, *Mapping, WFI))
76 return false;
77
78 return true;
79}
80
// Wrap the instruction range described by WFI in a waterfall loop: for each
// divergent SGPR-required operand, readfirstlane a per-iteration uniform value
// and loop until every active lane has been serviced. See the block diagram
// below for the resulting CFG.
81bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
82 const WaterfallInfo &WFI) {
83 assert(WFI.Start.isValid() && WFI.End.isValid() &&
84 "Waterfall range not initialized");
85
86 // Track use registers which have already been expanded with a readfirstlane
87 // sequence. This may have multiple uses if moving a sequence.
88 DenseMap<Register, Register> WaterfalledRegMap;
89
90 MachineBasicBlock &MBB = B.getMBB();
91 MachineFunction &MF = B.getMF();
92
// NOTE(review): BeginIt/EndIt (iterators for WFI.Start/WFI.End) and TRI are
// set up on original lines 93-96, which are not visible in this excerpt.
95
97 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
// Pick wave-size-specific exec-manipulation opcodes and the exec register.
98 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
99 if (IsWave32) {
100 MovExecOpc = AMDGPU::S_MOV_B32;
101 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
102 XorTermOpc = AMDGPU::S_XOR_B32_term;
103 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
104 ExecReg = AMDGPU::EXEC_LO;
105 } else {
106 MovExecOpc = AMDGPU::S_MOV_B64;
107 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
108 XorTermOpc = AMDGPU::S_XOR_B64_term;
109 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
110 ExecReg = AMDGPU::EXEC;
111 }
112
113#ifndef NDEBUG
114 const int OrigRangeSize = std::distance(BeginIt, EndIt);
115#endif
116
117 MachineRegisterInfo &MRI = *B.getMRI();
118 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
119 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
120
121 // Don't bother using generic instructions/registers for the exec mask.
122 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
123
124 Register SavedExec = MRI.createVirtualRegister(WaveRC);
125
126 // To insert the loop we need to split the block. Move everything before
127 // this point to a new block, and insert a new empty block before this
128 // instruction.
// NOTE(review): LoopBB/BodyBB creation and the MBBI iterator setup (original
// lines 129-133) are not visible in this excerpt.
131 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
132 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
134 ++MBBI;
135 MF.insert(MBBI, LoopBB);
136 MF.insert(MBBI, BodyBB);
137 MF.insert(MBBI, RestoreExecBB);
138 MF.insert(MBBI, RemainderBB);
139
140 LoopBB->addSuccessor(BodyBB);
141 BodyBB->addSuccessor(RestoreExecBB);
142 BodyBB->addSuccessor(LoopBB);
143
144 // Move the rest of the block into a new block.
146 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
147
148 MBB.addSuccessor(LoopBB);
149 RestoreExecBB->addSuccessor(RemainderBB);
150
151 B.setInsertPt(*LoopBB, LoopBB->end());
152
153 // +-MBB:------------+
154 // | ... |
155 // | %0 = G_INST_1 |
156 // | %Dst = MI %Vgpr |
157 // | %1 = G_INST_2 |
158 // | ... |
159 // +-----------------+
160 // ->
161 // +-MBB-------------------------------+
162 // | ... |
163 // | %0 = G_INST_1 |
164 // | %SaveExecReg = S_MOV_B32 $exec_lo |
165 // +----------------|------------------+
166 // | /------------------------------|
167 // V V |
168 // +-LoopBB---------------------------------------------------------------+ |
169 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
170 // | instead of executing for each lane, see if other lanes had | |
171 // | same value for %Vgpr and execute for them also. | |
172 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
173 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
174 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
175 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
176 // +----------------|-----------------------------------------------------+ |
177 // V |
178 // +-BodyBB------------------------------------------------------------+ |
179 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
180 // | executed only for active lanes and written to Dst | |
181 // | $exec = S_XOR_B32 $exec, %SavedExec | |
182 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
183 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
184 // | SI_WATERFALL_LOOP LoopBB |-----|
185 // +----------------|--------------------------------------------------+
186 // V
187 // +-RestoreExecBB--------------------------+
188 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
189 // +----------------|-----------------------+
190 // V
191 // +-RemainderBB:----------------------+
192 // | %1 = G_INST_2 |
193 // | ... |
194 // +---------------------------------- +
195
196 // Move the instruction into the loop body. Note we moved everything after
197 // Range.end() already into a new block, so Range.end() is no longer valid.
198 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
199
200 // Figure out the iterator range after splicing the instructions.
201 MachineBasicBlock::iterator NewBegin = BeginIt;
202 auto NewEnd = BodyBB->end();
203 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
204
205 B.setMBB(*LoopBB);
206 Register CondReg;
207
// For every use of a register flagged in WFI, emit a readfirstlane in LoopBB
// and accumulate an "all parts equal" condition into CondReg.
208 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
209 for (MachineOperand &Op : MI.all_uses()) {
210 Register OldReg = Op.getReg();
211 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
212 continue;
213
214 // See if we already processed this register in another instruction in
215 // the sequence.
216 auto OldVal = WaterfalledRegMap.find(OldReg);
217 if (OldVal != WaterfalledRegMap.end()) {
218 Op.setReg(OldVal->second);
219 continue;
220 }
221
222 Register OpReg = Op.getReg();
223 LLT OpTy = MRI.getType(OpReg);
224
225 // TODO: support for agpr
226 assert(MRI.getRegBank(OpReg) == VgprRB);
227 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
228 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
229
230 // Build the comparison(s), CurrentLaneReg == OpReg.
// Wide values are compared in 64-bit parts when evenly divisible, otherwise
// in 32-bit parts.
231 unsigned OpSize = OpTy.getSizeInBits();
232 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
233 LLT PartTy = LLT::scalar(PartSize);
234 unsigned NumParts = OpSize / PartSize;
// NOTE(review): the declaration of OpParts (original line 235) is not visible
// in this excerpt.
236 SmallVector<Register, 8> CurrentLaneParts;
237
238 if (NumParts == 1) {
239 OpParts.push_back(OpReg);
240 CurrentLaneParts.push_back(CurrentLaneReg);
241 } else {
242 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
243 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
244 for (unsigned i = 0; i < NumParts; ++i) {
245 OpParts.push_back(UnmergeOp.getReg(i));
246 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
247 }
248 }
249
250 for (unsigned i = 0; i < NumParts; ++i) {
251 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
252 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
253
254 if (!CondReg)
255 CondReg = CmpReg;
256 else
257 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
258 }
259
260 Op.setReg(CurrentLaneReg);
261
262 // Make sure we don't re-process this register again.
263 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
264 }
265 }
266
267 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
268 Register CondRegLM =
269 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
270 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
271
272 // Update EXEC, save the original EXEC value to SavedExec.
273 B.buildInstr(AndSaveExecOpc)
274 .addDef(SavedExec)
275 .addReg(CondRegLM, RegState::Kill);
276 MRI.setSimpleHint(SavedExec, CondRegLM);
277
278 B.setInsertPt(*BodyBB, BodyBB->end());
279
280 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
281 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
282
283 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
284 // s_cbranch_scc0?
285
286 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
287 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
288
289 // Save the EXEC mask before the loop.
290 B.setInsertPt(MBB, MBB.end());
291 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
292
293 // Restore the EXEC mask after the loop.
294 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
295 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
296
297 // Set the insert point after the original instruction, so any new
298 // instructions will be in the remainder.
299 B.setInsertPt(*RemainderBB, RemainderBB->begin());
300
301 return true;
302}
303
304bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
305 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
306 MachineFunction &MF = B.getMF();
307 assert(MI.getNumMemOperands() == 1);
308 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
309 Register Dst = MI.getOperand(0).getReg();
310 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
311 Register Base = MI.getOperand(1).getReg();
312 LLT PtrTy = MRI.getType(Base);
313 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
314 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
315 SmallVector<Register, 4> LoadPartRegs;
316
317 unsigned ByteOffset = 0;
318 for (LLT PartTy : LLTBreakdown) {
319 Register BasePlusOffset;
320 if (ByteOffset == 0) {
321 BasePlusOffset = Base;
322 } else {
323 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
324 BasePlusOffset =
325 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
326 }
327 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
328 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
329 LoadPartRegs.push_back(LoadPart.getReg(0));
330 ByteOffset += PartTy.getSizeInBytes();
331 }
332
333 if (!MergeTy.isValid()) {
334 // Loads are of same size, concat or merge them together.
335 B.buildMergeLikeInstr(Dst, LoadPartRegs);
336 } else {
337 // Loads are not all of same size, need to unmerge them to smaller pieces
338 // of MergeTy type, then merge pieces to Dst.
339 SmallVector<Register, 4> MergeTyParts;
340 for (Register Reg : LoadPartRegs) {
341 if (MRI.getType(Reg) == MergeTy) {
342 MergeTyParts.push_back(Reg);
343 } else {
344 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
345 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
346 MergeTyParts.push_back(Unmerge.getReg(i));
347 }
348 }
349 B.buildMergeLikeInstr(Dst, MergeTyParts);
350 }
351 MI.eraseFromParent();
352 return true;
353}
354
355bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
356 LLT MergeTy) {
357 MachineFunction &MF = B.getMF();
358 assert(MI.getNumMemOperands() == 1);
359 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
360 Register Dst = MI.getOperand(0).getReg();
361 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
362 Register Base = MI.getOperand(1).getReg();
363
364 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
365 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
366
367 if (WideTy.isScalar()) {
368 B.buildTrunc(Dst, WideLoad);
369 } else {
370 SmallVector<Register, 4> MergeTyParts;
371 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
372
373 LLT DstTy = MRI.getType(Dst);
374 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
375 for (unsigned i = 0; i < NumElts; ++i) {
376 MergeTyParts.push_back(Unmerge.getReg(i));
377 }
378 B.buildMergeLikeInstr(Dst, MergeTyParts);
379 }
380 MI.eraseFromParent();
381 return true;
382}
383
384bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
385 Register Dst = MI.getDstReg();
386 Register Ptr = MI.getPointerReg();
387 MachineMemOperand &MMO = MI.getMMO();
388 unsigned MemSize = 8 * MMO.getSize().getValue();
389
390 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
391
392 if (MI.getOpcode() == G_LOAD) {
393 B.buildLoad(Dst, Ptr, *WideMMO);
394 } else {
395 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
396
397 if (MI.getOpcode() == G_ZEXTLOAD) {
398 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
399 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
400 B.buildAnd(Dst, Load, MaskCst);
401 } else {
402 assert(MI.getOpcode() == G_SEXTLOAD);
403 B.buildSExtInReg(Dst, Load, MemSize);
404 }
405 }
406
407 MI.eraseFromParent();
408 return true;
409}
410
// Lower an extension (sext/zext/anyext) of a VCC (s1) value to a select
// between constants: -1/0 for sext, 1/0 for zext. S64 results are built from
// two 32-bit halves, with the high half chosen per extension kind.
411bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
412 Register Dst = MI.getOperand(0).getReg();
413 LLT Ty = MRI.getType(Dst);
414 Register Src = MI.getOperand(1).getReg();
415 unsigned Opc = MI.getOpcode();
// Value selected when the condition is true: all-ones for sign extension.
416 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
417 if (Ty == S32 || Ty == S16) {
418 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
419 auto False = B.buildConstant({VgprRB, Ty}, 0);
420 B.buildSelect(Dst, Src, True, False);
421 } else if (Ty == S64) {
422 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
423 auto False = B.buildConstant({VgprRB_S32}, 0);
424 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
425 MachineInstrBuilder Hi;
426 switch (Opc) {
427 case G_SEXT:
// Sign extension replicates the low half (-1 or 0) into the high half.
428 Hi = Lo;
429 break;
430 case G_ZEXT:
// Zero extension: high half is always zero.
431 Hi = False;
432 break;
433 case G_ANYEXT:
// Any-extension: high half is unspecified.
434 Hi = B.buildUndef({VgprRB_S32});
435 break;
436 default:
// NOTE(review): the reportGISelFailure( call head (original line 437) is not
// visible in this excerpt.
438 MF, MORE, "amdgpu-regbanklegalize",
439 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
440 return false;
441 }
442
443 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
444 } else {
// NOTE(review): the reportGISelFailure( call head (original line 445) is not
// visible in this excerpt.
446 MF, MORE, "amdgpu-regbanklegalize",
447 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
448 return false;
449 }
450
451 MI.eraseFromParent();
452 return true;
453}
454
455std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
456 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
457 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
458 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
459 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
460 return {Lo.getReg(0), Hi.getReg(0)};
461}
462
463std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
464 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
465 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
466 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
467 return {Lo.getReg(0), Hi.getReg(0)};
468}
469
470std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
471 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
472 auto Lo = PackedS32;
473 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
474 return {Lo.getReg(0), Hi.getReg(0)};
475}
476
477std::pair<Register, Register>
478RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
479 auto [Lo32, Hi32] = unpackAExt(Reg);
480 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
481 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
482}
483
// Lower a V2S16 shift by unpacking both operands to two S32 halves, shifting
// each half at 32 bits, and repacking with build_vector_trunc. The unpack
// extension matches the shift kind: any-extend for shl (upper bits discarded),
// zero-extend for lshr, sign-extend for ashr.
484bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
485 Register Lo, Hi;
486 switch (MI.getOpcode()) {
487 case AMDGPU::G_SHL: {
488 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
489 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
490 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
491 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
492 break;
493 }
494 case AMDGPU::G_LSHR: {
495 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
496 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
497 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
498 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
499 break;
500 }
501 case AMDGPU::G_ASHR: {
502 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
503 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
504 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
505 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
506 break;
507 }
508 default:
// NOTE(review): the reportGISelFailure( call head (original line 509) is not
// visible in this excerpt.
510 MF, MORE, "amdgpu-regbanklegalize",
511 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
512 MI);
513 return false;
514 }
// Pack the two S32 results back into the V2S16 destination.
515 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
516 MI.eraseFromParent();
517 return true;
518}
519
// Lower a V2S16 min/max by unpacking both operands to two S32 halves,
// performing the operation at 32 bits, and repacking with build_vector_trunc.
// Signed variants sign-extend the halves; unsigned variants zero-extend them,
// so the 32-bit comparison orders the values the same way 16 bits would.
520bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
521 Register Lo, Hi;
522 switch (MI.getOpcode()) {
523 case AMDGPU::G_SMIN:
524 case AMDGPU::G_SMAX: {
525 // For signed operations, use sign extension
526 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
527 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
528 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
529 .getReg(0);
530 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
531 .getReg(0);
532 break;
533 }
534 case AMDGPU::G_UMIN:
535 case AMDGPU::G_UMAX: {
536 // For unsigned operations, use zero extension
537 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
538 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
539 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
540 .getReg(0);
541 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
542 .getReg(0);
543 break;
544 }
545 default:
// NOTE(review): the reportGISelFailure( call head (original line 546) is not
// visible in this excerpt.
547 MF, MORE, "amdgpu-regbanklegalize",
548 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
549 return false;
550 }
// Pack the two S32 results back into the V2S16 destination.
551 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
552 MI.eraseFromParent();
553 return true;
554}
555
556bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
557 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
558 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
559 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
560 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
561 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
562 {ResLo.getReg(0), ResHi.getReg(0)});
563 MI.eraseFromParent();
564 return true;
565}
566
// If MI is an intrinsic, it is a signed BFE exactly when it is amdgcn_sbfe.
// NOTE(review): the function signature and the preceding dyn_cast<GIntrinsic>
// guard (original lines 566-568) are not visible in this excerpt.
569 return (GI->is(Intrinsic::amdgcn_sbfe));
570
// Otherwise, the generic opcode decides: G_SBFX is the signed form.
571 return MI.getOpcode() == AMDGPU::G_SBFX;
572}
573
574bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
575 Register Dst = MI.getOperand(0).getReg();
576 assert(MRI.getType(Dst) == LLT::scalar(64));
577 bool Signed = isSignedBFE(MI);
578 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
579 // Extract bitfield from Src, LSBit is the least-significant bit for the
580 // extraction (field offset) and Width is size of bitfield.
581 Register Src = MI.getOperand(FirstOpnd).getReg();
582 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
583 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
584 // Comments are for signed bitfield extract, similar for unsigned. x is sign
585 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
586
587 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
588 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
589 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
590
591 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
592
593 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
594 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
595 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
596 if (!ConstWidth) {
597 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
598 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
599 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
600 MI.eraseFromParent();
601 return true;
602 }
603
604 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
605 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
606 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
607 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
608 auto Zero = B.buildConstant({VgprRB, S32}, 0);
609 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
610
611 if (WidthImm <= 32) {
612 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
613 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
614 MachineInstrBuilder Hi;
615 if (Signed) {
616 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
617 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
618 } else {
619 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
620 Hi = Zero;
621 }
622 B.buildMergeLikeInstr(Dst, {Lo, Hi});
623 } else {
624 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
625 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
626 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
627 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
628 }
629
630 MI.eraseFromParent();
631 return true;
632}
633
634bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
635 Register DstReg = MI.getOperand(0).getReg();
636 LLT Ty = MRI.getType(DstReg);
637 bool Signed = isSignedBFE(MI);
638 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
639 Register Src = MI.getOperand(FirstOpnd).getReg();
640 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
641 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
642 // For uniform bit field extract there are 4 available instructions, but
643 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
644 // field offset in low and size in high 16 bits.
645
646 // Src1 Hi16|Lo16 = Size|FieldOffset
647 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
648 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
649 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
650 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
651 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
652 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
653 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
654
655 // Select machine instruction, because of reg class constraining, insert
656 // copies from reg class to reg bank.
657 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
658 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
659 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
660 *ST.getRegisterInfo(), RBI);
661
662 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
663 MI.eraseFromParent();
664 return true;
665}
666
667bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
668 Register Dst = MI.getOperand(0).getReg();
669 LLT DstTy = MRI.getType(Dst);
670 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
671 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
672 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
673 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
674 unsigned Opc = MI.getOpcode();
675 auto Flags = MI.getFlags();
676 auto Lo =
677 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
678 auto Hi =
679 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
680 B.buildMergeLikeInstr(Dst, {Lo, Hi});
681 MI.eraseFromParent();
682 return true;
683}
684
685bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
686 Register Dst = MI.getOperand(0).getReg();
687 assert(MRI.getType(Dst) == S64);
688 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
689 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
690
691 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
692 // match GlobalISel with old regbankselect.
693 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
694 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
695 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
696 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
697 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
698 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
699
700 B.buildMergeLikeInstr(Dst, {Lo, Hi});
701 MI.eraseFromParent();
702 return true;
703}
704
705bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
706 Register Dst = MI.getOperand(0).getReg();
707 assert(MRI.getType(Dst) == V2S16);
708 unsigned Opc = MI.getOpcode();
709 unsigned NumOps = MI.getNumOperands();
710 auto Flags = MI.getFlags();
711
712 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
713
714 if (NumOps == 2) {
715 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
716 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
717 B.buildMergeLikeInstr(Dst, {Lo, Hi});
718 MI.eraseFromParent();
719 return true;
720 }
721
722 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
723
724 if (NumOps == 3) {
725 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
726 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
727 B.buildMergeLikeInstr(Dst, {Lo, Hi});
728 MI.eraseFromParent();
729 return true;
730 }
731
732 assert(NumOps == 4);
733 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
734 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
735 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
736 B.buildMergeLikeInstr(Dst, {Lo, Hi});
737 MI.eraseFromParent();
738 return true;
739}
740
741bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
742 Register Dst0 = MI.getOperand(0).getReg();
743 Register Dst1 = MI.getOperand(1).getReg();
744 Register Src0 = MI.getOperand(2).getReg();
745 Register Src1 = MI.getOperand(3).getReg();
746 Register Src2 = MI.getOperand(4).getReg();
747
748 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
749
750 // Keep the multiplication on the SALU.
751 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
752 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
753 if (ST.hasScalarMulHiInsts()) {
754 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
755 } else {
756 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
757 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
758 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
759 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
760 }
761
762 // Accumulate and produce the "carry-out" bit.
763
764 // The "carry-out" is defined as bit 64 of the result when computed as a
765 // big integer. For unsigned multiply-add, this matches the usual
766 // definition of carry-out.
767 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
768 // No accumulate: result is just the multiplication, carry is 0.
769 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
770 B.buildConstant(Dst1, 0);
771 } else {
772 // Accumulate: add Src2 to the multiplication result with carry chain.
773 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
774 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
775 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
776
777 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
778 auto AddHi =
779 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
780 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
781 B.buildCopy(Dst1, AddHi.getReg(1));
782 }
783
784 MI.eraseFromParent();
785 return true;
786}
787
788bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
789 Register Dst = MI.getOperand(0).getReg();
790 LLT DstTy = MRI.getType(Dst);
791 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
792 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
793 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
794 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
795 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
796 Register Cond = MI.getOperand(1).getReg();
797 auto Flags = MI.getFlags();
798 auto Lo =
799 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
800 auto Hi =
801 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
802
803 B.buildMergeLikeInstr(Dst, {Lo, Hi});
804 MI.eraseFromParent();
805 return true;
806}
807
808bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
809 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
810 int Amt = MI.getOperand(2).getImm();
811 Register Lo, Hi;
812 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
813 if (Amt <= 32) {
814 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
815 if (Amt == 32) {
816 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
817 Lo = Freeze.getReg(0);
818 } else {
819 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
820 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
821 }
822
823 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
824 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
825 } else {
826 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
827 Lo = Op1.getReg(0);
828 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
829 }
830
831 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
832 MI.eraseFromParent();
833 return true;
834}
835
836bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
837 // Split 64-bit find-first-bit operations into 32-bit halves:
838 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
839 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
840 // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
841 // (cttz_zero_undef hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
842 unsigned Opc = MI.getOpcode();
843
844 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
845 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_undef), so plain add
846 // is fine.
847 unsigned FFBOpc;
848 unsigned AddOpc;
849 bool SearchFromMSB;
850 switch (Opc) {
851 case AMDGPU::G_AMDGPU_FFBH_U32:
852 FFBOpc = Opc;
853 AddOpc = AMDGPU::G_UADDSAT;
854 SearchFromMSB = true;
855 break;
856 case AMDGPU::G_AMDGPU_FFBL_B32:
857 FFBOpc = Opc;
858 AddOpc = AMDGPU::G_UADDSAT;
859 SearchFromMSB = false;
860 break;
861 case AMDGPU::G_CTLZ_ZERO_UNDEF:
862 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
863 AddOpc = AMDGPU::G_ADD;
864 SearchFromMSB = true;
865 break;
866 case AMDGPU::G_CTTZ_ZERO_UNDEF:
867 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
868 AddOpc = AMDGPU::G_ADD;
869 SearchFromMSB = false;
870 break;
871 default:
872 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
873 }
874
875 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
876 Register Lo = Unmerge.getReg(0);
877 Register Hi = Unmerge.getReg(1);
878
879 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
880 // lo first. The secondary half adds 32 to account for the primary half's
881 // width.
882 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
883 auto Secondary =
884 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
885
886 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
887 {Secondary, B.buildConstant(VgprRB_S32, 32)});
888 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
889
890 MI.eraseFromParent();
891 return true;
892}
893
894bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
895 // Lower extract vector element to a compare-select chain:
896 // result = elt[0]
897 // for i in 1..N-1:
898 // result = (idx == i) ? elt[i] : result
899 //
900 // When the index is divergent, each lane may want a different element, so
901 // we must check every element per lane.
902 Register Dst = MI.getOperand(0).getReg();
903 Register Src = MI.getOperand(1).getReg();
904 Register Idx = MI.getOperand(2).getReg();
905
906 LLT VecTy = MRI.getType(Src);
907 LLT ScalarTy = VecTy.getScalarType();
908 unsigned NumElts = VecTy.getNumElements();
909 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
910
911 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
912
913 if (ScalarTy.getSizeInBits() == 32) {
914 Register PrevSelect = Unmerge.getReg(0);
915 for (unsigned I = 1; I < NumElts; ++I) {
916 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
917 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
918 PrevSelect =
919 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(I), PrevSelect)
920 .getReg(0);
921 }
922 B.buildCopy(Dst, PrevSelect);
923 } else if (ScalarTy.getSizeInBits() == 64) {
924 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
925 Register PrevLo = InitUnmerge.getReg(0);
926 Register PrevHi = InitUnmerge.getReg(1);
927 for (unsigned I = 1; I < NumElts; ++I) {
928 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
929 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
930 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(I));
931 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
932 .getReg(0);
933 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
934 .getReg(0);
935 }
936 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
937 } else {
939 MF, MORE, "amdgpu-regbanklegalize",
940 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
941 return false;
942 }
943
944 MI.eraseFromParent();
945 return true;
946}
947
948bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
949 // Reduce a 64-bit element extract to two 32-bit extracts:
950 // vec32 = bitcast <N x s64> to <2N x s32>
951 // lo = vec32[idx * 2]
952 // hi = vec32[idx * 2 + 1]
953 // result = merge(lo, hi)
954 //
955 // When the index is uniform, all lanes extract the same element, so we can
956 // just split the s64 extract into two s32 extracts which lower to MOVREL.
957 Register Dst = MI.getOperand(0).getReg();
958 Register Src = MI.getOperand(1).getReg();
959 Register Idx = MI.getOperand(2).getReg();
960
961 LLT SrcTy = MRI.getType(Src);
962 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
963
964 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
965 "expected VGPR src and SGPR idx");
966
967 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
968
969 // Calculate new Lo and Hi indices
970 auto One = B.buildConstant(SgprRB_S32, 1);
971 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
972 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
973
974 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
975 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
976
977 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
978
979 MI.eraseFromParent();
980 return true;
981}
982
983bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &MI) {
984 // Lower insert vector element to a compare-select chain:
985 // for i in 0..N-1:
986 // result[i] = (idx == i) ? elt : srcVec[i]
987 // dst = merge(result[0..N-1])
988 //
989 // VGPR B64 requires splitting to lo/hi s32 pairs since there is no
990 // v_cndmask_b64. SGPR B64/B32 and VGPR B32 can be handled natively.
991 Register Dst = MI.getOperand(0).getReg();
992 Register Src = MI.getOperand(1).getReg();
993 Register Elt = MI.getOperand(2).getReg();
994 Register Idx = MI.getOperand(3).getReg();
995
996 LLT VecTy = MRI.getType(Src);
997 LLT ScalarTy = VecTy.getScalarType();
998 unsigned NumElts = VecTy.getNumElements();
999 const RegisterBank *SrcRB = MRI.getRegBank(Src);
1000 bool IsSGPR = (SrcRB == SgprRB);
1001 SmallVector<Register, 16> Selects;
1002
1003 if (!IsSGPR && ScalarTy.getSizeInBits() == 64) {
1004 // VGPR B64: split to 32-bit lo/hi since there is no v_cndmask_b64.
1005 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1006 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1007 Register EltLo = EltUnmerge.getReg(0);
1008 Register EltHi = EltUnmerge.getReg(1);
1009 for (unsigned I = 0; I < NumElts; ++I) {
1010 auto IdxConst = B.buildConstant(VgprRB_S32, I);
1011 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1012 Selects.push_back(
1013 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 * I))
1014 .getReg(0));
1015 Selects.push_back(
1016 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 * I + 1))
1017 .getReg(0));
1018 }
1019 LLT Vec32Ty = LLT::fixed_vector(2 * NumElts, 32);
1020 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1021 B.buildBitcast(Dst, Vec32);
1022 } else if (ScalarTy.getSizeInBits() == 32 || ScalarTy.getSizeInBits() == 64) {
1023 // B32 (any bank) and SGPR B64: element-wise select at native width.
1024 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1025 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1026 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1027 for (unsigned I = 0; I < NumElts; ++I) {
1028 auto IdxConst = B.buildConstant(SgprRB_S32, I);
1029 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CmpTy, Idx, IdxConst);
1030 Selects.push_back(
1031 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(I)).getReg(0));
1032 }
1033 B.buildMergeLikeInstr(Dst, Selects);
1034 } else {
1036 MF, MORE, "amdgpu-regbanklegalize",
1037 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type", MI);
1038 return false;
1039 }
1040
1041 MI.eraseFromParent();
1042 return true;
1043}
1044
1045bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &MI) {
1046 // Reduce a 64-bit element insert to two 32-bit inserts:
1047 // vec32 = bitcast <N x s64> to <2N x s32>
1048 // lo, hi = unmerge elt
1049 // vec32[idx * 2] = lo
1050 // vec32[idx * 2 + 1] = hi
1051 // dst = bitcast <2N x s32> to <N x s64>
1052 //
1053 // When the index is uniform, all lanes insert at the same position, so we
1054 // can split the s64 insert into two s32 inserts which lower to MOVREL/GPRIDX.
1055 Register Dst = MI.getOperand(0).getReg();
1056 Register Src = MI.getOperand(1).getReg();
1057 Register Elt = MI.getOperand(2).getReg();
1058 Register Idx = MI.getOperand(3).getReg();
1059
1060 LLT SrcTy = MRI.getType(Src);
1061 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
1062
1063 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1064 "expected VGPR src and SGPR idx");
1065
1066 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1067
1068 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1069 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1070
1071 // Calculate new Lo and Hi indices
1072 auto One = B.buildConstant(SgprRB_S32, 1);
1073 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1074 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1075
1076 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1077 EltUnmerge.getReg(0), IdxLo);
1078 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1079 EltUnmerge.getReg(1), IdxHi);
1080
1081 B.buildBitcast(Dst, InsHi);
1082
1083 MI.eraseFromParent();
1084 return true;
1085}
1086
1087bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &MI) {
1088 // Lower divergent G_ABS to smax(x, 0 - x) in the VGPR bank:
1089 // zero = 0
1090 // neg = G_SUB zero, x
1091 // dst = G_SMAX x, neg
1092 //
1093 // There is no integer v_abs instruction on AMDGPU, so divergent G_ABS is
1094 // expanded to this sub/smax pair.
1095 Register DstReg = MI.getOperand(0).getReg();
1096 Register SrcReg = MI.getOperand(1).getReg();
1097 LLT Ty = MRI.getType(DstReg);
1098
1099 Register Zero;
1100 if (Ty == V2S16) {
1101 // buildConstant cannot produce a V2S16 directly; pack two S16 zeros.
1102 Register Zero16 = B.buildConstant({VgprRB, S16}, 0).getReg(0);
1103 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).getReg(0);
1104 } else {
1105 assert((Ty == S32 || Ty == S16) && "unexpected type for AbsToNegMax");
1106 Zero = B.buildConstant({VgprRB, Ty}, 0).getReg(0);
1107 }
1108
1109 auto Neg = B.buildSub({VgprRB, Ty}, Zero, SrcReg);
1110 B.buildSMax(DstReg, SrcReg, Neg);
1111 MI.eraseFromParent();
1112 return true;
1113}
1114
1115bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &MI) {
1116 // Lower uniform V2S16 abs by unpacking the values to two separate SGPR
1117 // registers and re-emitting G_ABS on each:
1118 // packed = bitcast <2 x s16> src to s32
1119 // lo = sext_inreg packed, 16
1120 // hi = ashr packed, 16
1121 // dst = build_vector_trunc G_ABS(lo), G_ABS(hi)
1122 //
1123 // SALU only has s_abs_i32, with no direct uniform V2S16 abs. The
1124 // re-emitted G_ABS(SgprRB, S32) selects to s_abs_i32 on each value.
1125 auto Bitcast = B.buildBitcast({SgprRB_S32}, MI.getOperand(1).getReg());
1126 auto SextInReg = B.buildSExtInReg({SgprRB_S32}, Bitcast, 16);
1127 auto ShiftHi =
1128 B.buildAShr({SgprRB_S32}, Bitcast, B.buildConstant({SgprRB_S32}, 16));
1129
1130 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1131 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1132 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
1133 {AbsLo.getReg(0), AbsHi.getReg(0)});
1134
1135 MI.eraseFromParent();
1136 return true;
1137}
1138
// Central dispatch for lowering a matched mapping rule. Forwards most IDs to
// a dedicated lower* helper; small expansions are emitted inline. Returns
// false after reporting a fatal legalization failure. If the matched rule
// collected waterfall operands, the rewritten MI is wrapped in a waterfall
// loop at the end.
//
// NOTE(review): several source lines (hyperlinked identifiers) were dropped
// from this rendering of the file — the gaps are flagged inline below and
// must be restored from the upstream file before this compiles.
bool RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  WaterfallInfo &WFI) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
    // We are making select here. S1 cond was already 'any-extended to S32' +
    // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return true;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  case ScalarizeToS16:
    return lowerSplitTo16(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
                         MI);
      return false;
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return true;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return true;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take lowest bit from each lane and put it in lane mask.
    // Lowering via compare, but we need to clean high bits first as compare
    // compares all bits in register.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return true;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case UniMAD64:
    return lowerUniMAD64(MI);
  case UniMul64: {
    B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
    MI.eraseFromParent();
    return true;
  }
  case DivSMulToMAD: {
    auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
    auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
    auto Zero = B.buildConstant({VgprRB, S64}, 0);

    unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;

    B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
                 {Op1, Op2, Zero});
    MI.eraseFromParent();
    return true;
  }
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Mul:
    return lowerSplitTo32Mul(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                           "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
                           MI);
        return false;
      }
    }
    // 64 and 32 bit load
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
                         MI);
      return false;
    }
    return true;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
                         MI);
      return false;
    }
    return true;
  }
  case UnpackAExt:
    return lowerUnpackAExt(MI);
  case WidenMMOToS32:
    return widenMMOToS32(cast<GAnyLoad>(MI));
  case VerifyAllSgpr: {
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
  case ApplyAllVgpr: {
    assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
    }));
    B.setInstrAndDebugLoc(MI);
    // Copy any non-VGPR use into a fresh VGPR so every operand ends up on
    // the VGPR bank.
    for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
      MachineOperand &Op = MI.getOperand(i);
      if (!Op.isReg())
        continue;
      Register Reg = Op.getReg();
      if (MRI.getRegBank(Reg) != VgprRB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        Op.setReg(Copy.getReg(0));
      }
    }
    return true;
  }
  case UnmergeToShiftTrunc: {
    GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
    LLT Ty = MRI.getType(Unmerge->getSourceReg());
    if (Ty.getSizeInBits() % 32 != 0) {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: unmerge not multiple of 32",
                         MI);
      return false;
    }

    B.setInstrAndDebugLoc(MI);
    if (Ty.getSizeInBits() > 32) {
      // Unmerge to V2S16 chunks first, then unpack each chunk to two S32
      // values and truncate into the original defs.
      auto UnmergeV2S16 =
          B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
      for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
        auto [Dst0S32, Dst1S32] =
            unpackAExt(UnmergeV2S16->getOperand(i).getReg());
        B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
        B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
      }
    } else {
      auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
      B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
      B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
    }

    MI.eraseFromParent();
    return true;
  }
  // NOTE(review): a case label was lost from this copy of the file here; the
  // body below rewrites a uniform small-scalar G_PHI to S32 (trunc of the
  // def, any-extend of each incoming value). Restore the label from upstream.
    Register Dst = MI.getOperand(0).getReg();
    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    // The trunc back to the original type must go after all PHIs.
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    // PHI operands come in (reg, MBB) pairs starting at operand 1; extend
    // each incoming register right after its definition.
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }
    break;
  }
  case VerifyAllSgprGPHI: {
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      if (Op.isMBB())
        return true;
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
  // NOTE(review): a case label was lost from this copy of the file here
  // (verification of a VGPR-def G_PHI); restore it from upstream.
    assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      if (Op.isMBB())
        return true;
      const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
      return RB == VgprRB || RB == SgprRB;
    }));
    return true;
  }
  case ApplyINTRIN_IMAGE: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin =
    // NOTE(review): the initializer line (a lookup of the rsrc intrinsic
    // info for this MI) was lost from this copy; restore it from upstream.
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // The reported argument index is relative to the IR intrinsic call
    // arguments, so shift by the number of defs and the intrinsic ID.
    unsigned RsrcIdx = RSrcIntrin->RsrcArg + MI.getNumExplicitDefs() + 1;
    return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
  }
  // NOTE(review): a case label was lost from this copy of the file here (the
  // BVH-intersect variant); restore it from upstream.
    // Rsrc is the last register operand. Base BVH trails an A16 immediate
    // after rsrc; dual/BVH8 do not. Scan backwards for the last virtual
    // register.
    unsigned RsrcIdx = MI.getNumOperands();
    while (RsrcIdx-- > MI.getNumExplicitDefs()) {
      const MachineOperand &Op = MI.getOperand(RsrcIdx);
      if (Op.isReg() && Op.getReg().isVirtual())
        break;
    }
    return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
  }
  // NOTE(review): a case label was lost from this copy of the file here;
  // restore it from upstream.
    return lowerSplitBitCount64To32(MI);
  case ExtrVecEltToSel:
    return lowerExtrVecEltToSel(MI);
  case ExtrVecEltTo32:
    return lowerExtrVecEltTo32(MI);
  case InsVecEltToSel:
    return lowerInsVecEltToSel(MI);
  case InsVecEltTo32:
    return lowerInsVecEltTo32(MI);
  case AbsToNegMax:
    return lowerAbsToNegMax(MI);
  case AbsToS32:
    return lowerAbsToS32(MI);
  }

  // Rules that fell through (DoNotLower and the PHI rewrite) may still have
  // collected operands that require a waterfall loop.
  if (!WFI.SgprWaterfallOperandRegs.empty()) {
    if (!executeInWaterfallLoop(B, WFI))
      return false;
  }
  return true;
}
1452
// Return the concrete LLT implied by a fixed-type mapping ID (scalar,
// pointer, or fixed vector), or an invalid (default-constructed) LLT for IDs
// without a fixed type — B-type IDs are resolved by getBTyFromID instead.
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  // NOTE(review): a case label was lost from this copy of the file here;
  // restore it from upstream.
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Sgpr32ToVgprDst:
  case Vgpr32:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
  case UniInVgprS64:
  case Sgpr64ToVgprDst:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case SgprP0:
  case SgprP0Call_WF:
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP2:
  case VgprP2:
    return LLT::pointer(2, 32);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case SgprP4Call_WF:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprP8:
    return LLT::pointer(8, 128);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
  case UniInVgprV2S32:
    return LLT::fixed_vector(2, 32);
  case VgprV3S32:
    return LLT::fixed_vector(3, 32);
  case VgprV4S16:
    return LLT::fixed_vector(4, 16);
  case SgprV4S32:
  case SgprV4S32_WF:
  // NOTE(review): a case label was lost from this copy of the file here;
  // restore it from upstream.
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  case VgprV8S32:
    return LLT::fixed_vector(8, 32);
  case VgprV2S64:
  case UniInVgprV2S64:
    return LLT::fixed_vector(2, 64);
  case VgprV6S32:
    return LLT::fixed_vector(6, 32);
  case VgprV32S16:
    return LLT::fixed_vector(32, 16);
  case VgprV32S32:
    return LLT::fixed_vector(32, 32);
  default:
    return LLT();
  }
}
1539
// For B-type (size-only) mapping IDs, validate that Ty is one of the types
// allowed at that bit width and return it unchanged; return an invalid LLT
// if Ty does not fit the ID.
LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case SgprB32_M0:
  // NOTE(review): a case label was lost from this copy of the file here;
  // restore it from upstream.
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  // NOTE(review): a case label was lost from this copy of the file here;
  // restore it from upstream.
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
        isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case VgprB160:
  case UniInVgprB160:
    if (Ty.getSizeInBits() == 160)
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  case SgprBRC: {
    // Any width that has an SGPR register class is acceptable.
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    unsigned LLTSize = Ty.getSizeInBits();
    if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
      return Ty;
    return LLT();
  }
  case VgprBRC: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
      return Ty;
    return LLT();
  }
  default:
    return LLT();
  }
}
1621
// Map a mapping ID to the register bank it assigns (VCC, SGPR, AGPR or
// VGPR), or nullptr for IDs that do not name a bank.
const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP0:
  case SgprP0Call_WF:
  case SgprP1:
  case SgprP2:
  case SgprP3:
  case SgprP4:
  case SgprP4Call_WF:
  case SgprP5:
  case SgprP8:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  // NOTE(review): a case label was lost from this copy of the file here;
  // restore it from upstream.
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case SgprBRC:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprS64:
  case UniInVgprV2S16:
  case UniInVgprV2S32:
  case UniInVgprV4S32:
  case UniInVgprV2S64:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB160:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  // NOTE(review): a case label was lost from this copy of the file here;
  // restore it from upstream.
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case AgprAnyTy:
    return AgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP2:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV2S64:
  case VgprV3S32:
  case VgprV4S16:
  case VgprV4S32:
  case VgprV6S32:
  case VgprV8S32:
  case VgprV32S16:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB160:
  case VgprB256:
  case VgprB512:
  case VgprBRC:
  case VgprAnyTy:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
  case Sgpr32ToVgprDst:
  case Sgpr64ToVgprDst:
    return VgprRB;
  default:
    return nullptr;
  }
}
1720
1721bool RegBankLegalizeHelper::applyMappingDst(
1722 MachineInstr &MI, unsigned &OpIdx,
1723 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1724 // Defs start from operand 0
1725 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1726 if (MethodIDs[OpIdx] == None)
1727 continue;
1728 MachineOperand &Op = MI.getOperand(OpIdx);
1729 Register Reg = Op.getReg();
1730 LLT Ty = MRI.getType(Reg);
1731 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1732
1733 switch (MethodIDs[OpIdx]) {
1734 // vcc, sgpr and vgpr scalars, pointers and vectors
1735 case Vcc:
1736 case Sgpr16:
1737 case Sgpr32:
1738 case Sgpr64:
1739 case Sgpr128:
1740 case SgprP0:
1741 case SgprP1:
1742 case SgprP3:
1743 case SgprP4:
1744 case SgprP5:
1745 case SgprP8:
1746 case SgprV2S16:
1747 case SgprV2S32:
1748 case SgprV4S32:
1749 case Vgpr16:
1750 case Vgpr32:
1751 case Vgpr64:
1752 case Vgpr128:
1753 case VgprP0:
1754 case VgprP1:
1755 case VgprP2:
1756 case VgprP3:
1757 case VgprP4:
1758 case VgprP5:
1759 case VgprV2S16:
1760 case VgprV2S32:
1761 case VgprV2S64:
1762 case VgprV3S32:
1763 case VgprV4S16:
1764 case VgprV4S32:
1765 case VgprV6S32:
1766 case VgprV8S32:
1767 case VgprV32S16: {
1768 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1769 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1770 break;
1771 }
1772 // sgpr and vgpr B-types
1773 case SgprB32:
1774 case SgprB64:
1775 case SgprB96:
1776 case SgprB128:
1777 case SgprB256:
1778 case SgprB512:
1779 case SgprBRC:
1780 case SgprPtr32:
1781 case SgprPtr64:
1782 case SgprPtr128:
1783 case VgprB32:
1784 case VgprB64:
1785 case VgprB96:
1786 case VgprB128:
1787 case VgprB160:
1788 case VgprB256:
1789 case VgprB512:
1790 case VgprBRC:
1791 case VgprPtr32:
1792 case VgprPtr64:
1793 case VgprPtr128: {
1794 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1795 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1796 break;
1797 }
1798 case VgprAnyTy: {
1799 assert(RB == VgprRB);
1800 break;
1801 }
1802 case AgprAnyTy: {
1803 if (RB == AgprRB)
1804 break;
1805 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
1806 Op.setReg(NewAgprDst);
1807 if (!MRI.use_nodbg_empty(Reg))
1808 B.buildCopy(Reg, NewAgprDst);
1809 break;
1810 }
1811 // uniform in vcc/vgpr: scalars, vectors and B-types
1812 case UniInVcc: {
1813 assert(Ty == S1);
1814 assert(RB == SgprRB);
1815 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1816 Op.setReg(NewDst);
1817 if (!MRI.use_empty(Reg)) {
1818 auto CopyS32_Vcc =
1819 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1820 B.buildTrunc(Reg, CopyS32_Vcc);
1821 }
1822 break;
1823 }
1824 case UniInVgprS16: {
1825 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1826 assert(RB == SgprRB);
1827 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1828 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1829 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1830 Op.setReg(NewVgprDstS16);
1831 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1832 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1833 B.buildTrunc(Reg, NewSgprDstS32);
1834 break;
1835 }
1836 case UniInVgprS32:
1837 case UniInVgprS64:
1838 case UniInVgprV2S16:
1839 case UniInVgprV2S32:
1840 case UniInVgprV4S32:
1841 case UniInVgprV2S64: {
1842 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1843 assert(RB == SgprRB);
1844 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1845 Op.setReg(NewVgprDst);
1846 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1847 break;
1848 }
1849 case UniInVgprB32:
1850 case UniInVgprB64:
1851 case UniInVgprB96:
1852 case UniInVgprB128:
1853 case UniInVgprB160:
1854 case UniInVgprB256:
1855 case UniInVgprB512: {
1856 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1857 assert(RB == SgprRB);
1858 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1859 Op.setReg(NewVgprDst);
1860 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1861 break;
1862 }
1863 // sgpr trunc
1864 case Sgpr32Trunc: {
1865 assert(Ty.getSizeInBits() < 32);
1866 assert(RB == SgprRB);
1867 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1868 Op.setReg(NewDst);
1869 if (!MRI.use_empty(Reg))
1870 B.buildTrunc(Reg, NewDst);
1871 break;
1872 }
1873 case Sgpr32ToVgprDst:
1874 case Sgpr64ToVgprDst: {
1875 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1876 assert(RB == VgprRB);
1877 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1878 B.buildCopy(Reg, Op.getReg());
1879 break;
1880 }
1881 case InvalidMapping: {
1883 MF, MORE, "amdgpu-regbanklegalize",
1884 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
1885 return false;
1886 }
1887 default:
1889 MF, MORE, "amdgpu-regbanklegalize",
1890 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
1891 return false;
1892 }
1893 }
1894
1895 return true;
1896}
1897
1898bool RegBankLegalizeHelper::applyMappingSrc(
1899 MachineInstr &MI, unsigned &OpIdx,
1900 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1901 WaterfallInfo &WFI) {
1902 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
1903 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1904 continue;
1905
1906 MachineOperand &Op = MI.getOperand(OpIdx);
1907 Register Reg = Op.getReg();
1908 LLT Ty = MRI.getType(Reg);
1909 const RegisterBank *RB = MRI.getRegBank(Reg);
1910
1911 switch (MethodIDs[i]) {
1912 case Vcc: {
1913 assert(Ty == S1);
1914 assert(RB == VccRB || RB == SgprRB);
1915 if (RB == SgprRB) {
1916 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1917 auto CopyVcc_Scc =
1918 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1919 Op.setReg(CopyVcc_Scc.getReg(0));
1920 }
1921 break;
1922 }
1923 // sgpr scalars, pointers and vectors
1924 case Sgpr16:
1925 case Sgpr32:
1926 case Sgpr64:
1927 case Sgpr128:
1928 case SgprP0:
1929 case SgprP1:
1930 case SgprP3:
1931 case SgprP4:
1932 case SgprP5:
1933 case SgprP8:
1934 case SgprV2S16:
1935 case SgprV2S32:
1936 case SgprV4S32: {
1937 assert(Ty == getTyFromID(MethodIDs[i]));
1938 assert(RB == getRegBankFromID(MethodIDs[i]));
1939 break;
1940 }
1941 // sgpr B-types
1942 case SgprB32:
1943 case SgprB64:
1944 case SgprB96:
1945 case SgprB128:
1946 case SgprB256:
1947 case SgprB512:
1948 case SgprBRC:
1949 case SgprPtr32:
1950 case SgprPtr64:
1951 case SgprPtr128: {
1952 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1953 assert(RB == getRegBankFromID(MethodIDs[i]));
1954 break;
1955 }
1956 // vgpr scalars, pointers and vectors
1957 case Vgpr16:
1958 case Vgpr32:
1959 case Vgpr64:
1960 case Vgpr128:
1961 case VgprP0:
1962 case VgprP1:
1963 case VgprP2:
1964 case VgprP3:
1965 case VgprP4:
1966 case VgprP5:
1967 case VgprV2S16:
1968 case VgprV2S32:
1969 case VgprV2S64:
1970 case VgprV3S32:
1971 case VgprV4S16:
1972 case VgprV4S32:
1973 case VgprV6S32:
1974 case VgprV8S32:
1975 case VgprV32S16:
1976 case VgprV32S32: {
1977 assert(Ty == getTyFromID(MethodIDs[i]));
1978 if (RB != VgprRB) {
1979 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1980 Op.setReg(CopyToVgpr.getReg(0));
1981 }
1982 break;
1983 }
1984 // vgpr B-types
1985 case VgprB32:
1986 case VgprB64:
1987 case VgprB96:
1988 case VgprB128:
1989 case VgprB160:
1990 case VgprB256:
1991 case VgprB512:
1992 case VgprBRC:
1993 case VgprPtr32:
1994 case VgprPtr64:
1995 case VgprPtr128: {
1996 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1997 if (RB != VgprRB) {
1998 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1999 Op.setReg(CopyToVgpr.getReg(0));
2000 }
2001 break;
2002 }
2003 case VgprAnyTy: {
2004 if (RB != VgprRB) {
2005 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2006 Op.setReg(CopyToVgpr.getReg(0));
2007 }
2008 break;
2009 }
2010 case AgprAnyTy: {
2011 if (RB != AgprRB) {
2012 auto CopyToAgpr = B.buildCopy({AgprRB, Ty}, Reg);
2013 Op.setReg(CopyToAgpr.getReg(0));
2014 }
2015 break;
2016 }
2017 // sgpr waterfall, scalars, and vectors
2018 case Sgpr32_WF:
2019 case SgprV4S32_WF: {
2020 assert(Ty == getTyFromID(MethodIDs[i]));
2021 if (RB != SgprRB) {
2022 WFI.SgprWaterfallOperandRegs.insert(Reg);
2023 if (!WFI.Start.isValid()) {
2024 WFI.Start = MI.getIterator();
2025 WFI.End = std::next(MI.getIterator());
2026 }
2027 }
2028 break;
2029 }
2030 case SgprP0Call_WF:
2031 case SgprP4Call_WF: {
2032 assert(Ty == getTyFromID(MethodIDs[i]));
2033 if (RB != SgprRB) {
2034 WFI.SgprWaterfallOperandRegs.insert(Reg);
2035
2036 // Find the ADJCALLSTACKUP before the call.
2037 MachineBasicBlock::iterator Start = MI.getIterator();
2038 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2039 --Start;
2040
2041 // Find the ADJCALLSTACKDOWN after the call (include it in range).
2042 MachineBasicBlock::iterator End = MI.getIterator();
2043 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2044 ++End;
2045 ++End;
2046
2047 B.setInsertPt(*MI.getParent(), Start);
2048 WFI.Start = Start;
2049 WFI.End = End;
2050 }
2051 break;
2052 }
2053 case SgprB32_M0:
2055 case SgprB64_ReadFirstLane: {
2056 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2057 if (RB == SgprRB)
2058 break;
2059 assert(RB == VgprRB);
2060 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2061 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2062 Op.setReg(NewSGPR);
2063 break;
2064 }
2066 assert(Ty == getTyFromID(MethodIDs[i]));
2067 if (RB == SgprRB)
2068 break;
2069 assert(RB == VgprRB);
2070 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2071 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2072 Op.setReg(NewSGPR);
2073 break;
2074 }
2075 // sgpr and vgpr scalars with extend
2076 case Sgpr32AExt: {
2077 // Note: this ext allows S1, and it is meant to be combined away.
2078 assert(Ty.getSizeInBits() < 32);
2079 assert(RB == SgprRB);
2080 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2081 Op.setReg(Aext.getReg(0));
2082 break;
2083 }
2084 case Sgpr32AExtBoolInReg: {
2085 // Note: this ext allows S1, and it is meant to be combined away.
2086 assert(Ty.getSizeInBits() == 1);
2087 assert(RB == SgprRB);
2088 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2089 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
2090 // most of times meant to be combined away in AMDGPURegBankCombiner.
2091 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2092 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2093 Op.setReg(BoolInReg.getReg(0));
2094 break;
2095 }
2096 case Sgpr32SExt: {
2097 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2098 assert(RB == SgprRB);
2099 auto Sext = B.buildSExt(SgprRB_S32, Reg);
2100 Op.setReg(Sext.getReg(0));
2101 break;
2102 }
2103 case Sgpr32ZExt: {
2104 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2105 assert(RB == SgprRB);
2106 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
2107 Op.setReg(Zext.getReg(0));
2108 break;
2109 }
2110 case Vgpr32AExt: {
2111 assert(Ty.getSizeInBits() < 32);
2112 assert(RB == VgprRB);
2113 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
2114 Op.setReg(Aext.getReg(0));
2115 break;
2116 }
2117 case Vgpr32SExt: {
2118 // Note this ext allows S1, and it is meant to be combined away.
2119 assert(Ty.getSizeInBits() < 32);
2120 assert(RB == VgprRB);
2121 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
2122 Op.setReg(Sext.getReg(0));
2123 break;
2124 }
2125 case Vgpr32ZExt: {
2126 // Note this ext allows S1, and it is meant to be combined away.
2127 assert(Ty.getSizeInBits() < 32);
2128 assert(RB == VgprRB);
2129 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
2130 Op.setReg(Zext.getReg(0));
2131 break;
2132 }
2133 default:
2135 MF, MORE, "amdgpu-regbanklegalize",
2136 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
2137 return false;
2138 }
2139 }
2140 return true;
2141}
2142
2143[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
2144 const RegisterBank *RB,
2146 unsigned StartOpIdx,
2147 unsigned EndOpIdx) {
2148 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2149 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
2150 return false;
2151 }
2152 return true;
2153}
2154
2155bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2156 MachineInstr &MI, unsigned RsrcIdx) {
2157 const unsigned NumDefs = MI.getNumExplicitDefs();
2158
2159 MachineBasicBlock *MBB = MI.getParent();
2160 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
2161
2162 // Defs are vgpr.
2163 for (unsigned i = 0; i < NumDefs; ++i) {
2164 Register Reg = MI.getOperand(i).getReg();
2165 if (MRI.getRegBank(Reg) == VgprRB)
2166 continue;
2167
2168 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
2169 MI.getOperand(i).setReg(NewVgprDst);
2170 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2171 }
2172
2173 B.setInstrAndDebugLoc(MI);
2174
2175 // Register uses before RsrcIdx are vgpr.
2176 for (unsigned i = NumDefs; i < RsrcIdx; ++i) {
2177 MachineOperand &Op = MI.getOperand(i);
2178 if (!Op.isReg())
2179 continue;
2180
2181 Register Reg = Op.getReg();
2182 if (!Reg.isVirtual())
2183 continue;
2184
2185 if (MRI.getRegBank(Reg) == VgprRB)
2186 continue;
2187
2188 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
2189 Op.setReg(Copy.getReg(0));
2190 }
2191
2192 SmallSet<Register, 4> OpsToWaterfall;
2193
2194 // Register use RsrcIdx (and later register operands) is sgpr.
2195 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
2196 MachineOperand &Op = MI.getOperand(i);
2197 if (!Op.isReg())
2198 continue;
2199
2200 Register Reg = Op.getReg();
2201 if (MRI.getRegBank(Reg) != SgprRB)
2202 OpsToWaterfall.insert(Reg);
2203 }
2204
2205 if (!OpsToWaterfall.empty()) {
2206 MachineBasicBlock::iterator MII = MI.getIterator();
2207 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
2208 }
2209
2210 return true;
2211}
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
@ ICMP_NE
not equal
Definition InstrTypes.h:698
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
void push_back(const T &Elt)
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:155
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
Definition Utils.cpp:257
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:432
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs