LLVM 23.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
29
30using namespace llvm;
31using namespace AMDGPU;
32
// Constructor: caches per-function state (MF, subtarget, MRI, uniformity
// info, register-bank info, the legalization rule table) and the three
// AMDGPU register banks (SGPR, VGPR, VCC) used throughout the lowering.
// NOTE(review): the source extraction dropped the signature line(s) that
// precede this initializer list (doxygen lines 33-34, presumably
// "RegBankLegalizeHelper::RegBankLegalizeHelper(MachineIRBuilder &B,
//  const MachineUniformityInfo &MUI," ...) — restore from upstream.
 35 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
 36 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
 37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
 38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
 39 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
 40 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
 41 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
42
// Top-level driver for one instruction: look up the rule set for MI's
// opcode, find the mapping whose predicate matches MI, apply the
// register-bank/LLT mapping to MI's defs and uses, and finally run the
// lowering for that mapping. Returns false (after reporting a GlobalISel
// failure) when no rule set or no matching mapping exists.
// NOTE(review): the extraction dropped the function signature (doxygen line
// 43, presumably "bool RegBankLegalizeHelper::findRuleAndApplyMapping(
// MachineInstr &MI) {") — restore from upstream.
 44 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
 45 if (!RuleSet) {
 46 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
 47 "No AMDGPU RegBankLegalize rules defined for opcode",
 48 MI);
 49 return false;
 50 }
 51
 52 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
 53 if (!Mapping) {
 54 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
 55 "AMDGPU RegBankLegalize: none of the rules defined with "
 56 "'Any' for MI's opcode matched MI",
 57 MI);
 58 return false;
 59 }
 60
 // Operands that will need a waterfall loop are collected into WFI while
 // the source mapping is applied.
 61 WaterfallInfo WFI;
 62 unsigned OpIdx = 0;
 // Defs are rewritten with the insert point just AFTER MI (new copies go
 // after the instruction); uses with the insert point AT MI (before it).
 63 if (!Mapping->DstOpMapping.empty()) {
 64 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
 65 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
 66 return false;
 67 }
 68 if (!Mapping->SrcOpMapping.empty()) {
 69 B.setInstr(MI);
 70 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
 71 return false;
 72 }
 73
 74 if (!lower(MI, *Mapping, WFI))
 75 return false;
 76
 77 return true;
 78}
79
// Wrap the instruction range recorded in WFI in a waterfall loop so an
// operation that requires uniform (sgpr) operands can consume divergent
// (vgpr) values: each loop iteration picks the first active lane's value
// with readfirstlane, masks exec down to the lanes that share that value,
// executes the body for them, and repeats until every lane is covered.
// CFG surgery and exec-mask handling are kept in sync with the ASCII
// diagram below. Not re-styled on purpose: the statement order (block
// splicing, insert-point moves, terminator building) is load-bearing.
80bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
 81 const WaterfallInfo &WFI) {
 82 assert(WFI.Start.isValid() && WFI.End.isValid() &&
 83 "Waterfall range not initialized");
 84
 85 // Track use registers which have already been expanded with a readfirstlane
 86 // sequence. This may have multiple uses if moving a sequence.
 87 DenseMap<Register, Register> WaterfalledRegMap;
 88
 89 MachineBasicBlock &MBB = B.getMBB();
 90 MachineFunction &MF = B.getMF();
 91
 // NOTE(review): the extraction dropped doxygen lines 92-93 and 95 here —
 // presumably the definitions of the BeginIt/EndIt iterators (taken from
 // WFI.Start/WFI.End, both used below) and of TRI (used on the next line).
 // Restore from upstream before compiling.
 94
 96 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
 97 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
 98 if (IsWave32) {
 99 MovExecOpc = AMDGPU::S_MOV_B32;
 100 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
 101 XorTermOpc = AMDGPU::S_XOR_B32_term;
 102 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
 103 ExecReg = AMDGPU::EXEC_LO;
 104 } else {
 105 MovExecOpc = AMDGPU::S_MOV_B64;
 106 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
 107 XorTermOpc = AMDGPU::S_XOR_B64_term;
 108 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
 109 ExecReg = AMDGPU::EXEC;
 110 }
 111
 112#ifndef NDEBUG
 113 const int OrigRangeSize = std::distance(BeginIt, EndIt);
 114#endif
 115
 116 MachineRegisterInfo &MRI = *B.getMRI();
 117 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
 118 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
 119
 120 // Don't bother using generic instructions/registers for the exec mask.
 121 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
 122
 123 Register SavedExec = MRI.createVirtualRegister(WaveRC);
 124
 125 // To insert the loop we need to split the block. Move everything before
 126 // this point to a new block, and insert a new empty block before this
 127 // instruction.
 // NOTE(review): doxygen lines 128-129 were dropped by the extraction —
 // presumably the CreateMachineBasicBlock() calls defining LoopBB and
 // BodyBB (both used below). Restore from upstream.
 130 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
 131 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
 // NOTE(review): doxygen line 132 dropped — presumably the definition of
 // the MBBI function iterator (positioned at MBB) incremented next.
 133 ++MBBI;
 134 MF.insert(MBBI, LoopBB);
 135 MF.insert(MBBI, BodyBB);
 136 MF.insert(MBBI, RestoreExecBB);
 137 MF.insert(MBBI, RemainderBB);
 138
 139 LoopBB->addSuccessor(BodyBB);
 140 BodyBB->addSuccessor(RestoreExecBB);
 141 BodyBB->addSuccessor(LoopBB);
 142
 143 // Move the rest of the block into a new block.
 // NOTE(review): doxygen line 144 dropped — likely
 // "RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);" — confirm against
 // upstream.
 145 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
 146
 147 MBB.addSuccessor(LoopBB);
 148 RestoreExecBB->addSuccessor(RemainderBB);
 149
 150 B.setInsertPt(*LoopBB, LoopBB->end());
 151
 152 // +-MBB:------------+
 153 // | ... |
 154 // | %0 = G_INST_1 |
 155 // | %Dst = MI %Vgpr |
 156 // | %1 = G_INST_2 |
 157 // | ... |
 158 // +-----------------+
 159 // ->
 160 // +-MBB-------------------------------+
 161 // | ... |
 162 // | %0 = G_INST_1 |
 163 // | %SaveExecReg = S_MOV_B32 $exec_lo |
 164 // +----------------|------------------+
 165 // | /------------------------------|
 166 // V V |
 167 // +-LoopBB---------------------------------------------------------------+ |
 168 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
 169 // | instead of executing for each lane, see if other lanes had | |
 170 // | same value for %Vgpr and execute for them also. | |
 171 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
 172 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
 173 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
 174 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
 175 // +----------------|-----------------------------------------------------+ |
 176 // V |
 177 // +-BodyBB------------------------------------------------------------+ |
 178 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
 179 // | executed only for active lanes and written to Dst | |
 180 // | $exec = S_XOR_B32 $exec, %SavedExec | |
 181 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
 182 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
 183 // | SI_WATERFALL_LOOP LoopBB |-----|
 184 // +----------------|--------------------------------------------------+
 185 // V
 186 // +-RestoreExecBB--------------------------+
 187 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
 188 // +----------------|-----------------------+
 189 // V
 190 // +-RemainderBB:----------------------+
 191 // | %1 = G_INST_2 |
 192 // | ... |
 193 // +---------------------------------- +
 194
 195 // Move the instruction into the loop body. Note we moved everything after
 196 // Range.end() already into a new block, so Range.end() is no longer valid.
 197 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
 198
 199 // Figure out the iterator range after splicing the instructions.
 200 MachineBasicBlock::iterator NewBegin = BeginIt;
 201 auto NewEnd = BodyBB->end();
 202 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
 203
 204 B.setMBB(*LoopBB);
 205 Register CondReg;
 206
 // Rewrite every waterfalled vgpr use in the moved range to the
 // readfirstlane'd sgpr value, accumulating the per-part equality
 // comparisons into CondReg.
 207 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
 208 for (MachineOperand &Op : MI.all_uses()) {
 209 Register OldReg = Op.getReg();
 210 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
 211 continue;
 212
 213 // See if we already processed this register in another instruction in
 214 // the sequence.
 215 auto OldVal = WaterfalledRegMap.find(OldReg);
 216 if (OldVal != WaterfalledRegMap.end()) {
 217 Op.setReg(OldVal->second);
 218 continue;
 219 }
 220
 221 Register OpReg = Op.getReg();
 222 LLT OpTy = MRI.getType(OpReg);
 223
 224 // TODO: support for agpr
 225 assert(MRI.getRegBank(OpReg) == VgprRB);
 226 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
 227 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
 228
 229 // Build the comparison(s), CurrentLaneReg == OpReg.
 230 unsigned OpSize = OpTy.getSizeInBits();
 231 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
 232 LLT PartTy = LLT::scalar(PartSize);
 233 unsigned NumParts = OpSize / PartSize;
 // NOTE(review): doxygen line 234 dropped — presumably
 // "SmallVector<Register, 8> OpParts;" (used below). Restore from upstream.
 235 SmallVector<Register, 8> CurrentLaneParts;
 236
 237 if (NumParts == 1) {
 238 OpParts.push_back(OpReg);
 239 CurrentLaneParts.push_back(CurrentLaneReg);
 240 } else {
 241 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
 242 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
 243 for (unsigned i = 0; i < NumParts; ++i) {
 244 OpParts.push_back(UnmergeOp.getReg(i));
 245 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
 246 }
 247 }
 248
 249 for (unsigned i = 0; i < NumParts; ++i) {
 250 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
 251 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
 252
 253 if (!CondReg)
 254 CondReg = CmpReg;
 255 else
 256 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
 257 }
 258
 259 Op.setReg(CurrentLaneReg);
 260
 261 // Make sure we don't re-process this register again.
 262 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
 263 }
 264 }
 265
 266 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
 267 Register CondRegLM =
 268 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
 269 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
 270
 271 // Update EXEC, save the original EXEC value to SavedExec.
 272 B.buildInstr(AndSaveExecOpc)
 273 .addDef(SavedExec)
 274 .addReg(CondRegLM, RegState::Kill);
 275 MRI.setSimpleHint(SavedExec, CondRegLM);
 276
 277 B.setInsertPt(*BodyBB, BodyBB->end());
 278
 279 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
 280 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
 281
 282 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
 283 // s_cbranch_scc0?
 284
 285 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
 286 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
 287
 288 // Save the EXEC mask before the loop.
 289 B.setInsertPt(MBB, MBB.end());
 290 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
 291
 292 // Restore the EXEC mask after the loop.
 293 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
 294 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
 295
 296 // Set the insert point after the original instruction, so any new
 297 // instructions will be in the remainder.
 298 B.setInsertPt(*RemainderBB, RemainderBB->begin());
 299
 300 return true;
 301}
302
303bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
304 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
305 MachineFunction &MF = B.getMF();
306 assert(MI.getNumMemOperands() == 1);
307 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
308 Register Dst = MI.getOperand(0).getReg();
309 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
310 Register Base = MI.getOperand(1).getReg();
311 LLT PtrTy = MRI.getType(Base);
312 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
313 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
314 SmallVector<Register, 4> LoadPartRegs;
315
316 unsigned ByteOffset = 0;
317 for (LLT PartTy : LLTBreakdown) {
318 Register BasePlusOffset;
319 if (ByteOffset == 0) {
320 BasePlusOffset = Base;
321 } else {
322 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
323 BasePlusOffset =
324 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
325 }
326 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
327 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
328 LoadPartRegs.push_back(LoadPart.getReg(0));
329 ByteOffset += PartTy.getSizeInBytes();
330 }
331
332 if (!MergeTy.isValid()) {
333 // Loads are of same size, concat or merge them together.
334 B.buildMergeLikeInstr(Dst, LoadPartRegs);
335 } else {
336 // Loads are not all of same size, need to unmerge them to smaller pieces
337 // of MergeTy type, then merge pieces to Dst.
338 SmallVector<Register, 4> MergeTyParts;
339 for (Register Reg : LoadPartRegs) {
340 if (MRI.getType(Reg) == MergeTy) {
341 MergeTyParts.push_back(Reg);
342 } else {
343 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
344 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
345 MergeTyParts.push_back(Unmerge.getReg(i));
346 }
347 }
348 B.buildMergeLikeInstr(Dst, MergeTyParts);
349 }
350 MI.eraseFromParent();
351 return true;
352}
353
354bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
355 LLT MergeTy) {
356 MachineFunction &MF = B.getMF();
357 assert(MI.getNumMemOperands() == 1);
358 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
359 Register Dst = MI.getOperand(0).getReg();
360 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
361 Register Base = MI.getOperand(1).getReg();
362
363 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
364 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
365
366 if (WideTy.isScalar()) {
367 B.buildTrunc(Dst, WideLoad);
368 } else {
369 SmallVector<Register, 4> MergeTyParts;
370 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
371
372 LLT DstTy = MRI.getType(Dst);
373 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
374 for (unsigned i = 0; i < NumElts; ++i) {
375 MergeTyParts.push_back(Unmerge.getReg(i));
376 }
377 B.buildMergeLikeInstr(Dst, MergeTyParts);
378 }
379 MI.eraseFromParent();
380 return true;
381}
382
383bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
384 Register Dst = MI.getDstReg();
385 Register Ptr = MI.getPointerReg();
386 MachineMemOperand &MMO = MI.getMMO();
387 unsigned MemSize = 8 * MMO.getSize().getValue();
388
389 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
390
391 if (MI.getOpcode() == G_LOAD) {
392 B.buildLoad(Dst, Ptr, *WideMMO);
393 } else {
394 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
395
396 if (MI.getOpcode() == G_ZEXTLOAD) {
397 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
398 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
399 B.buildAnd(Dst, Load, MaskCst);
400 } else {
401 assert(MI.getOpcode() == G_SEXTLOAD);
402 B.buildSExtInReg(Dst, Load, MemSize);
403 }
404 }
405
406 MI.eraseFromParent();
407 return true;
408}
409
// Lower an extension (G_SEXT/G_ZEXT/G_ANYEXT) of a vcc (s1) value into a
// select between constants: sext picks -1/0, zext/anyext pick 1/0. For S64
// destinations the select is done on the low 32 bits and the high half is
// derived per-opcode, then the halves are merged.
410bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
 411 Register Dst = MI.getOperand(0).getReg();
 412 LLT Ty = MRI.getType(Dst);
 413 Register Src = MI.getOperand(1).getReg();
 414 unsigned Opc = MI.getOpcode();
 // Value selected for a "true" lane: -1 for sign-extend, 1 otherwise.
 415 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
 416 if (Ty == S32 || Ty == S16) {
 417 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
 418 auto False = B.buildConstant({VgprRB, Ty}, 0);
 419 B.buildSelect(Dst, Src, True, False);
 420 } else if (Ty == S64) {
 421 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
 422 auto False = B.buildConstant({VgprRB_S32}, 0);
 423 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
 424 MachineInstrBuilder Hi;
 425 switch (Opc) {
 426 case G_SEXT:
 427 Hi = Lo;
 428 break;
 429 case G_ZEXT:
 430 Hi = False;
 431 break;
 432 case G_ANYEXT:
 433 Hi = B.buildUndef({VgprRB_S32});
 434 break;
 435 default:
 // NOTE(review): doxygen line 436 was dropped by the extraction — likely
 // the "reportGISelFailure(" call head for the arguments below. Restore
 // from upstream.
 437 MF, MORE, "amdgpu-regbanklegalize",
 438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
 439 return false;
 440 }
 441
 442 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
 443 } else {
 // NOTE(review): doxygen line 444 dropped — likely "reportGISelFailure("
 // as above. Restore from upstream.
 445 MF, MORE, "amdgpu-regbanklegalize",
 446 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
 447 return false;
 448 }
 449
 450 MI.eraseFromParent();
 451 return true;
 452}
453
454std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
455 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
456 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
457 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
458 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
459 return {Lo.getReg(0), Hi.getReg(0)};
460}
461
462std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
463 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
464 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
465 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
466 return {Lo.getReg(0), Hi.getReg(0)};
467}
468
469std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
470 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
471 auto Lo = PackedS32;
472 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
473 return {Lo.getReg(0), Hi.getReg(0)};
474}
475
476std::pair<Register, Register>
477RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
478 auto [Lo32, Hi32] = unpackAExt(Reg);
479 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
480 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
481}
482
// Lower a uniform V2S16 shift by unpacking both operands to two S32 halves
// (any-extended for shl, zero-extended for lshr, sign-extended for ashr so
// the 32-bit shift reproduces the 16-bit semantics), shifting each half,
// and repacking with build_vector_trunc.
483bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
 484 Register Lo, Hi;
 485 switch (MI.getOpcode()) {
 486 case AMDGPU::G_SHL: {
 487 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
 488 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
 489 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
 490 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
 491 break;
 492 }
 493 case AMDGPU::G_LSHR: {
 494 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
 495 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
 496 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
 497 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
 498 break;
 499 }
 500 case AMDGPU::G_ASHR: {
 501 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
 502 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
 503 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
 504 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
 505 break;
 506 }
 507 default:
 // NOTE(review): doxygen line 508 was dropped by the extraction — likely
 // the "reportGISelFailure(" call head for the arguments below. Restore
 // from upstream.
 509 MF, MORE, "amdgpu-regbanklegalize",
 510 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
 511 MI);
 512 return false;
 513 }
 514 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
 515 MI.eraseFromParent();
 516 return true;
 517}
518
// Lower a uniform V2S16 min/max by unpacking both operands to two S32
// halves (sign-extended for smin/smax, zero-extended for umin/umax so the
// 32-bit compare matches the 16-bit semantics), applying the operation per
// half, and repacking with build_vector_trunc.
519bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
 520 Register Lo, Hi;
 521 switch (MI.getOpcode()) {
 522 case AMDGPU::G_SMIN:
 523 case AMDGPU::G_SMAX: {
 524 // For signed operations, use sign extension
 525 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
 526 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
 527 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
 528 .getReg(0);
 529 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
 530 .getReg(0);
 531 break;
 532 }
 533 case AMDGPU::G_UMIN:
 534 case AMDGPU::G_UMAX: {
 535 // For unsigned operations, use zero extension
 536 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
 537 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
 538 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
 539 .getReg(0);
 540 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
 541 .getReg(0);
 542 break;
 543 }
 544 default:
 // NOTE(review): doxygen line 545 was dropped by the extraction — likely
 // the "reportGISelFailure(" call head for the arguments below. Restore
 // from upstream.
 546 MF, MORE, "amdgpu-regbanklegalize",
 547 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
 548 return false;
 549 }
 550 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
 551 MI.eraseFromParent();
 552 return true;
 553}
554
555bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
556 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
557 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
558 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
559 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
560 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
561 {ResLo.getReg(0), ResHi.getReg(0)});
562 MI.eraseFromParent();
563 return true;
564}
565
// True when MI is a signed bitfield extract: the amdgcn.sbfe intrinsic or
// the generic G_SBFX opcode.
// NOTE(review): the extraction dropped the function header (doxygen lines
// 566-567) — presumably "...isSignedBFE(MachineInstr &MI) {" and a
// dyn_cast<GIntrinsic> guard binding GI for the first return. Restore from
// upstream.
568 return (GI->is(Intrinsic::amdgcn_sbfe));
 569
 570 return MI.getOpcode() == AMDGPU::G_SBFX;
 571}
572
573bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
574 Register Dst = MI.getOperand(0).getReg();
575 assert(MRI.getType(Dst) == LLT::scalar(64));
576 bool Signed = isSignedBFE(MI);
577 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
578 // Extract bitfield from Src, LSBit is the least-significant bit for the
579 // extraction (field offset) and Width is size of bitfield.
580 Register Src = MI.getOperand(FirstOpnd).getReg();
581 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
582 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
583 // Comments are for signed bitfield extract, similar for unsigned. x is sign
584 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
585
586 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
587 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
588 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
589
590 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
591
592 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
593 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
594 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
595 if (!ConstWidth) {
596 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
597 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
598 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
599 MI.eraseFromParent();
600 return true;
601 }
602
603 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
604 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
605 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
606 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
607 auto Zero = B.buildConstant({VgprRB, S32}, 0);
608 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
609
610 if (WidthImm <= 32) {
611 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
612 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
613 MachineInstrBuilder Hi;
614 if (Signed) {
615 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
616 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
617 } else {
618 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
619 Hi = Zero;
620 }
621 B.buildMergeLikeInstr(Dst, {Lo, Hi});
622 } else {
623 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
624 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
625 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
626 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
627 }
628
629 MI.eraseFromParent();
630 return true;
631}
632
633bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
634 Register DstReg = MI.getOperand(0).getReg();
635 LLT Ty = MRI.getType(DstReg);
636 bool Signed = isSignedBFE(MI);
637 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
638 Register Src = MI.getOperand(FirstOpnd).getReg();
639 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
640 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
641 // For uniform bit field extract there are 4 available instructions, but
642 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
643 // field offset in low and size in high 16 bits.
644
645 // Src1 Hi16|Lo16 = Size|FieldOffset
646 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
647 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
648 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
649 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
650 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
651 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
652 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
653
654 // Select machine instruction, because of reg class constraining, insert
655 // copies from reg class to reg bank.
656 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
657 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
658 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
659 *ST.getRegisterInfo(), RBI);
660
661 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
662 MI.eraseFromParent();
663 return true;
664}
665
666bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
667 Register Dst = MI.getOperand(0).getReg();
668 LLT DstTy = MRI.getType(Dst);
669 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
670 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
671 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
672 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
673 unsigned Opc = MI.getOpcode();
674 auto Flags = MI.getFlags();
675 auto Lo =
676 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
677 auto Hi =
678 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
679 B.buildMergeLikeInstr(Dst, {Lo, Hi});
680 MI.eraseFromParent();
681 return true;
682}
683
684bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
685 Register Dst = MI.getOperand(0).getReg();
686 assert(MRI.getType(Dst) == S64);
687 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
688 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
689
690 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
691 // match GlobalISel with old regbankselect.
692 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
693 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
694 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
695 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
696 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
697 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
698
699 B.buildMergeLikeInstr(Dst, {Lo, Hi});
700 MI.eraseFromParent();
701 return true;
702}
703
704bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
705 Register Dst = MI.getOperand(0).getReg();
706 assert(MRI.getType(Dst) == V2S16);
707 unsigned Opc = MI.getOpcode();
708 unsigned NumOps = MI.getNumOperands();
709 auto Flags = MI.getFlags();
710
711 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
712
713 if (NumOps == 2) {
714 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
715 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
716 B.buildMergeLikeInstr(Dst, {Lo, Hi});
717 MI.eraseFromParent();
718 return true;
719 }
720
721 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
722
723 if (NumOps == 3) {
724 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
725 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
726 B.buildMergeLikeInstr(Dst, {Lo, Hi});
727 MI.eraseFromParent();
728 return true;
729 }
730
731 assert(NumOps == 4);
732 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
733 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
734 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
735 B.buildMergeLikeInstr(Dst, {Lo, Hi});
736 MI.eraseFromParent();
737 return true;
738}
739
740bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
741 Register Dst0 = MI.getOperand(0).getReg();
742 Register Dst1 = MI.getOperand(1).getReg();
743 Register Src0 = MI.getOperand(2).getReg();
744 Register Src1 = MI.getOperand(3).getReg();
745 Register Src2 = MI.getOperand(4).getReg();
746
747 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748
749 // Keep the multiplication on the SALU.
750 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
751 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
752 if (ST.hasScalarMulHiInsts()) {
753 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
754 } else {
755 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
756 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
757 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
758 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
759 }
760
761 // Accumulate and produce the "carry-out" bit.
762
763 // The "carry-out" is defined as bit 64 of the result when computed as a
764 // big integer. For unsigned multiply-add, this matches the usual
765 // definition of carry-out.
766 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
767 // No accumulate: result is just the multiplication, carry is 0.
768 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
769 B.buildConstant(Dst1, 0);
770 } else {
771 // Accumulate: add Src2 to the multiplication result with carry chain.
772 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
773 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
774 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
775
776 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
777 auto AddHi =
778 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
779 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
780 B.buildCopy(Dst1, AddHi.getReg(1));
781 }
782
783 MI.eraseFromParent();
784 return true;
785}
786
787bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
788 Register Dst = MI.getOperand(0).getReg();
789 LLT DstTy = MRI.getType(Dst);
790 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
791 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
792 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
793 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
794 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
795 Register Cond = MI.getOperand(1).getReg();
796 auto Flags = MI.getFlags();
797 auto Lo =
798 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
799 auto Hi =
800 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
801
802 B.buildMergeLikeInstr(Dst, {Lo, Hi});
803 MI.eraseFromParent();
804 return true;
805}
806
807bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
808 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
809 int Amt = MI.getOperand(2).getImm();
810 Register Lo, Hi;
811 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
812 if (Amt <= 32) {
813 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
814 if (Amt == 32) {
815 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
816 Lo = Freeze.getReg(0);
817 } else {
818 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
819 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
820 }
821
822 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
823 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
824 } else {
825 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
826 Lo = Op1.getReg(0);
827 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
828 }
829
830 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
831 MI.eraseFromParent();
832 return true;
833}
834
835bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
836 // Split 64-bit find-first-bit operations into 32-bit halves:
837 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
838 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
839 // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
840 // (cttz_zero_undef hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
841 unsigned Opc = MI.getOpcode();
842
843 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
844 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_undef), so plain add
845 // is fine.
846 unsigned FFBOpc;
847 unsigned AddOpc;
848 bool SearchFromMSB;
849 switch (Opc) {
850 case AMDGPU::G_AMDGPU_FFBH_U32:
851 FFBOpc = Opc;
852 AddOpc = AMDGPU::G_UADDSAT;
853 SearchFromMSB = true;
854 break;
855 case AMDGPU::G_AMDGPU_FFBL_B32:
856 FFBOpc = Opc;
857 AddOpc = AMDGPU::G_UADDSAT;
858 SearchFromMSB = false;
859 break;
860 case AMDGPU::G_CTLZ_ZERO_UNDEF:
861 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
862 AddOpc = AMDGPU::G_ADD;
863 SearchFromMSB = true;
864 break;
865 case AMDGPU::G_CTTZ_ZERO_UNDEF:
866 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
867 AddOpc = AMDGPU::G_ADD;
868 SearchFromMSB = false;
869 break;
870 default:
871 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
872 }
873
874 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
875 Register Lo = Unmerge.getReg(0);
876 Register Hi = Unmerge.getReg(1);
877
878 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
879 // lo first. The secondary half adds 32 to account for the primary half's
880 // width.
881 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
882 auto Secondary =
883 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
884
885 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
886 {Secondary, B.buildConstant(VgprRB_S32, 32)});
887 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
888
889 MI.eraseFromParent();
890 return true;
891}
892
893bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
894 // Lower extract vector element to a compare-select chain:
895 // result = elt[0]
896 // for i in 1..N-1:
897 // result = (idx == i) ? elt[i] : result
898 //
899 // When the index is divergent, each lane may want a different element, so
900 // we must check every element per lane.
901 Register Dst = MI.getOperand(0).getReg();
902 Register Src = MI.getOperand(1).getReg();
903 Register Idx = MI.getOperand(2).getReg();
904
905 LLT VecTy = MRI.getType(Src);
906 LLT ScalarTy = VecTy.getScalarType();
907 unsigned NumElts = VecTy.getNumElements();
908 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
909
910 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
911
912 if (ScalarTy.getSizeInBits() == 32) {
913 Register PrevSelect = Unmerge.getReg(0);
914 for (unsigned I = 1; I < NumElts; ++I) {
915 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
916 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
917 PrevSelect =
918 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(I), PrevSelect)
919 .getReg(0);
920 }
921 B.buildCopy(Dst, PrevSelect);
922 } else if (ScalarTy.getSizeInBits() == 64) {
923 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
924 Register PrevLo = InitUnmerge.getReg(0);
925 Register PrevHi = InitUnmerge.getReg(1);
926 for (unsigned I = 1; I < NumElts; ++I) {
927 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
928 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
929 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(I));
930 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
931 .getReg(0);
932 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
933 .getReg(0);
934 }
935 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
936 } else {
938 MF, MORE, "amdgpu-regbanklegalize",
939 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
940 return false;
941 }
942
943 MI.eraseFromParent();
944 return true;
945}
946
947bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
948 // Reduce a 64-bit element extract to two 32-bit extracts:
949 // vec32 = bitcast <N x s64> to <2N x s32>
950 // lo = vec32[idx * 2]
951 // hi = vec32[idx * 2 + 1]
952 // result = merge(lo, hi)
953 //
954 // When the index is uniform, all lanes extract the same element, so we can
955 // just split the s64 extract into two s32 extracts which lower to MOVREL.
956 Register Dst = MI.getOperand(0).getReg();
957 Register Src = MI.getOperand(1).getReg();
958 Register Idx = MI.getOperand(2).getReg();
959
960 LLT SrcTy = MRI.getType(Src);
961 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
962
963 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
964 "expected VGPR src and SGPR idx");
965
966 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
967
968 // Calculate new Lo and Hi indices
969 auto One = B.buildConstant(SgprRB_S32, 1);
970 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
971 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
972
973 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
974 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
975
976 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
977
978 MI.eraseFromParent();
979 return true;
980}
981
982bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &MI) {
983 // Lower insert vector element to a compare-select chain:
984 // for i in 0..N-1:
985 // result[i] = (idx == i) ? elt : srcVec[i]
986 // dst = merge(result[0..N-1])
987 //
988 // VGPR B64 requires splitting to lo/hi s32 pairs since there is no
989 // v_cndmask_b64. SGPR B64/B32 and VGPR B32 can be handled natively.
990 Register Dst = MI.getOperand(0).getReg();
991 Register Src = MI.getOperand(1).getReg();
992 Register Elt = MI.getOperand(2).getReg();
993 Register Idx = MI.getOperand(3).getReg();
994
995 LLT VecTy = MRI.getType(Src);
996 LLT ScalarTy = VecTy.getScalarType();
997 unsigned NumElts = VecTy.getNumElements();
998 const RegisterBank *SrcRB = MRI.getRegBank(Src);
999 bool IsSGPR = (SrcRB == SgprRB);
1000 SmallVector<Register, 16> Selects;
1001
1002 if (!IsSGPR && ScalarTy.getSizeInBits() == 64) {
1003 // VGPR B64: split to 32-bit lo/hi since there is no v_cndmask_b64.
1004 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1005 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1006 Register EltLo = EltUnmerge.getReg(0);
1007 Register EltHi = EltUnmerge.getReg(1);
1008 for (unsigned I = 0; I < NumElts; ++I) {
1009 auto IdxConst = B.buildConstant(VgprRB_S32, I);
1010 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1011 Selects.push_back(
1012 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 * I))
1013 .getReg(0));
1014 Selects.push_back(
1015 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 * I + 1))
1016 .getReg(0));
1017 }
1018 LLT Vec32Ty = LLT::fixed_vector(2 * NumElts, 32);
1019 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1020 B.buildBitcast(Dst, Vec32);
1021 } else if (ScalarTy.getSizeInBits() == 32 || ScalarTy.getSizeInBits() == 64) {
1022 // B32 (any bank) and SGPR B64: element-wise select at native width.
1023 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1024 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1025 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1026 for (unsigned I = 0; I < NumElts; ++I) {
1027 auto IdxConst = B.buildConstant(SgprRB_S32, I);
1028 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CmpTy, Idx, IdxConst);
1029 Selects.push_back(
1030 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(I)).getReg(0));
1031 }
1032 B.buildMergeLikeInstr(Dst, Selects);
1033 } else {
1035 MF, MORE, "amdgpu-regbanklegalize",
1036 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type", MI);
1037 return false;
1038 }
1039
1040 MI.eraseFromParent();
1041 return true;
1042}
1043
1044bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &MI) {
1045 // Reduce a 64-bit element insert to two 32-bit inserts:
1046 // vec32 = bitcast <N x s64> to <2N x s32>
1047 // lo, hi = unmerge elt
1048 // vec32[idx * 2] = lo
1049 // vec32[idx * 2 + 1] = hi
1050 // dst = bitcast <2N x s32> to <N x s64>
1051 //
1052 // When the index is uniform, all lanes insert at the same position, so we
1053 // can split the s64 insert into two s32 inserts which lower to MOVREL/GPRIDX.
1054 Register Dst = MI.getOperand(0).getReg();
1055 Register Src = MI.getOperand(1).getReg();
1056 Register Elt = MI.getOperand(2).getReg();
1057 Register Idx = MI.getOperand(3).getReg();
1058
1059 LLT SrcTy = MRI.getType(Src);
1060 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
1061
1062 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1063 "expected VGPR src and SGPR idx");
1064
1065 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1066
1067 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1068 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1069
1070 // Calculate new Lo and Hi indices
1071 auto One = B.buildConstant(SgprRB_S32, 1);
1072 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1073 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1074
1075 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1076 EltUnmerge.getReg(0), IdxLo);
1077 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1078 EltUnmerge.getReg(1), IdxHi);
1079
1080 B.buildBitcast(Dst, InsHi);
1081
1082 MI.eraseFromParent();
1083 return true;
1084}
1085
1086bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &MI) {
1087 // Lower divergent G_ABS to smax(x, 0 - x) in the VGPR bank:
1088 // zero = 0
1089 // neg = G_SUB zero, x
1090 // dst = G_SMAX x, neg
1091 //
1092 // There is no integer v_abs instruction on AMDGPU, so divergent G_ABS is
1093 // expanded to this sub/smax pair.
1094 Register DstReg = MI.getOperand(0).getReg();
1095 Register SrcReg = MI.getOperand(1).getReg();
1096 LLT Ty = MRI.getType(DstReg);
1097
1098 Register Zero;
1099 if (Ty == V2S16) {
1100 // buildConstant cannot produce a V2S16 directly; pack two S16 zeros.
1101 Register Zero16 = B.buildConstant({VgprRB, S16}, 0).getReg(0);
1102 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).getReg(0);
1103 } else {
1104 assert((Ty == S32 || Ty == S16) && "unexpected type for AbsToNegMax");
1105 Zero = B.buildConstant({VgprRB, Ty}, 0).getReg(0);
1106 }
1107
1108 auto Neg = B.buildSub({VgprRB, Ty}, Zero, SrcReg);
1109 B.buildSMax(DstReg, SrcReg, Neg);
1110 MI.eraseFromParent();
1111 return true;
1112}
1113
1114bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &MI) {
1115 // Lower uniform V2S16 abs by unpacking the values to two separate SGPR
1116 // registers and re-emitting G_ABS on each:
1117 // packed = bitcast <2 x s16> src to s32
1118 // lo = sext_inreg packed, 16
1119 // hi = ashr packed, 16
1120 // dst = build_vector_trunc G_ABS(lo), G_ABS(hi)
1121 //
1122 // SALU only has s_abs_i32, with no direct uniform V2S16 abs. The
1123 // re-emitted G_ABS(SgprRB, S32) selects to s_abs_i32 on each value.
1124 auto Bitcast = B.buildBitcast({SgprRB_S32}, MI.getOperand(1).getReg());
1125 auto SextInReg = B.buildSExtInReg({SgprRB_S32}, Bitcast, 16);
1126 auto ShiftHi =
1127 B.buildAShr({SgprRB_S32}, Bitcast, B.buildConstant({SgprRB_S32}, 16));
1128
1129 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1130 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1131 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
1132 {AbsLo.getReg(0), AbsHi.getReg(0)});
1133
1134 MI.eraseFromParent();
1135 return true;
1136}
1137
// Dispatch on the mapping's LoweringMethod and perform the corresponding
// lowering for MI. Simple methods are handled inline; the more involved ones
// forward to dedicated lower* helpers. Returns false on a reported failure.
// After lowering, runs any pending waterfall loop collected in WFI.
bool RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  WaterfallInfo &WFI) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    // Uniform s1 {s,z}ext becomes a select between the extended true/false
    // constants (-1 for sext, 1 for zext).
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
    // We are making select here. S1 cond was already 'any-extended to S32' +
    // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return true;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  case ScalarizeToS16:
    return lowerSplitTo16(MI);
  case Ext32To64: {
    // Extend a 32-bit value to 64 bits by merging in the appropriate hi half.
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      // Zero hi half.
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      // Hi half is don't-care.
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
                         MI);
      return false;
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return true;
  }
  case UniCstExt: {
    // Re-emit the constant directly at the destination's wider type.
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return true;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take lowest bit from each lane and put it in lane mask.
    // Lowering via compare, but we need to clean high bits first as compare
    // compares all bits in register.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      // Keep bit 0 of the lo half; force the hi half to zero.
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return true;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case UniMAD64:
    return lowerUniMAD64(MI);
  case UniMul64: {
    // Uniform 64-bit multiply is legal as a plain G_MUL.
    B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
    MI.eraseFromParent();
    return true;
  }
  case DivSMulToMAD: {
    // Divergent S_MUL_U64_U32/I64_I32 becomes a MAD with a zero addend, with
    // the 64-bit inputs truncated to their low 32 bits.
    auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
    auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
    auto Zero = B.buildConstant({VgprRB, S64}, 0);

    unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;

    B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
                 {Op1, Op2, Zero});
    MI.eraseFromParent();
    return true;
  }
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Mul:
    return lowerSplitTo32Mul(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                           "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
                           MI);
        return false;
      }
    }
    // 64 and 32 bit load
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
                         MI);
      return false;
    }
    return true;
  }
  case WidenLoad: {
    // Widen 96-bit loads to 128 bits (only the listed types are expected).
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
                         MI);
      return false;
    }
    return true;
  }
  case UnpackAExt:
    return lowerUnpackAExt(MI);
  case WidenMMOToS32:
    return widenMMOToS32(cast<GAnyLoad>(MI));
  case VerifyAllSgpr: {
    // No rewrite; just assert every operand already lives in the SGPR bank.
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
  case ApplyAllVgpr: {
    // Defs must already be VGPR; copy any non-VGPR use into a VGPR.
    assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
    }));
    B.setInstrAndDebugLoc(MI);
    for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
      MachineOperand &Op = MI.getOperand(i);
      if (!Op.isReg())
        continue;
      Register Reg = Op.getReg();
      if (MRI.getRegBank(Reg) != VgprRB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        Op.setReg(Copy.getReg(0));
      }
    }
    return true;
  }
  case UnmergeToShiftTrunc: {
    GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
    LLT Ty = MRI.getType(Unmerge->getSourceReg());
    if (Ty.getSizeInBits() % 32 != 0) {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: unmerge not multiple of 32",
                         MI);
      return false;
    }

    B.setInstrAndDebugLoc(MI);
    if (Ty.getSizeInBits() > 32) {
      // First unmerge to V2S16 pieces, then unpack each pair via shift+trunc.
      auto UnmergeV2S16 =
          B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
      for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
        auto [Dst0S32, Dst1S32] =
            unpackAExt(UnmergeV2S16->getOperand(i).getReg());
        B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
        B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
      }
    } else {
      auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
      B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
      B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
    }

    MI.eraseFromParent();
    return true;
  }
    // Widens a uniform narrow PHI to s32: the PHI def is replaced with a new
    // s32 register truncated back after the PHIs, and each incoming value is
    // any-extended to s32 right after its def.
    Register Dst = MI.getOperand(0).getReg();
    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }
    break;
  }
  case VerifyAllSgprGPHI: {
    // No rewrite; assert all register operands of the G_PHI are SGPR.
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      if (Op.isMBB())
        return true;
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
    // No rewrite; assert the def is VGPR and every incoming value is VGPR or
    // SGPR.
    assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      if (Op.isMBB())
        return true;
      const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
      return RB == VgprRB || RB == SgprRB;
    }));
    return true;
  }
  case ApplyINTRIN_IMAGE: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin =
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // The reported argument index is relative to the IR intrinsic call
    // arguments, so shift by the number of defs and the intrinsic ID.
    unsigned RsrcIdx = RSrcIntrin->RsrcArg + MI.getNumExplicitDefs() + 1;
    return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
  }
    // Rsrc is the last register operand. Base BVH trails an A16 immediate
    // after rsrc; dual/BVH8 do not. Scan backwards for the last virtual
    // register.
    unsigned RsrcIdx = MI.getNumOperands();
    while (RsrcIdx-- > MI.getNumExplicitDefs()) {
      const MachineOperand &Op = MI.getOperand(RsrcIdx);
      if (Op.isReg() && Op.getReg().isVirtual())
        break;
    }
    return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
  }
    return lowerSplitBitCount64To32(MI);
  case ExtrVecEltToSel:
    return lowerExtrVecEltToSel(MI);
  case ExtrVecEltTo32:
    return lowerExtrVecEltTo32(MI);
  case InsVecEltToSel:
    return lowerInsVecEltToSel(MI);
  case InsVecEltTo32:
    return lowerInsVecEltTo32(MI);
  case AbsToNegMax:
    return lowerAbsToNegMax(MI);
  case AbsToS32:
    return lowerAbsToS32(MI);
  }

  // DoNotLower and the PHI-widening case fall through here; run any pending
  // waterfall loop collected for this instruction.
  if (!WFI.SgprWaterfallOperandRegs.empty()) {
    if (!executeInWaterfallLoop(B, WFI))
      return false;
  }
  return true;
}
1451
// Map a RegBankLLTMappingApplyID to the LLT it implies. IDs that do not
// correspond to a single fixed type (the B-type and BRC IDs, handled by
// getBTyFromID) return the invalid LLT().
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Sgpr32ToVgprDst:
  case Vgpr32:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
  case UniInVgprS64:
  case Sgpr64ToVgprDst:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  // Pointer IDs: address space / pointer width pairs.
  case SgprP0:
  case SgprP0Call_WF:
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP2:
  case VgprP2:
    return LLT::pointer(2, 32);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case SgprP4Call_WF:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprP8:
    return LLT::pointer(8, 128);
  // Fixed-vector IDs.
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
  case UniInVgprV2S32:
    return LLT::fixed_vector(2, 32);
  case VgprV3S32:
    return LLT::fixed_vector(3, 32);
  case VgprV4S16:
    return LLT::fixed_vector(4, 16);
  case SgprV4S32:
  case SgprV4S32_WF:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  case VgprV8S32:
    return LLT::fixed_vector(8, 32);
  case VgprV2S64:
  case UniInVgprV2S64:
    return LLT::fixed_vector(2, 64);
  case VgprV6S32:
    return LLT::fixed_vector(6, 32);
  case VgprV32S16:
    return LLT::fixed_vector(32, 16);
  case VgprV32S32:
    return LLT::fixed_vector(32, 32);
  default:
    return LLT();
  }
}
1538
// Validate that the given type Ty is allowed for a "B-type" mapping ID
// (size-based IDs that accept several equivalent layouts, e.g. B64 accepts
// s64, <2 x s32>, <4 x s16> or a 64-bit pointer). Returns Ty itself when it
// matches the ID's size class, or the invalid LLT() otherwise.
LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case SgprB32_M0:
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
        isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case VgprB160:
  case UniInVgprB160:
    // B160 accepts any 160-bit type rather than an explicit layout list.
    if (Ty.getSizeInBits() == 160)
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  case SgprBRC: {
    // BRC: any size for which a matching register class exists.
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    unsigned LLTSize = Ty.getSizeInBits();
    if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
      return Ty;
    return LLT();
  }
  case VgprBRC: {
    // NOTE(review): this queries getSGPRClassForBitWidth even though the ID
    // is for the VGPR bank (and, unlike SgprBRC, has no >= 32 guard) —
    // possibly intended as getVGPRClassForBitWidth; confirm upstream.
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
      return Ty;
    return LLT();
  }
  default:
    return LLT();
  }
}
1620
// Map a RegBankLLTMappingApplyID to the register bank it assigns. Note that
// Sgpr32ToVgprDst/Sgpr64ToVgprDst report VGPR: the operand itself ends up in
// a VGPR even though the value is produced in an SGPR first. Returns nullptr
// for IDs with no bank (e.g. None, Imm, IntrId).
const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP0:
  case SgprP0Call_WF:
  case SgprP1:
  case SgprP2:
  case SgprP3:
  case SgprP4:
  case SgprP4Call_WF:
  case SgprP5:
  case SgprP8:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case SgprBRC:
  // UniInV* IDs: the uniform result is read back into an SGPR.
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprS64:
  case UniInVgprV2S16:
  case UniInVgprV2S32:
  case UniInVgprV4S32:
  case UniInVgprV2S64:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB160:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP2:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV2S64:
  case VgprV3S32:
  case VgprV4S16:
  case VgprV4S32:
  case VgprV6S32:
  case VgprV8S32:
  case VgprV32S16:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB160:
  case VgprB256:
  case VgprB512:
  case VgprBRC:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
  case Sgpr32ToVgprDst:
  case Sgpr64ToVgprDst:
    return VgprRB;
  default:
    return nullptr;
  }
}
1716
// Apply the chosen mapping to MI's def operands. For plain bank/type IDs this
// only asserts the operand already has the expected type and bank; the
// "UniIn*", trunc and cross-bank IDs rewrite the def to a new register and
// emit the glue (readlane copy, trunc, sgpr->vgpr copy) after MI. OpIdx is
// advanced past the defs so applyMappingSrc can continue from it. Returns
// false after reporting a failure for unsupported or invalid mappings.
bool RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S16:
    case VgprV4S32:
    case VgprV6S32:
    case VgprV8S32:
    case VgprV32S16: {
      // Already in the right place; just verify.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprBRC:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB160:
    case VgprB256:
    case VgprB512:
    case VgprBRC:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      // Already in the right place; just verify against the size class.
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      // Uniform s1 produced as a VCC lane mask; read it back to an SGPR via
      // COPY_SCC_VCC + trunc if the original def has uses.
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg)) {
        auto CopyS32_Vcc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
        B.buildTrunc(Reg, CopyS32_Vcc);
      }
      break;
    }
    case UniInVgprS16: {
      // Uniform s16 produced in a VGPR: any-extend to s32, read any lane,
      // trunc back to s16 in an SGPR.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
      buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
      B.buildTrunc(Reg, NewSgprDstS32);
      break;
    }
    case UniInVgprS32:
    case UniInVgprS64:
    case UniInVgprV2S16:
    case UniInVgprV2S32:
    case UniInVgprV4S32:
    case UniInVgprV2S64: {
      // Uniform value produced in a VGPR: redirect the def and read any lane
      // back into the original SGPR.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB160:
    case UniInVgprB256:
    case UniInVgprB512: {
      // Same as above, for B-typed (size-class) values.
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      // Narrow def computed in s32; trunc back to the original width if used.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg))
        B.buildTrunc(Reg, NewDst);
      break;
    }
    case Sgpr32ToVgprDst:
    case Sgpr64ToVgprDst: {
      // Value produced in an SGPR but the def must be VGPR: insert a copy.
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == VgprRB);
      Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
      B.buildCopy(Reg, Op.getReg());
      break;
    }
    case InvalidMapping: {
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
      return false;
    }
    default:
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
      return false;
    }
  }

  return true;
}
1880
// Legalize the source (use) operands of \p MI according to \p MethodIDs.
// MethodIDs[i] names the register bank + LLT required for operand \p OpIdx;
// operands mapped as None/IntrId/Imm are left untouched. Where the current
// bank does not match, this inserts copies / extends / readfirstlanes before
// MI (the caller positioned the insert point), and records operands that need
// a waterfall loop in \p WFI. Returns false after reporting a GISel failure
// when an ID is not supported.
bool RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    WaterfallInfo &WFI) {
  // i indexes MethodIDs; OpIdx (by-reference, shared with the caller) tracks
  // the actual MI operand and keeps advancing across dst and src mappings.
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      // A uniform S1 in an sgpr is moved into vcc by any-extending it to S32
      // and going through G_AMDGPU_COPY_VCC_SCC.
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      // Already in the required bank/type; nothing to insert.
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprBRC:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      // B-types only fix the size; the concrete LLT is taken from Ty itself.
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S16:
    case VgprV4S32:
    case VgprV6S32:
    case VgprV8S32:
    case VgprV32S16:
    case VgprV32S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      // sgpr -> vgpr is always legal via a plain copy.
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB160:
    case VgprB256:
    case VgprB512:
    case VgprBRC:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // sgpr waterfall, scalars, and vectors
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      // Divergent operand that must be sgpr: defer to a waterfall loop. The
      // loop range is just this instruction unless it was already widened.
      if (RB != SgprRB) {
        WFI.SgprWaterfallOperandRegs.insert(Reg);
        if (!WFI.Start.isValid()) {
          WFI.Start = MI.getIterator();
          WFI.End = std::next(MI.getIterator());
        }
      }
      break;
    }
    case SgprP0Call_WF:
    case SgprP4Call_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      // Divergent call target: the waterfall loop must enclose the whole
      // call sequence, from stack setup to stack teardown.
      if (RB != SgprRB) {
        WFI.SgprWaterfallOperandRegs.insert(Reg);

        // Find the ADJCALLSTACKUP before the call.
        MachineBasicBlock::iterator Start = MI.getIterator();
        while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
          --Start;

        // Find the ADJCALLSTACKDOWN after the call (include it in range).
        MachineBasicBlock::iterator End = MI.getIterator();
        while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
          ++End;
        ++End;

        B.setInsertPt(*MI.getParent(), Start);
        WFI.Start = Start;
        WFI.End = End;
      }
      break;
    }
    case SgprB32_M0:
    // NOTE(review): one case label appears to be missing from this listing
    // between SgprB32_M0 and SgprB64_ReadFirstLane (original line 2023) —
    // verify against upstream before editing this switch.
    case SgprB64_ReadFirstLane: {
      // Uniform value living in a vgpr: materialize it in an sgpr with
      // readfirstlane.
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB == SgprRB)
        break;
      assert(RB == VgprRB);
      Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
      buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
      Op.setReg(NewSGPR);
      break;
    }
    // NOTE(review): the case label for the following block is missing from
    // this listing (original line 2034); it is a scalar readfirstlane-style
    // ID — verify against upstream.
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB == SgprRB)
        break;
      assert(RB == VgprRB);
      Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
      buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
      Op.setReg(NewSGPR);
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
      // most of times meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    case Sgpr32SExt: {
      // S1 excluded here (see Sgpr32AExtBoolInReg for the S1 path).
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    case Vgpr32AExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Vgpr32SExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Vgpr32ZExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    default:
      // NOTE(review): the 'reportGISelFailure(' call head is missing from
      // this listing (original line 2103).
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
      return false;
    }
  }
  return true;
}
2111
2112[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
2113 const RegisterBank *RB,
2115 unsigned StartOpIdx,
2116 unsigned EndOpIdx) {
2117 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2118 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
2119 return false;
2120 }
2121 return true;
2122}
2123
2124bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2125 MachineInstr &MI, unsigned RsrcIdx) {
2126 const unsigned NumDefs = MI.getNumExplicitDefs();
2127
2128 MachineBasicBlock *MBB = MI.getParent();
2129 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
2130
2131 // Defs are vgpr.
2132 for (unsigned i = 0; i < NumDefs; ++i) {
2133 Register Reg = MI.getOperand(i).getReg();
2134 if (MRI.getRegBank(Reg) == VgprRB)
2135 continue;
2136
2137 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
2138 MI.getOperand(i).setReg(NewVgprDst);
2139 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2140 }
2141
2142 B.setInstrAndDebugLoc(MI);
2143
2144 // Register uses before RsrcIdx are vgpr.
2145 for (unsigned i = NumDefs; i < RsrcIdx; ++i) {
2146 MachineOperand &Op = MI.getOperand(i);
2147 if (!Op.isReg())
2148 continue;
2149
2150 Register Reg = Op.getReg();
2151 if (!Reg.isVirtual())
2152 continue;
2153
2154 if (MRI.getRegBank(Reg) == VgprRB)
2155 continue;
2156
2157 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
2158 Op.setReg(Copy.getReg(0));
2159 }
2160
2161 SmallSet<Register, 4> OpsToWaterfall;
2162
2163 // Register use RsrcIdx (and later register operands) is sgpr.
2164 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
2165 MachineOperand &Op = MI.getOperand(i);
2166 if (!Op.isReg())
2167 continue;
2168
2169 Register Reg = Op.getReg();
2170 if (MRI.getRegBank(Reg) != SgprRB)
2171 OpsToWaterfall.insert(Reg);
2172 }
2173
2174 if (!OpsToWaterfall.empty()) {
2175 MachineBasicBlock::iterator MII = MI.getIterator();
2176 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
2177 }
2178
2179 return true;
2180}
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
@ ICMP_NE
not equal
Definition InstrTypes.h:698
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
void push_back(const T &Elt)
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:155
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
Definition Utils.cpp:257
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:432
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs