LLVM 22.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
26
27#define DEBUG_TYPE "amdgpu-regbanklegalize"
28
29using namespace llvm;
30using namespace AMDGPU;
31
34 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
35 : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
36 MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
37 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
38 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
39 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
40
42 const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI);
43 const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI);
44
45 SmallSet<Register, 4> WaterfallSgprs;
46 unsigned OpIdx = 0;
47 if (Mapping.DstOpMapping.size() > 0) {
48 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
49 applyMappingDst(MI, OpIdx, Mapping.DstOpMapping);
50 }
51 if (Mapping.SrcOpMapping.size() > 0) {
52 B.setInstr(MI);
53 applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs);
54 }
55
56 lower(MI, Mapping, WaterfallSgprs);
57}
58
59bool RegBankLegalizeHelper::executeInWaterfallLoop(
61 SmallSet<Register, 4> &SGPROperandRegs) {
62 // Track use registers which have already been expanded with a readfirstlane
63 // sequence. This may have multiple uses if moving a sequence.
64 DenseMap<Register, Register> WaterfalledRegMap;
65
66 MachineBasicBlock &MBB = B.getMBB();
67 MachineFunction &MF = B.getMF();
68
70 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
71 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
72 if (IsWave32) {
73 MovExecOpc = AMDGPU::S_MOV_B32;
74 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
75 XorTermOpc = AMDGPU::S_XOR_B32_term;
76 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
77 ExecReg = AMDGPU::EXEC_LO;
78 } else {
79 MovExecOpc = AMDGPU::S_MOV_B64;
80 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
81 XorTermOpc = AMDGPU::S_XOR_B64_term;
82 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
83 ExecReg = AMDGPU::EXEC;
84 }
85
86#ifndef NDEBUG
87 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
88#endif
89
90 MachineRegisterInfo &MRI = *B.getMRI();
91 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
92 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
93
94 // Don't bother using generic instructions/registers for the exec mask.
95 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
96
97 Register SavedExec = MRI.createVirtualRegister(WaveRC);
98
99 // To insert the loop we need to split the block. Move everything before
100 // this point to a new block, and insert a new empty block before this
101 // instruction.
104 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
105 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
107 ++MBBI;
108 MF.insert(MBBI, LoopBB);
109 MF.insert(MBBI, BodyBB);
110 MF.insert(MBBI, RestoreExecBB);
111 MF.insert(MBBI, RemainderBB);
112
113 LoopBB->addSuccessor(BodyBB);
114 BodyBB->addSuccessor(RestoreExecBB);
115 BodyBB->addSuccessor(LoopBB);
116
117 // Move the rest of the block into a new block.
119 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
120
121 MBB.addSuccessor(LoopBB);
122 RestoreExecBB->addSuccessor(RemainderBB);
123
124 B.setInsertPt(*LoopBB, LoopBB->end());
125
126 // +-MBB:------------+
127 // | ... |
128 // | %0 = G_INST_1 |
129 // | %Dst = MI %Vgpr |
130 // | %1 = G_INST_2 |
131 // | ... |
132 // +-----------------+
133 // ->
134 // +-MBB-------------------------------+
135 // | ... |
136 // | %0 = G_INST_1 |
137 // | %SaveExecReg = S_MOV_B32 $exec_lo |
138 // +----------------|------------------+
139 // | /------------------------------|
140 // V V |
141 // +-LoopBB---------------------------------------------------------------+ |
142 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
143 // | instead of executing for each lane, see if other lanes had | |
144 // | same value for %Vgpr and execute for them also. | |
145 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
146 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
147 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
148 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
149 // +----------------|-----------------------------------------------------+ |
150 // V |
151 // +-BodyBB------------------------------------------------------------+ |
152 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
153 // | executed only for active lanes and written to Dst | |
154 // | $exec = S_XOR_B32 $exec, %SavedExec | |
155 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
156 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
157 // | SI_WATERFALL_LOOP LoopBB |-----|
158 // +----------------|--------------------------------------------------+
159 // V
160 // +-RestoreExecBB--------------------------+
161 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
162 // +----------------|-----------------------+
163 // V
164 // +-RemainderBB:----------------------+
165 // | %1 = G_INST_2 |
166 // | ... |
167 // +---------------------------------- +
168
169 // Move the instruction into the loop body. Note we moved everything after
170 // Range.end() already into a new block, so Range.end() is no longer valid.
171 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
172
173 // Figure out the iterator range after splicing the instructions.
174 MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
175 auto NewEnd = BodyBB->end();
176 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
177
178 B.setMBB(*LoopBB);
179 Register CondReg;
180
181 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
182 for (MachineOperand &Op : MI.all_uses()) {
183 Register OldReg = Op.getReg();
184 if (!SGPROperandRegs.count(OldReg))
185 continue;
186
187 // See if we already processed this register in another instruction in
188 // the sequence.
189 auto OldVal = WaterfalledRegMap.find(OldReg);
190 if (OldVal != WaterfalledRegMap.end()) {
191 Op.setReg(OldVal->second);
192 continue;
193 }
194
195 Register OpReg = Op.getReg();
196 LLT OpTy = MRI.getType(OpReg);
197
198 // TODO: support for agpr
199 assert(MRI.getRegBank(OpReg) == VgprRB);
200 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
201 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
202
203 // Build the comparison(s), CurrentLaneReg == OpReg.
204 unsigned OpSize = OpTy.getSizeInBits();
205 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
206 LLT PartTy = LLT::scalar(PartSize);
207 unsigned NumParts = OpSize / PartSize;
209 SmallVector<Register, 8> CurrentLaneParts;
210
211 if (NumParts == 1) {
212 OpParts.push_back(OpReg);
213 CurrentLaneParts.push_back(CurrentLaneReg);
214 } else {
215 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
216 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
217 for (unsigned i = 0; i < NumParts; ++i) {
218 OpParts.push_back(UnmergeOp.getReg(i));
219 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
220 }
221 }
222
223 for (unsigned i = 0; i < NumParts; ++i) {
224 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
225 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
226
227 if (!CondReg)
228 CondReg = CmpReg;
229 else
230 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
231 }
232
233 Op.setReg(CurrentLaneReg);
234
235 // Make sure we don't re-process this register again.
236 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
237 }
238 }
239
240 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
241 Register CondRegLM =
242 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
243 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
244
245 // Update EXEC, save the original EXEC value to SavedExec.
246 B.buildInstr(AndSaveExecOpc)
247 .addDef(SavedExec)
248 .addReg(CondRegLM, RegState::Kill);
249 MRI.setSimpleHint(SavedExec, CondRegLM);
250
251 B.setInsertPt(*BodyBB, BodyBB->end());
252
253 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
254 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
255
256 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
257 // s_cbranch_scc0?
258
259 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
260 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
261
262 // Save the EXEC mask before the loop.
263 B.setInsertPt(MBB, MBB.end());
264 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
265
266 // Restore the EXEC mask after the loop.
267 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
268 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
269
270 // Set the insert point after the original instruction, so any new
271 // instructions will be in the remainder.
272 B.setInsertPt(*RemainderBB, RemainderBB->begin());
273
274 return true;
275}
276
277void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
278 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
279 MachineFunction &MF = B.getMF();
280 assert(MI.getNumMemOperands() == 1);
281 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
282 Register Dst = MI.getOperand(0).getReg();
283 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
284 Register Base = MI.getOperand(1).getReg();
285 LLT PtrTy = MRI.getType(Base);
286 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
287 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
288 SmallVector<Register, 4> LoadPartRegs;
289
290 unsigned ByteOffset = 0;
291 for (LLT PartTy : LLTBreakdown) {
292 Register BasePlusOffset;
293 if (ByteOffset == 0) {
294 BasePlusOffset = Base;
295 } else {
296 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
297 BasePlusOffset =
298 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
299 }
300 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
301 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
302 LoadPartRegs.push_back(LoadPart.getReg(0));
303 ByteOffset += PartTy.getSizeInBytes();
304 }
305
306 if (!MergeTy.isValid()) {
307 // Loads are of same size, concat or merge them together.
308 B.buildMergeLikeInstr(Dst, LoadPartRegs);
309 } else {
310 // Loads are not all of same size, need to unmerge them to smaller pieces
311 // of MergeTy type, then merge pieces to Dst.
312 SmallVector<Register, 4> MergeTyParts;
313 for (Register Reg : LoadPartRegs) {
314 if (MRI.getType(Reg) == MergeTy) {
315 MergeTyParts.push_back(Reg);
316 } else {
317 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
318 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
319 MergeTyParts.push_back(Unmerge.getReg(i));
320 }
321 }
322 B.buildMergeLikeInstr(Dst, MergeTyParts);
323 }
324 MI.eraseFromParent();
325}
326
327void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
328 LLT MergeTy) {
329 MachineFunction &MF = B.getMF();
330 assert(MI.getNumMemOperands() == 1);
331 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
332 Register Dst = MI.getOperand(0).getReg();
333 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
334 Register Base = MI.getOperand(1).getReg();
335
336 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
337 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
338
339 if (WideTy.isScalar()) {
340 B.buildTrunc(Dst, WideLoad);
341 } else {
342 SmallVector<Register, 4> MergeTyParts;
343 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
344
345 LLT DstTy = MRI.getType(Dst);
346 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
347 for (unsigned i = 0; i < NumElts; ++i) {
348 MergeTyParts.push_back(Unmerge.getReg(i));
349 }
350 B.buildMergeLikeInstr(Dst, MergeTyParts);
351 }
352 MI.eraseFromParent();
353}
354
355void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
356 Register Dst = MI.getDstReg();
357 Register Ptr = MI.getPointerReg();
358 MachineMemOperand &MMO = MI.getMMO();
359 unsigned MemSize = 8 * MMO.getSize().getValue();
360
361 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
362
363 if (MI.getOpcode() == G_LOAD) {
364 B.buildLoad(Dst, Ptr, *WideMMO);
365 } else {
366 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
367
368 if (MI.getOpcode() == G_ZEXTLOAD) {
369 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
370 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
371 B.buildAnd(Dst, Load, MaskCst);
372 } else {
373 assert(MI.getOpcode() == G_SEXTLOAD);
374 B.buildSExtInReg(Dst, Load, MemSize);
375 }
376 }
377
378 MI.eraseFromParent();
379}
380
381void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
382 Register Dst = MI.getOperand(0).getReg();
383 LLT Ty = MRI.getType(Dst);
384 Register Src = MI.getOperand(1).getReg();
385 unsigned Opc = MI.getOpcode();
386 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
387 if (Ty == S32 || Ty == S16) {
388 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
389 auto False = B.buildConstant({VgprRB, Ty}, 0);
390 B.buildSelect(Dst, Src, True, False);
391 } else if (Ty == S64) {
392 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
393 auto False = B.buildConstant({VgprRB_S32}, 0);
394 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
395 MachineInstrBuilder Hi;
396 switch (Opc) {
397 case G_SEXT:
398 Hi = Lo;
399 break;
400 case G_ZEXT:
401 Hi = False;
402 break;
403 case G_ANYEXT:
404 Hi = B.buildUndef({VgprRB_S32});
405 break;
406 default:
407 llvm_unreachable("Opcode not supported");
408 }
409
410 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
411 } else {
412 llvm_unreachable("Type not supported");
413 }
414
415 MI.eraseFromParent();
416}
417
418std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
419 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
420 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
421 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
422 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
423 return {Lo.getReg(0), Hi.getReg(0)};
424}
425
426std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
427 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
428 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
429 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
430 return {Lo.getReg(0), Hi.getReg(0)};
431}
432
433std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
434 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
435 auto Lo = PackedS32;
436 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
437 return {Lo.getReg(0), Hi.getReg(0)};
438}
439
440std::pair<Register, Register>
441RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
442 auto [Lo32, Hi32] = unpackAExt(Reg);
443 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
444 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
445}
446
447void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
448 Register Lo, Hi;
449 switch (MI.getOpcode()) {
450 case AMDGPU::G_SHL: {
451 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
452 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
453 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
454 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
455 break;
456 }
457 case AMDGPU::G_LSHR: {
458 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
459 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
460 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
461 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
462 break;
463 }
464 case AMDGPU::G_ASHR: {
465 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
466 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
467 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
468 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
469 break;
470 }
471 default:
472 llvm_unreachable("Unpack lowering not implemented");
473 }
474 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
475 MI.eraseFromParent();
476}
477
478void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
479 Register Lo, Hi;
480 switch (MI.getOpcode()) {
481 case AMDGPU::G_SMIN:
482 case AMDGPU::G_SMAX: {
483 // For signed operations, use sign extension
484 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
485 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
486 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
487 .getReg(0);
488 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
489 .getReg(0);
490 break;
491 }
492 case AMDGPU::G_UMIN:
493 case AMDGPU::G_UMAX: {
494 // For unsigned operations, use zero extension
495 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
496 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
497 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
498 .getReg(0);
499 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
500 .getReg(0);
501 break;
502 }
503 default:
504 llvm_unreachable("Unpack min/max lowering not implemented");
505 }
506 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
507 MI.eraseFromParent();
508}
509
510void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
511 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
512 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
513 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
514 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
515 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
516 {ResLo.getReg(0), ResHi.getReg(0)});
517 MI.eraseFromParent();
518}
519
522 return (GI->is(Intrinsic::amdgcn_sbfe));
523
524 return MI.getOpcode() == AMDGPU::G_SBFX;
525}
526
// Lower a divergent 64-bit bitfield extract (G_SBFX/G_UBFX or the
// corresponding bfe intrinsic) into 32-bit VALU-friendly operations.
// Variable widths use a shift-left/shift-right expansion; constant widths
// use a single 32-bit BFX on the relevant half.
void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  // Intrinsic form has the intrinsic ID at operand 1, shifting the sources.
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract bitfield from Src, LSBit is the least-significant bit for the
  // extraction (field offset) and Width is size of bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments are for signed bitfield extract, similar for unsigned. x is sign
  // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return;
  }

  // Width is a known constant: do a 32-bit BFX on the half that contains
  // the field and synthesize the other half.
  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
}
585
// Lower a uniform bitfield extract to one of the scalar machine
// instructions S_BFE_{I|U}{32|64}. These take a packed S32 second operand:
// field offset (masked to 6 bits) in the low half, field width in bits
// [31:16].
void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  // Intrinsic form has the intrinsic ID at operand 1, shifting the sources.
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // For uniform bit field extract there are 4 available instructions, but
  // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
  // field offset in low and size in high 16 bits.

  // Src1 Hi16|Lo16 = Size|FieldOffset
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select machine instruction, because of reg class constraining, insert
  // copies from reg class to reg bank.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                        *ST.getRegisterInfo(), RBI))
    llvm_unreachable("failed to constrain BFE");

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
}
618
619void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
620 Register Dst = MI.getOperand(0).getReg();
621 LLT DstTy = MRI.getType(Dst);
622 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
623 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
624 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
625 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
626 unsigned Opc = MI.getOpcode();
627 auto Flags = MI.getFlags();
628 auto Lo =
629 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
630 auto Hi =
631 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
632 B.buildMergeLikeInstr(Dst, {Lo, Hi});
633 MI.eraseFromParent();
634}
635
636void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
637 Register Dst = MI.getOperand(0).getReg();
638 assert(MRI.getType(Dst) == V2S16);
639 unsigned Opc = MI.getOpcode();
640 auto Flags = MI.getFlags();
641
642 if (MI.getNumOperands() == 2) {
643 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
644 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
645 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
646 B.buildMergeLikeInstr(Dst, {Lo, Hi});
647 MI.eraseFromParent();
648 return;
649 }
650
651 assert(MI.getNumOperands() == 3);
652 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
653 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
654 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
655 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
656 B.buildMergeLikeInstr(Dst, {Lo, Hi});
657 MI.eraseFromParent();
658}
659
660void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
661 Register Dst = MI.getOperand(0).getReg();
662 LLT DstTy = MRI.getType(Dst);
663 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
664 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
665 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
666 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
667 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
668 Register Cond = MI.getOperand(1).getReg();
669 auto Flags = MI.getFlags();
670 auto Lo =
671 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
672 auto Hi =
673 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
674
675 B.buildMergeLikeInstr(Dst, {Lo, Hi});
676 MI.eraseFromParent();
677}
678
// Lower a 64-bit sign-extend-in-register (extend from the low Amt bits) by
// operating on the two 32-bit halves of the source.
void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
  if (Amt <= 32) {
    // Sign bit lives in the low half; the high half is derived from it.
    // NOTE(review): the freeze presumably keeps a poisoned low half from
    // producing two independently-poisoned result halves — confirm.
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    // Broadcast the sign bit of Lo across the whole high half.
    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    // Sign bit lives in the high half; low half passes through unchanged.
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}
705
706void RegBankLegalizeHelper::lower(MachineInstr &MI,
707 const RegBankLLTMapping &Mapping,
708 SmallSet<Register, 4> &WaterfallSgprs) {
709
710 switch (Mapping.LoweringMethod) {
711 case DoNotLower:
712 break;
713 case VccExtToSel:
714 return lowerVccExtToSel(MI);
715 case UniExtToSel: {
716 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
717 auto True = B.buildConstant({SgprRB, Ty},
718 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
719 auto False = B.buildConstant({SgprRB, Ty}, 0);
720 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
721 // We are making select here. S1 cond was already 'any-extended to S32' +
722 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
723 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
724 False);
725 MI.eraseFromParent();
726 return;
727 }
728 case UnpackBitShift:
729 return lowerUnpackBitShift(MI);
730 case UnpackMinMax:
731 return lowerUnpackMinMax(MI);
732 case ScalarizeToS16:
733 return lowerSplitTo16(MI);
734 case Ext32To64: {
735 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
736 MachineInstrBuilder Hi;
737 switch (MI.getOpcode()) {
738 case AMDGPU::G_ZEXT: {
739 Hi = B.buildConstant({RB, S32}, 0);
740 break;
741 }
742 case AMDGPU::G_SEXT: {
743 // Replicate sign bit from 32-bit extended part.
744 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
745 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
746 break;
747 }
748 case AMDGPU::G_ANYEXT: {
749 Hi = B.buildUndef({RB, S32});
750 break;
751 }
752 default:
753 llvm_unreachable("Unsuported Opcode in Ext32To64");
754 }
755
756 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
757 {MI.getOperand(1).getReg(), Hi});
758 MI.eraseFromParent();
759 return;
760 }
761 case UniCstExt: {
762 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
763 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
764
765 MI.eraseFromParent();
766 return;
767 }
768 case VgprToVccCopy: {
769 Register Src = MI.getOperand(1).getReg();
770 LLT Ty = MRI.getType(Src);
771 // Take lowest bit from each lane and put it in lane mask.
772 // Lowering via compare, but we need to clean high bits first as compare
773 // compares all bits in register.
774 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
775 if (Ty == S64) {
776 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
777 auto One = B.buildConstant(VgprRB_S32, 1);
778 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
779 auto Zero = B.buildConstant(VgprRB_S32, 0);
780 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
781 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
782 } else {
783 assert(Ty == S32 || Ty == S16);
784 auto One = B.buildConstant({VgprRB, Ty}, 1);
785 B.buildAnd(BoolSrc, Src, One);
786 }
787 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
788 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
789 MI.eraseFromParent();
790 return;
791 }
792 case V_BFE:
793 return lowerV_BFE(MI);
794 case S_BFE:
795 return lowerS_BFE(MI);
796 case SplitTo32:
797 return lowerSplitTo32(MI);
798 case SplitTo32Select:
799 return lowerSplitTo32Select(MI);
801 return lowerSplitTo32SExtInReg(MI);
802 case SplitLoad: {
803 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
804 unsigned Size = DstTy.getSizeInBits();
805 // Even split to 128-bit loads
806 if (Size > 128) {
807 LLT B128;
808 if (DstTy.isVector()) {
809 LLT EltTy = DstTy.getElementType();
810 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
811 } else {
812 B128 = LLT::scalar(128);
813 }
814 if (Size / 128 == 2)
815 splitLoad(MI, {B128, B128});
816 else if (Size / 128 == 4)
817 splitLoad(MI, {B128, B128, B128, B128});
818 else {
819 LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
820 llvm_unreachable("SplitLoad type not supported for MI");
821 }
822 }
823 // 64 and 32 bit load
824 else if (DstTy == S96)
825 splitLoad(MI, {S64, S32}, S32);
826 else if (DstTy == V3S32)
827 splitLoad(MI, {V2S32, S32}, S32);
828 else if (DstTy == V6S16)
829 splitLoad(MI, {V4S16, V2S16}, V2S16);
830 else {
831 LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
832 llvm_unreachable("SplitLoad type not supported for MI");
833 }
834 break;
835 }
836 case WidenLoad: {
837 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
838 if (DstTy == S96)
839 widenLoad(MI, S128);
840 else if (DstTy == V3S32)
841 widenLoad(MI, V4S32, S32);
842 else if (DstTy == V6S16)
843 widenLoad(MI, V8S16, V2S16);
844 else {
845 LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
846 llvm_unreachable("WidenLoad type not supported for MI");
847 }
848 break;
849 }
850 case UnpackAExt:
851 return lowerUnpackAExt(MI);
852 case WidenMMOToS32:
853 return widenMMOToS32(cast<GAnyLoad>(MI));
854 }
855
856 if (!WaterfallSgprs.empty()) {
857 MachineBasicBlock::iterator I = MI.getIterator();
858 executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
859 }
860}
861
862LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
863 switch (ID) {
864 case Vcc:
865 case UniInVcc:
866 return LLT::scalar(1);
867 case Sgpr16:
868 case Vgpr16:
869 case UniInVgprS16:
870 return LLT::scalar(16);
871 case Sgpr32:
872 case Sgpr32_WF:
873 case Sgpr32Trunc:
874 case Sgpr32AExt:
876 case Sgpr32SExt:
877 case Sgpr32ZExt:
878 case UniInVgprS32:
879 case Vgpr32:
880 case Vgpr32SExt:
881 case Vgpr32ZExt:
882 return LLT::scalar(32);
883 case Sgpr64:
884 case Vgpr64:
885 case UniInVgprS64:
886 return LLT::scalar(64);
887 case Sgpr128:
888 case Vgpr128:
889 return LLT::scalar(128);
890 case SgprP0:
891 case VgprP0:
892 return LLT::pointer(0, 64);
893 case SgprP1:
894 case VgprP1:
895 return LLT::pointer(1, 64);
896 case SgprP3:
897 case VgprP3:
898 return LLT::pointer(3, 32);
899 case SgprP4:
900 case VgprP4:
901 return LLT::pointer(4, 64);
902 case SgprP5:
903 case VgprP5:
904 return LLT::pointer(5, 32);
905 case SgprP8:
906 return LLT::pointer(8, 128);
907 case SgprV2S16:
908 case VgprV2S16:
909 case UniInVgprV2S16:
910 return LLT::fixed_vector(2, 16);
911 case SgprV2S32:
912 case VgprV2S32:
913 return LLT::fixed_vector(2, 32);
914 case SgprV4S32:
915 case SgprV4S32_WF:
916 case VgprV4S32:
917 case UniInVgprV4S32:
918 return LLT::fixed_vector(4, 32);
919 default:
920 return LLT();
921 }
922}
923
924LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
925 switch (ID) {
926 case SgprB32:
927 case VgprB32:
928 case UniInVgprB32:
929 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
930 isAnyPtr(Ty, 32))
931 return Ty;
932 return LLT();
933 case SgprPtr32:
934 case VgprPtr32:
935 return isAnyPtr(Ty, 32) ? Ty : LLT();
936 case SgprPtr64:
937 case VgprPtr64:
938 return isAnyPtr(Ty, 64) ? Ty : LLT();
939 case SgprPtr128:
940 case VgprPtr128:
941 return isAnyPtr(Ty, 128) ? Ty : LLT();
942 case SgprB64:
943 case VgprB64:
944 case UniInVgprB64:
945 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
946 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
947 return Ty;
948 return LLT();
949 case SgprB96:
950 case VgprB96:
951 case UniInVgprB96:
952 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
953 Ty == LLT::fixed_vector(6, 16))
954 return Ty;
955 return LLT();
956 case SgprB128:
957 case VgprB128:
958 case UniInVgprB128:
959 if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
960 Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128))
961 return Ty;
962 return LLT();
963 case SgprB256:
964 case VgprB256:
965 case UniInVgprB256:
966 if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
967 Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
968 return Ty;
969 return LLT();
970 case SgprB512:
971 case VgprB512:
972 case UniInVgprB512:
973 if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
974 Ty == LLT::fixed_vector(8, 64))
975 return Ty;
976 return LLT();
977 default:
978 return LLT();
979 }
980}
981
982const RegisterBank *
983RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
984 switch (ID) {
985 case Vcc:
986 return VccRB;
987 case Sgpr16:
988 case Sgpr32:
989 case Sgpr32_WF:
990 case Sgpr64:
991 case Sgpr128:
992 case SgprP0:
993 case SgprP1:
994 case SgprP3:
995 case SgprP4:
996 case SgprP5:
997 case SgprP8:
998 case SgprPtr32:
999 case SgprPtr64:
1000 case SgprPtr128:
1001 case SgprV2S16:
1002 case SgprV2S32:
1003 case SgprV4S32:
1004 case SgprV4S32_WF:
1005 case SgprB32:
1006 case SgprB64:
1007 case SgprB96:
1008 case SgprB128:
1009 case SgprB256:
1010 case SgprB512:
1011 case UniInVcc:
1012 case UniInVgprS16:
1013 case UniInVgprS32:
1014 case UniInVgprS64:
1015 case UniInVgprV2S16:
1016 case UniInVgprV4S32:
1017 case UniInVgprB32:
1018 case UniInVgprB64:
1019 case UniInVgprB96:
1020 case UniInVgprB128:
1021 case UniInVgprB256:
1022 case UniInVgprB512:
1023 case Sgpr32Trunc:
1024 case Sgpr32AExt:
1026 case Sgpr32SExt:
1027 case Sgpr32ZExt:
1028 return SgprRB;
1029 case Vgpr16:
1030 case Vgpr32:
1031 case Vgpr64:
1032 case Vgpr128:
1033 case VgprP0:
1034 case VgprP1:
1035 case VgprP3:
1036 case VgprP4:
1037 case VgprP5:
1038 case VgprPtr32:
1039 case VgprPtr64:
1040 case VgprPtr128:
1041 case VgprV2S16:
1042 case VgprV2S32:
1043 case VgprV4S32:
1044 case VgprB32:
1045 case VgprB64:
1046 case VgprB96:
1047 case VgprB128:
1048 case VgprB256:
1049 case VgprB512:
1050 case Vgpr32SExt:
1051 case Vgpr32ZExt:
1052 return VgprRB;
1053 default:
1054 return nullptr;
1055 }
1056}
1057
1058void RegBankLegalizeHelper::applyMappingDst(
1059 MachineInstr &MI, unsigned &OpIdx,
1060 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1061 // Defs start from operand 0
1062 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1063 if (MethodIDs[OpIdx] == None)
1064 continue;
1065 MachineOperand &Op = MI.getOperand(OpIdx);
1066 Register Reg = Op.getReg();
1067 LLT Ty = MRI.getType(Reg);
1068 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1069
1070 switch (MethodIDs[OpIdx]) {
1071 // vcc, sgpr and vgpr scalars, pointers and vectors
1072 case Vcc:
1073 case Sgpr16:
1074 case Sgpr32:
1075 case Sgpr64:
1076 case Sgpr128:
1077 case SgprP0:
1078 case SgprP1:
1079 case SgprP3:
1080 case SgprP4:
1081 case SgprP5:
1082 case SgprP8:
1083 case SgprV2S16:
1084 case SgprV2S32:
1085 case SgprV4S32:
1086 case Vgpr16:
1087 case Vgpr32:
1088 case Vgpr64:
1089 case Vgpr128:
1090 case VgprP0:
1091 case VgprP1:
1092 case VgprP3:
1093 case VgprP4:
1094 case VgprP5:
1095 case VgprV2S16:
1096 case VgprV2S32:
1097 case VgprV4S32: {
1098 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1099 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1100 break;
1101 }
1102 // sgpr and vgpr B-types
1103 case SgprB32:
1104 case SgprB64:
1105 case SgprB96:
1106 case SgprB128:
1107 case SgprB256:
1108 case SgprB512:
1109 case SgprPtr32:
1110 case SgprPtr64:
1111 case SgprPtr128:
1112 case VgprB32:
1113 case VgprB64:
1114 case VgprB96:
1115 case VgprB128:
1116 case VgprB256:
1117 case VgprB512:
1118 case VgprPtr32:
1119 case VgprPtr64:
1120 case VgprPtr128: {
1121 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1122 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1123 break;
1124 }
1125 // uniform in vcc/vgpr: scalars, vectors and B-types
1126 case UniInVcc: {
1127 assert(Ty == S1);
1128 assert(RB == SgprRB);
1129 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1130 Op.setReg(NewDst);
1131 auto CopyS32_Vcc =
1132 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1133 B.buildTrunc(Reg, CopyS32_Vcc);
1134 break;
1135 }
1136 case UniInVgprS16: {
1137 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1138 assert(RB == SgprRB);
1139 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1140 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1141 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1142 Op.setReg(NewVgprDstS16);
1143 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1144 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1145 B.buildTrunc(Reg, NewSgprDstS32);
1146 break;
1147 }
1148 case UniInVgprS32:
1149 case UniInVgprS64:
1150 case UniInVgprV2S16:
1151 case UniInVgprV4S32: {
1152 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1153 assert(RB == SgprRB);
1154 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1155 Op.setReg(NewVgprDst);
1156 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1157 break;
1158 }
1159 case UniInVgprB32:
1160 case UniInVgprB64:
1161 case UniInVgprB96:
1162 case UniInVgprB128:
1163 case UniInVgprB256:
1164 case UniInVgprB512: {
1165 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1166 assert(RB == SgprRB);
1167 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1168 Op.setReg(NewVgprDst);
1169 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1170 break;
1171 }
1172 // sgpr trunc
1173 case Sgpr32Trunc: {
1174 assert(Ty.getSizeInBits() < 32);
1175 assert(RB == SgprRB);
1176 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1177 Op.setReg(NewDst);
1178 if (!MRI.use_empty(Reg))
1179 B.buildTrunc(Reg, NewDst);
1180 break;
1181 }
1182 case InvalidMapping: {
1183 LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump(););
1184 llvm_unreachable("missing fast rule for MI");
1185 }
1186 default:
1187 llvm_unreachable("ID not supported");
1188 }
1189 }
1190}
1191
1192void RegBankLegalizeHelper::applyMappingSrc(
1193 MachineInstr &MI, unsigned &OpIdx,
1194 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1195 SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
1196 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
1197 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1198 continue;
1199
1200 MachineOperand &Op = MI.getOperand(OpIdx);
1201 Register Reg = Op.getReg();
1202 LLT Ty = MRI.getType(Reg);
1203 const RegisterBank *RB = MRI.getRegBank(Reg);
1204
1205 switch (MethodIDs[i]) {
1206 case Vcc: {
1207 assert(Ty == S1);
1208 assert(RB == VccRB || RB == SgprRB);
1209 if (RB == SgprRB) {
1210 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1211 auto CopyVcc_Scc =
1212 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1213 Op.setReg(CopyVcc_Scc.getReg(0));
1214 }
1215 break;
1216 }
1217 // sgpr scalars, pointers and vectors
1218 case Sgpr16:
1219 case Sgpr32:
1220 case Sgpr64:
1221 case Sgpr128:
1222 case SgprP0:
1223 case SgprP1:
1224 case SgprP3:
1225 case SgprP4:
1226 case SgprP5:
1227 case SgprP8:
1228 case SgprV2S16:
1229 case SgprV2S32:
1230 case SgprV4S32: {
1231 assert(Ty == getTyFromID(MethodIDs[i]));
1232 assert(RB == getRegBankFromID(MethodIDs[i]));
1233 break;
1234 }
1235 // sgpr B-types
1236 case SgprB32:
1237 case SgprB64:
1238 case SgprB96:
1239 case SgprB128:
1240 case SgprB256:
1241 case SgprB512:
1242 case SgprPtr32:
1243 case SgprPtr64:
1244 case SgprPtr128: {
1245 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1246 assert(RB == getRegBankFromID(MethodIDs[i]));
1247 break;
1248 }
1249 // vgpr scalars, pointers and vectors
1250 case Vgpr16:
1251 case Vgpr32:
1252 case Vgpr64:
1253 case Vgpr128:
1254 case VgprP0:
1255 case VgprP1:
1256 case VgprP3:
1257 case VgprP4:
1258 case VgprP5:
1259 case VgprV2S16:
1260 case VgprV2S32:
1261 case VgprV4S32: {
1262 assert(Ty == getTyFromID(MethodIDs[i]));
1263 if (RB != VgprRB) {
1264 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1265 Op.setReg(CopyToVgpr.getReg(0));
1266 }
1267 break;
1268 }
1269 // vgpr B-types
1270 case VgprB32:
1271 case VgprB64:
1272 case VgprB96:
1273 case VgprB128:
1274 case VgprB256:
1275 case VgprB512:
1276 case VgprPtr32:
1277 case VgprPtr64:
1278 case VgprPtr128: {
1279 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1280 if (RB != VgprRB) {
1281 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1282 Op.setReg(CopyToVgpr.getReg(0));
1283 }
1284 break;
1285 }
1286 // sgpr waterfall, scalars and vectors
1287 case Sgpr32_WF:
1288 case SgprV4S32_WF: {
1289 assert(Ty == getTyFromID(MethodIDs[i]));
1290 if (RB != SgprRB)
1291 SgprWaterfallOperandRegs.insert(Reg);
1292 break;
1293 }
1294 // sgpr and vgpr scalars with extend
1295 case Sgpr32AExt: {
1296 // Note: this ext allows S1, and it is meant to be combined away.
1297 assert(Ty.getSizeInBits() < 32);
1298 assert(RB == SgprRB);
1299 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1300 Op.setReg(Aext.getReg(0));
1301 break;
1302 }
1303 case Sgpr32AExtBoolInReg: {
1304 // Note: this ext allows S1, and it is meant to be combined away.
1305 assert(Ty.getSizeInBits() == 1);
1306 assert(RB == SgprRB);
1307 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1308 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
1309 // most of times meant to be combined away in AMDGPURegBankCombiner.
1310 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
1311 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
1312 Op.setReg(BoolInReg.getReg(0));
1313 break;
1314 }
1315 case Sgpr32SExt: {
1316 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1317 assert(RB == SgprRB);
1318 auto Sext = B.buildSExt(SgprRB_S32, Reg);
1319 Op.setReg(Sext.getReg(0));
1320 break;
1321 }
1322 case Sgpr32ZExt: {
1323 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1324 assert(RB == SgprRB);
1325 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
1326 Op.setReg(Zext.getReg(0));
1327 break;
1328 }
1329 case Vgpr32SExt: {
1330 // Note this ext allows S1, and it is meant to be combined away.
1331 assert(Ty.getSizeInBits() < 32);
1332 assert(RB == VgprRB);
1333 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
1334 Op.setReg(Sext.getReg(0));
1335 break;
1336 }
1337 case Vgpr32ZExt: {
1338 // Note this ext allows S1, and it is meant to be combined away.
1339 assert(Ty.getSizeInBits() < 32);
1340 assert(RB == VgprRB);
1341 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
1342 Op.setReg(Zext.getReg(0));
1343 break;
1344 }
1345 default:
1346 llvm_unreachable("ID not supported");
1347 }
1348 }
1349}
1350
1352 Register Dst = MI.getOperand(0).getReg();
1353 LLT Ty = MRI.getType(Dst);
1354
1355 if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
1356 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1357
1358 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1359 MI.getOperand(0).setReg(NewDst);
1360 B.buildTrunc(Dst, NewDst);
1361
1362 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1363 Register UseReg = MI.getOperand(i).getReg();
1364
1365 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1366 MachineBasicBlock *DefMBB = DefMI->getParent();
1367
1368 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1369
1370 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1371 MI.getOperand(i).setReg(NewUse.getReg(0));
1372 }
1373
1374 return;
1375 }
1376
1377 // ALL divergent i1 phis should be already lowered and inst-selected into PHI
1378 // with sgpr reg class and S1 LLT.
1379 // Note: this includes divergent phis that don't require lowering.
1380 if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
1381 LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump(););
1382 llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering "
1383 "before RegBankLegalize to lower lane mask(vcc) phis");
1384 }
1385
1386 // We accept all types that can fit in some register class.
1387 // Uniform G_PHIs have all sgpr registers.
1388 // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
1389 if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
1390 Ty == LLT::pointer(4, 64)) {
1391 return;
1392 }
1393
1394 LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump(););
1395 llvm_unreachable("type not supported");
1396}
1397
1398[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1399 const RegisterBank *RB,
1401 unsigned StartOpIdx,
1402 unsigned EndOpIdx) {
1403 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1404 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
1405 return false;
1406 }
1407 return true;
1408}
1409
1411 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1412 // Put RB on all registers
1413 unsigned NumDefs = MI.getNumDefs();
1414 unsigned NumOperands = MI.getNumOperands();
1415
1416 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1417 if (RB == SgprRB)
1418 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1419
1420 if (RB == VgprRB) {
1421 B.setInstr(MI);
1422 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1423 Register Reg = MI.getOperand(i).getReg();
1424 if (MRI.getRegBank(Reg) != RB) {
1425 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1426 MI.getOperand(i).setReg(Copy.getReg(0));
1427 }
1428 }
1429 }
1430}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
const SmallVectorImpl< MachineOperand > & Cond
#define LLVM_DEBUG(...)
Definition Debug.h:114
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping & findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
@ ICMP_NE
not equal
Definition InstrTypes.h:698
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void dump() const
Definition Pass.cpp:146
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool empty() const
Definition SmallSet.h:168
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
void push_back(const T &Elt)
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isAnyPtr(LLT Ty, unsigned Width)
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Kill
The last use of a register.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:532
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:155
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping