LLVM 23.0.0git
AMDGPURegBankLegalizeRules.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeRules.cpp ------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Definitions of RegBankLegalize Rules for all opcodes.
10/// Implementation of container for all the Rules and search.
11/// Fast search for most common case when Rule.Predicate checks LLT and
12/// uniformity of register in operand 0.
13//
14//===----------------------------------------------------------------------===//
15
17#include "AMDGPUInstrInfo.h"
18#include "GCNSubtarget.h"
21#include "llvm/IR/IntrinsicsAMDGPU.h"
23
24#define DEBUG_TYPE "amdgpu-regbanklegalize"
25
26using namespace llvm;
27using namespace AMDGPU;
28
29bool AMDGPU::isAnyPtr(LLT Ty, unsigned Width) {
30 return Ty.isPointer() && Ty.getSizeInBits() == Width;
31}
32
34 std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList,
35 std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList,
37 : DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList),
39
41 std::initializer_list<UniformityLLTOpPredicateID> OpList,
42 std::function<bool(const MachineInstr &)> TestFunc)
44
46 const MachineUniformityInfo &MUI,
47 const MachineRegisterInfo &MRI) {
48 switch (UniID) {
49 case S1:
50 return MRI.getType(Reg) == LLT::scalar(1);
51 case S16:
52 return MRI.getType(Reg) == LLT::scalar(16);
53 case S32:
54 return MRI.getType(Reg) == LLT::scalar(32);
55 case S64:
56 return MRI.getType(Reg) == LLT::scalar(64);
57 case S128:
58 return MRI.getType(Reg) == LLT::scalar(128);
59 case P0:
60 return MRI.getType(Reg) == LLT::pointer(0, 64);
61 case P1:
62 return MRI.getType(Reg) == LLT::pointer(1, 64);
63 case P2:
64 return MRI.getType(Reg) == LLT::pointer(2, 32);
65 case P3:
66 return MRI.getType(Reg) == LLT::pointer(3, 32);
67 case P4:
68 return MRI.getType(Reg) == LLT::pointer(4, 64);
69 case P5:
70 return MRI.getType(Reg) == LLT::pointer(5, 32);
71 case P8:
72 return MRI.getType(Reg) == LLT::pointer(8, 128);
73 case Ptr32:
74 return isAnyPtr(MRI.getType(Reg), 32);
75 case Ptr64:
76 return isAnyPtr(MRI.getType(Reg), 64);
77 case Ptr128:
78 return isAnyPtr(MRI.getType(Reg), 128);
79 case V2S16:
80 return MRI.getType(Reg) == LLT::fixed_vector(2, 16);
81 case V2S32:
82 return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
83 case V3S32:
84 return MRI.getType(Reg) == LLT::fixed_vector(3, 32);
85 case V4S32:
86 return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
87 case B32:
88 return MRI.getType(Reg).getSizeInBits() == 32;
89 case B64:
90 return MRI.getType(Reg).getSizeInBits() == 64;
91 case B96:
92 return MRI.getType(Reg).getSizeInBits() == 96;
93 case B128:
94 return MRI.getType(Reg).getSizeInBits() == 128;
95 case B160:
96 return MRI.getType(Reg).getSizeInBits() == 160;
97 case B256:
98 return MRI.getType(Reg).getSizeInBits() == 256;
99 case B512:
100 return MRI.getType(Reg).getSizeInBits() == 512;
101 case DivAnyTy:
102 return MUI.isDivergent(Reg);
103 case UniS1:
104 return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg);
105 case UniS16:
106 return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg);
107 case UniS32:
108 return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg);
109 case UniS64:
110 return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg);
111 case UniS128:
112 return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniform(Reg);
113 case UniP0:
114 return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg);
115 case UniP1:
116 return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg);
117 case UniP2:
118 return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniform(Reg);
119 case UniP3:
120 return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniform(Reg);
121 case UniP4:
122 return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
123 case UniP5:
124 return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
125 case UniP8:
126 return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg);
127 case UniPtr32:
128 return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg);
129 case UniPtr64:
130 return isAnyPtr(MRI.getType(Reg), 64) && MUI.isUniform(Reg);
131 case UniPtr128:
132 return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg);
133 case UniV2S16:
134 return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
135 case UniV2S32:
136 return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg);
137 case UniB32:
138 return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
139 case UniB64:
140 return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniform(Reg);
141 case UniB96:
142 return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniform(Reg);
143 case UniB128:
144 return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniform(Reg);
145 case UniB160:
146 return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniform(Reg);
147 case UniB256:
148 return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg);
149 case UniB512:
150 return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg);
151 case UniBRC: {
152 if (!MUI.isUniform(Reg))
153 return false;
154 // Check if there is SGPR register class of same size as the LLT.
155 const SIRegisterInfo *TRI =
156 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
157 // There is no 16 bit SGPR register class. Extra size check is required
158 // since getSGPRClassForBitWidth returns SReg_32RegClass for Size 16.
159 unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
160 return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize);
161 }
162 case DivS1:
163 return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg);
164 case DivS16:
165 return MRI.getType(Reg) == LLT::scalar(16) && MUI.isDivergent(Reg);
166 case DivS32:
167 return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg);
168 case DivS64:
169 return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg);
170 case DivS128:
171 return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergent(Reg);
172 case DivP0:
173 return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg);
174 case DivP1:
175 return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg);
176 case DivP2:
177 return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergent(Reg);
178 case DivP3:
179 return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergent(Reg);
180 case DivP4:
181 return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
182 case DivP5:
183 return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
184 case DivPtr32:
185 return isAnyPtr(MRI.getType(Reg), 32) && MUI.isDivergent(Reg);
186 case DivPtr64:
187 return isAnyPtr(MRI.getType(Reg), 64) && MUI.isDivergent(Reg);
188 case DivPtr128:
189 return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg);
190 case DivV2S16:
191 return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
192 case DivV2S32:
193 return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg);
194 case DivV3S32:
195 return MRI.getType(Reg) == LLT::fixed_vector(3, 32) && MUI.isDivergent(Reg);
196 case DivV4S16:
197 return MRI.getType(Reg) == LLT::fixed_vector(4, 16) && MUI.isDivergent(Reg);
198 case DivV6S32:
199 return MRI.getType(Reg) == LLT::fixed_vector(6, 32) && MUI.isDivergent(Reg);
200 case DivB32:
201 return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
202 case DivB64:
203 return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergent(Reg);
204 case DivB96:
205 return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergent(Reg);
206 case DivB128:
207 return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergent(Reg);
208 case DivB160:
209 return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergent(Reg);
210 case DivB256:
211 return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg);
212 case DivB512:
213 return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg);
214 case DivBRC: {
215 if (!MUI.isDivergent(Reg))
216 return false;
217 // Check if there is VGPR register class of same size as the LLT.
218 const SIRegisterInfo *TRI =
219 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
220 return TRI->getSGPRClassForBitWidth(MRI.getType(Reg).getSizeInBits());
221 }
222 case BRC: {
223 // Check if there is SGPR and VGPR register class of same size as the LLT.
224 const SIRegisterInfo *TRI =
225 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
226 unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
227 return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize) &&
228 TRI->getVGPRClassForBitWidth(LLTSize);
229 }
230 case _:
231 return true;
232 default:
233 llvm_unreachable("missing matchUniformityAndLLT");
234 }
235}
236
238 const MachineUniformityInfo &MUI,
239 const MachineRegisterInfo &MRI) const {
240 // Check LLT signature.
241 for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) {
242 const MachineOperand &MO = MI.getOperand(i);
243 if (OpUniformityAndTypes[i] == _) {
244 assert((!MI.getOperand(i).isReg() ||
245 !MI.getOperand(i).getReg().isVirtual()) &&
246 "_ is for non-register and physical register operands only");
247 continue;
248 }
249
250 // Remaining IDs check registers.
251 if (!MO.isReg())
252 return false;
253
254 if (!matchUniformityAndLLT(MO.getReg(), OpUniformityAndTypes[i], MUI, MRI))
255 return false;
256 }
257
258 // More complex check.
259 if (TestFunc)
260 return TestFunc(MI);
261
262 return true;
263}
264
266
268 : FastTypes(FastTypes) {}
269
271 if (Ty == LLT::scalar(16))
272 return S16;
273 if (Ty == LLT::scalar(32))
274 return S32;
275 if (Ty == LLT::scalar(64))
276 return S64;
277 if (Ty == LLT::fixed_vector(2, 16))
278 return V2S16;
279 if (Ty == LLT::fixed_vector(2, 32))
280 return V2S32;
281 if (Ty == LLT::fixed_vector(3, 32))
282 return V3S32;
283 if (Ty == LLT::fixed_vector(4, 32))
284 return V4S32;
285 return _;
286}
287
289 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
290 isAnyPtr(Ty, 32))
291 return B32;
292 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
293 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
294 return B64;
295 if (Ty == LLT::fixed_vector(3, 32))
296 return B96;
297 if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) ||
298 Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128))
299 return B128;
300 return _;
301}
302
303const RegBankLLTMapping *
305 const MachineRegisterInfo &MRI,
306 const MachineUniformityInfo &MUI) const {
307 // Search in "Fast Rules".
308 // Note: if fast rules are enabled, RegBankLLTMapping must be added in each
309 // slot that could "match fast Predicate". If not, InvalidMapping is
310 // returned which results in failure, does not search "Slow Rules".
311 if (FastTypes != NoFastRules) {
312 Register Reg = MI.getOperand(0).getReg();
313 int Slot;
314 if (FastTypes == StandardB)
315 Slot = getFastPredicateSlot(LLTToBId(MRI.getType(Reg)));
316 else
317 Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));
318
319 if (Slot != -1)
320 return MUI.isUniform(Reg) ? &Uni[Slot] : &Div[Slot];
321 }
322
323 // Slow search for more complex rules.
324 for (const RegBankLegalizeRule &Rule : Rules) {
325 if (Rule.Predicate.match(MI, MUI, MRI))
326 return &Rule.OperandMapping;
327 }
328
329 return nullptr;
330}
331
333 Rules.push_back(Rule);
334}
335
337 RegBankLLTMapping RuleApplyIDs) {
338 int Slot = getFastPredicateSlot(Ty);
339 assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
340 Div[Slot] = std::move(RuleApplyIDs);
341}
342
344 RegBankLLTMapping RuleApplyIDs) {
345 int Slot = getFastPredicateSlot(Ty);
346 assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
347 Uni[Slot] = std::move(RuleApplyIDs);
348}
349
350int SetOfRulesForOpcode::getFastPredicateSlot(
352 switch (FastTypes) {
353 case Standard: {
354 switch (Ty) {
355 case S32:
356 return 0;
357 case S16:
358 return 1;
359 case S64:
360 return 2;
361 case V2S16:
362 return 3;
363 default:
364 return -1;
365 }
366 }
367 case StandardB: {
368 switch (Ty) {
369 case B32:
370 return 0;
371 case B64:
372 return 1;
373 case B96:
374 return 2;
375 case B128:
376 return 3;
377 default:
378 return -1;
379 }
380 }
381 case Vector: {
382 switch (Ty) {
383 case S32:
384 return 0;
385 case V2S32:
386 return 1;
387 case V3S32:
388 return 2;
389 case V4S32:
390 return 3;
391 default:
392 return -1;
393 }
394 }
395 default:
396 return -1;
397 }
398}
399
// Begin a rule set shared by the generic opcodes in OpcList; the returned
// RuleSetInitializer records into GRulesAlias/GRules via builder calls
// (.Uni/.Div/.Any).
RegBankLegalizeRules::RuleSetInitializer
RegBankLegalizeRules::addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
                                       FastRulesTypes FastTypes) {
  return RuleSetInitializer(OpcList, GRulesAlias, GRules, FastTypes);
}
405
// Begin a rule set shared by the intrinsic IDs in OpcList; the returned
// RuleSetInitializer records into IRulesAlias/IRules via builder calls.
RegBankLegalizeRules::RuleSetInitializer
RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
                                       FastRulesTypes FastTypes) {
  return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes);
}
411
414 unsigned Opc = MI.getOpcode();
415 if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT ||
416 Opc == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS ||
417 Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
418 unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
419 auto IRAIt = IRulesAlias.find(IntrID);
420 if (IRAIt == IRulesAlias.end())
421 return nullptr;
422 return &IRules.at(IRAIt->second);
423 }
424
425 auto GRAIt = GRulesAlias.find(Opc);
426 if (GRAIt == GRulesAlias.end())
427 return nullptr;
428 return &GRules.at(GRAIt->second);
429}
430
// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'.
class Predicate {
private:
  struct Elt {
    // Save formula composed of Pred, '&&', '||' and '!' as a jump table.
    // Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
    // Sequences of && and || will be represented by jumps, for example:
    // (A && B && ... X) or (A && B && ... X) || Y
    // A == true jump to B
    // A == false jump to end or Y, result is A(false) or Y
    // (A || B || ... X) or (A || B || ... X) && Y
    // A == true jump to end or Y, result is A(true) or Y
    // A == false jump to B
    // Notice that when negating expression, we simply flip Neg on each Pred
    // and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
    std::function<bool(const MachineInstr &)> Pred;
    bool Neg; // Neg of Pred is calculated before jump
    unsigned TJumpOffset; // Relative jump when (possibly negated) Pred is true.
    unsigned FJumpOffset; // Relative jump when it is false.
  };

  // Jump table; index Expression.size() is the virtual "end" slot.
  SmallVector<Elt, 8> Expression;

  // Private: adopt an already-built jump table (used by !, && and ||).
  Predicate(SmallVectorImpl<Elt> &&Expr) { Expression.swap(Expr); };

public:
  // Wrap a single predicate; both jumps target the end slot.
  Predicate(std::function<bool(const MachineInstr &)> Pred) {
    Expression.push_back({Pred, false, 1, 1});
  };

  // Evaluate the jump table on MI; the last predicate executed before
  // reaching the end slot provides the overall result.
  bool operator()(const MachineInstr &MI) const {
    unsigned Idx = 0;
    unsigned ResultIdx = Expression.size();
    bool Result;
    do {
      Result = Expression[Idx].Pred(MI);
      Result = Expression[Idx].Neg ? !Result : Result;
      if (Result) {
        Idx += Expression[Idx].TJumpOffset;
      } else {
        Idx += Expression[Idx].FJumpOffset;
      }
    } while ((Idx != ResultIdx));

    return Result;
  };

  // Logical NOT: flip each element's negation flag and swap its jump targets
  // (De Morgan applied to the jump encoding).
  Predicate operator!() const {
    SmallVector<Elt, 8> NegExpression;
    for (const Elt &ExprElt : Expression) {
      NegExpression.push_back({ExprElt.Pred, !ExprElt.Neg, ExprElt.FJumpOffset,
                               ExprElt.TJumpOffset});
    }
    return Predicate(std::move(NegExpression));
  };

  // Logical AND: redirect every LHS false-jump that reached the end so it
  // skips over the appended RHS (LHS false short-circuits the whole AND).
  Predicate operator&&(const Predicate &RHS) const {
    SmallVector<Elt, 8> AndExpression = Expression;

    unsigned RHSSize = RHS.Expression.size();
    unsigned ResultIdx = Expression.size();
    for (unsigned i = 0; i < ResultIdx; ++i) {
      // LHS results in false, whole expression results in false.
      if (i + AndExpression[i].FJumpOffset == ResultIdx)
        AndExpression[i].FJumpOffset += RHSSize;
    }

    AndExpression.append(RHS.Expression);

    return Predicate(std::move(AndExpression));
  }

  // Logical OR: redirect every LHS true-jump that reached the end so it
  // skips over the appended RHS (LHS true short-circuits the whole OR).
  Predicate operator||(const Predicate &RHS) const {
    SmallVector<Elt, 8> OrExpression = Expression;

    unsigned RHSSize = RHS.Expression.size();
    unsigned ResultIdx = Expression.size();
    for (unsigned i = 0; i < ResultIdx; ++i) {
      // LHS results in true, whole expression results in true.
      if (i + OrExpression[i].TJumpOffset == ResultIdx)
        OrExpression[i].TJumpOffset += RHSSize;
    }

    OrExpression.append(RHS.Expression);

    return Predicate(std::move(OrExpression));
  }
};
519
520// Initialize rules
523 : ST(&_ST), MRI(&_MRI) {
524
525 addRulesForGOpcs({G_ADD, G_SUB}, Standard)
526 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
527 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
528 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
529 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
531 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
532 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
533 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
534
535 addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
536 .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
537 .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});
538
539 addRulesForGOpcs({G_UADDE, G_USUBE, G_SADDE, G_SSUBE}, Standard)
541 .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
542
543 addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard)
544 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
545 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
546 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
547 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
549 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
550
551 bool HasVecMulU64 = ST->hasVectorMulU64();
552 addRulesForGOpcs({G_MUL}, Standard)
553 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
554 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
555 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
556 .Uni(S64, {{SgprB64}, {SgprB64, SgprB64}})
558 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
559 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
560 .Div(S64, {{VgprB64}, {VgprB64, VgprB64}}, HasVecMulU64)
561 .Div(S64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32Mul}, !HasVecMulU64);
562
563 bool hasMulHi = ST->hasScalarMulHiInsts();
564 addRulesForGOpcs({G_UMULH, G_SMULH}, Standard)
565 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
566 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasMulHi)
567 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasMulHi);
568
569 addRulesForGOpcs({G_AMDGPU_MAD_U64_U32}, Standard)
570 .Div(S64, {{Vgpr64, Vcc}, {Vgpr32, Vgpr32, Vgpr64}})
572
573 bool HasScalarSMulU64 = ST->hasScalarSMulU64();
574 addRulesForGOpcs({G_AMDGPU_S_MUL_U64_U32, G_AMDGPU_S_MUL_I64_I32}, Standard)
575 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}, UniMul64}, HasScalarSMulU64)
576 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, DivSMulToMAD});
577
578 addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
580 .Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
581 .Any({{UniS16}, {{Sgpr16}, {Sgpr16, Sgpr16}}})
582 .Any({{DivS16}, {{Vgpr16}, {Vgpr16, Vgpr16}}})
583 .Uni(B32, {{SgprB32}, {SgprB32, SgprB32}})
584 .Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
585 .Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
586 .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
587
588 addRulesForGOpcs({G_SHL}, Standard)
589 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
590 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
592 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
593 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
594 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
595 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
596 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
597
598 addRulesForGOpcs({G_LSHR}, Standard)
599 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
600 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
602 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
603 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
604 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
605 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
606 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
607
608 addRulesForGOpcs({G_ASHR}, Standard)
609 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
610 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
612 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
613 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
614 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
615 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
616 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
617
618 addRulesForGOpcs({G_FSHR}, Standard)
619 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
620 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
621
622 addRulesForGOpcs({G_BSWAP}, Standard)
623 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
624 .Div(S16, {{Vgpr16}, {Vgpr16}})
625 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
626 .Div(S32, {{Vgpr32}, {Vgpr32}})
627 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
628 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}});
629
630 addRulesForGOpcs({G_AMDGPU_CVT_F32_UBYTE0, G_AMDGPU_CVT_F32_UBYTE1,
631 G_AMDGPU_CVT_F32_UBYTE2, G_AMDGPU_CVT_F32_UBYTE3,
632 G_AMDGPU_RCP_IFLAG},
633 Standard)
634 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
635 .Div(S32, {{Vgpr32}, {Vgpr32}});
636
637 addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
638
639 addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
640 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, S_BFE})
641 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
642 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE})
643 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE});
644
645 addRulesForGOpcs({G_SMIN, G_SMAX}, Standard)
646 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}})
647 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
648 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
649 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
651 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
652
653 addRulesForGOpcs({G_UMIN, G_UMAX}, Standard)
654 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
655 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
656 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
657 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
659 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
660
661 addRulesForGOpcs({G_IMPLICIT_DEF})
662 .Any({{UniS1}, {{Sgpr32Trunc}, {}}})
663 .Any({{UniS16}, {{Sgpr16}, {}}})
664 .Any({{UniBRC}, {{SgprBRC}, {}}});
665
666 addRulesForGOpcs({G_CONSTANT}, Standard)
667 .Any({{UniS1, _}, {{Sgpr32Trunc}, {}, UniCstExt}})
668 .Uni(S16, {{Sgpr16}, {}})
669 .Uni(S32, {{Sgpr32}, {}})
670 .Uni(S64, {{Sgpr64}, {}})
671 .Any({{UniPtr32, _}, {{SgprPtr32}, {}}})
672 .Any({{UniPtr64, _}, {{SgprPtr64}, {}}});
673
674 addRulesForGOpcs({G_FCONSTANT}, Standard)
675 .Uni(S16, {{Sgpr16}, {}})
676 .Uni(S32, {{Sgpr32}, {}})
677 .Uni(S64, {{Sgpr64}, {}});
678
679 addRulesForGOpcs({G_FREEZE})
680 .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt}}})
681 .Any({{DivS1}, {{Vcc}, {Vcc}}})
682 .Any({{UniS16}, {{Sgpr16}, {Sgpr16}}})
683 .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
684 .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});
685
686 addRulesForGOpcs({G_BITCAST})
687 .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
688 .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});
689
690 addRulesForGOpcs({G_UNMERGE_VALUES})
691 .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}})
692 .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
693 .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
694
695 addRulesForGOpcs({G_BUILD_VECTOR})
696 .Any({{UniBRC, S16}, {{}, {}, VerifyAllSgpr}})
697 .Any({{UniBRC, BRC}, {{}, {}, VerifyAllSgpr}})
698 .Any({{DivBRC, S16}, {{}, {}, ApplyAllVgpr}})
699 .Any({{DivBRC, BRC}, {{}, {}, ApplyAllVgpr}});
700
701 addRulesForGOpcs({G_MERGE_VALUES, G_CONCAT_VECTORS})
702 .Any({{UniBRC, BRC}, {{}, {}, VerifyAllSgpr}})
703 .Any({{DivBRC, BRC}, {{}, {}, ApplyAllVgpr}});
704
705 addRulesForGOpcs({G_PHI})
706 .Any({{UniS1}, {{}, {}, AextToS32InIncomingBlockGPHI}})
707 .Any({{UniS16}, {{}, {}, VerifyAllSgprGPHI}})
708 .Any({{UniBRC}, {{}, {}, VerifyAllSgprGPHI}})
709 .Any({{DivBRC}, {{}, {}, VerifyAllSgprOrVgprGPHI}});
710
711 addRulesForGOpcs({G_EXTRACT_VECTOR_ELT})
712 .Any({{UniB32, UniBRC, UniS32}, {{SgprB32}, {SgprBRC, Sgpr32}}})
713 .Any({{DivB32, DivBRC, UniS32}, {{VgprB32}, {VgprBRC, Sgpr32}}})
714 .Any({{DivB32, BRC, DivS32},
716 .Any({{UniB64, UniBRC, UniS32}, {{SgprB64}, {SgprBRC, Sgpr32}}})
717 .Any({{DivB64, DivBRC, UniS32},
719 .Any({{DivB64, BRC, DivS32},
721
722 addRulesForGOpcs({G_INSERT_VECTOR_ELT})
724 {{SgprBRC}, {SgprBRC, SgprB32, Sgpr32}}})
725 .Any(
726 {{DivBRC, BRC, B32, UniS32}, {{VgprBRC}, {VgprBRC, VgprB32, Sgpr32}}})
727 .Any({{DivBRC, BRC, B32, DivS32},
731 .Any({{DivBRC, BRC, B64, UniS32},
733 .Any({{DivBRC, BRC, B64, DivS32},
735
736 // INTERSECT_RAY {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
737 // INTERSECT_RAY {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
738 addRulesForGOpcs({G_AMDGPU_BVH_INTERSECT_RAY, G_AMDGPU_BVH_DUAL_INTERSECT_RAY,
739 G_AMDGPU_BVH8_INTERSECT_RAY})
740 .Any({{}, {{}, {}, ApplyBVH_INTERSECT_RAY}});
741
742 // LOAD {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
743 // LOAD {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
744 // LOAD_NORET {}, {{}, {Imm, VgprSrc, ..., Sgpr_WF_RsrcIdx}}
745 // STORE {}, {{}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
746 addRulesForGOpcs({G_AMDGPU_INTRIN_IMAGE_LOAD, G_AMDGPU_INTRIN_IMAGE_LOAD_D16,
747 G_AMDGPU_INTRIN_IMAGE_LOAD_NORET,
748 G_AMDGPU_INTRIN_IMAGE_STORE,
749 G_AMDGPU_INTRIN_IMAGE_STORE_D16})
750 .Any({{}, {{}, {}, ApplyINTRIN_IMAGE}});
751
752 Predicate isSignedICmp([](const MachineInstr &MI) -> bool {
753 auto Pred =
754 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
755 return CmpInst::isSigned(Pred);
756 });
757
758 Predicate isEqualityICmp([](const MachineInstr &MI) -> bool {
759 auto Pred =
760 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
761 return ICmpInst::isEquality(Pred);
762 });
763
764 bool HasScalarCompareEq64 = ST->hasScalarCompareEq64();
765 // clang-format off
766 addRulesForGOpcs({G_ICMP})
767 .Any({{{UniS1, _, S16}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
768 .Any({{{UniS1, _, S16}, !isEqualityICmp && isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32SExt, Sgpr32SExt}}})
769 .Any({{{UniS1, _, S16}, !isEqualityICmp && !isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
770 .Any({{{DivS1, _, S16}}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
771 .Any({{{UniS1, _, S32}}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
772 .Any({{{DivS1, _, S32}}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
773 .Any({{{UniS1, _, S64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr64, Sgpr64}}}, HasScalarCompareEq64)
774 .Any({{{UniS1, _, S64}, isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}, !HasScalarCompareEq64)
775 .Any({{{UniS1, _, S64}, !isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
776 .Any({{{DivS1, _, S64}}, {{Vcc}, {None, Vgpr64, Vgpr64}}})
777 .Any({{{UniS1, _, Ptr32}}, {{Sgpr32Trunc}, {None, SgprPtr32, SgprPtr32}}})
778 .Any({{{DivS1, _, Ptr32}}, {{Vcc}, {None, VgprPtr32, VgprPtr32}}})
779 .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, SgprPtr64, SgprPtr64}}}, HasScalarCompareEq64)
780 .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}, !HasScalarCompareEq64)
781 .Any({{{UniS1, _, Ptr64}, !isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}})
782 .Any({{{DivS1, _, Ptr64}}, {{Vcc}, {None, VgprPtr64, VgprPtr64}}});
783 // clang-format on
784
785 addRulesForGOpcs({G_BRCOND})
786 .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}})
787 .Any({{DivS1}, {{}, {Vcc}}});
788
789 addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}});
790
791 addRulesForGOpcs({G_SELECT}, StandardB)
792 .Any({{DivS16}, {{Vgpr16}, {Vcc, Vgpr16, Vgpr16}}})
794 .Div(B32, {{VgprB32}, {Vcc, VgprB32, VgprB32}})
798
799 addRulesForGOpcs({G_ANYEXT})
800 .Any({{UniS16, S1}, {{None}, {None}}}) // should be combined away
801 .Any({{UniS32, S1}, {{None}, {None}}}) // should be combined away
802 .Any({{UniS64, S1}, {{None}, {None}}}) // should be combined away
803 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
804 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
805 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
806 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
807 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
808 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
809 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
810
811 bool Has16bitCmp = ST->has16BitInsts();
812
813 // In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY.
814 // It is up to user to deal with truncated bits.
815 // S1, S16, S32 and S64 results are handled with specific rules. Remaining
816 // (result, source) pairs with valid register classes are covered by the
817 // generic UniBRC/DivBRC wildcard rules.
818 addRulesForGOpcs({G_TRUNC})
819 .Any({{UniS1, UniS16}, {{None}, {None}}}) // should be combined away
820 .Any({{UniS1, UniS32}, {{None}, {None}}}) // should be combined away
821 .Any({{UniS1, UniS64}, {{None}, {None}}}) // should be combined away
822 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}})
823 .Any({{UniBRC, UniBRC}, {{SgprBRC}, {SgprBRC}}})
824 .Any({{DivBRC, DivBRC}, {{VgprBRC}, {VgprBRC}}})
825 .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
826 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
827 // This is non-trivial. VgprToVccCopy is done using compare instruction.
828 .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}, Has16bitCmp)
830 !Has16bitCmp)
831 .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
832 .Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}});
833
834 addRulesForGOpcs({G_ZEXT})
838 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
839 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
840 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
841 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
842 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
843 // not extending S16 to S32 is questionable.
844 .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32ZExt}, Ext32To64}})
845 .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32ZExt}, Ext32To64}})
846 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
847 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
848
849 addRulesForGOpcs({G_SEXT})
853 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
854 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
855 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
856 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
857 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
858 // not extending S16 to S32 is questionable.
859 .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32SExt}, Ext32To64}})
860 .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32SExt}, Ext32To64}})
861 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
862 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
863
864 addRulesForGOpcs({G_SEXT_INREG})
865 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
866 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
867 .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
869
870 addRulesForGOpcs({G_ASSERT_ZEXT, G_ASSERT_SEXT}, Standard)
871 .Uni(S32, {{Sgpr32}, {Sgpr32, Imm}})
872 .Div(S32, {{Vgpr32}, {Vgpr32, Imm}})
873 .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}})
874 .Div(S64, {{Vgpr64}, {Vgpr64, Imm}});
875
876 addRulesForGOpcs({G_ASSERT_ALIGN}, Standard)
877 .Uni(S32, {{Sgpr32}, {Sgpr32}})
878 .Div(S32, {{Vgpr32}, {Vgpr32}})
879 .Uni(S64, {{Sgpr64}, {Sgpr64}})
880 .Div(S64, {{Vgpr64}, {Vgpr64}})
881 .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32}}})
882 .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32}}})
883 .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64}}})
884 .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64}}});
885
886 // Atomic read-modify-write operations: result and value are always VGPR,
887 // pointer varies by address space.
888 addRulesForGOpcs({G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_XCHG,
889 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
890 G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN,
891 G_ATOMICRMW_UMAX, G_ATOMICRMW_UINC_WRAP,
892 G_ATOMICRMW_UDEC_WRAP, G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
893 .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
894 .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
895 .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
896 .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
897 .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
898 .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}});
899
900 bool HasAtomicFlatPkAdd16Insts = ST->hasAtomicFlatPkAdd16Insts();
901 bool HasAtomicBufferGlobalPkAddF16Insts =
902 ST->hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
903 ST->hasAtomicBufferGlobalPkAddF16Insts();
904 bool HasAtomicDsPkAdd16Insts = ST->hasAtomicDsPkAdd16Insts();
905 addRulesForGOpcs({G_ATOMICRMW_FADD})
906 .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
907 .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
908 .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
909 .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
910 .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
911 .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}})
912 .Any({{DivV2S16, P0, V2S16}, {{VgprV2S16}, {VgprP0, VgprV2S16}}},
913 HasAtomicFlatPkAdd16Insts)
914 .Any({{DivV2S16, P1, V2S16}, {{VgprV2S16}, {VgprP1, VgprV2S16}}},
915 HasAtomicBufferGlobalPkAddF16Insts)
916 .Any({{DivV2S16, P3, V2S16}, {{VgprV2S16}, {VgprP3, VgprV2S16}}},
917 HasAtomicDsPkAdd16Insts);
918
919 addRulesForGOpcs({G_ATOMIC_CMPXCHG})
920 .Any({{DivS32, P2}, {{Vgpr32}, {VgprP2, Vgpr32, Vgpr32}}})
921 .Any({{DivS64, P2}, {{Vgpr64}, {VgprP2, Vgpr64, Vgpr64}}})
922 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32, Vgpr32}}})
923 .Any({{DivS64, P3}, {{Vgpr64}, {VgprP3, Vgpr64, Vgpr64}}});
924
925 addRulesForGOpcs({G_AMDGPU_ATOMIC_CMPXCHG})
926 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, VgprV2S32}}})
927 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, VgprV2S32}}})
928 .Any({{DivS64, P0}, {{Vgpr64}, {VgprP0, VgprV2S64}}})
929 .Any({{DivS64, P1}, {{Vgpr64}, {VgprP1, VgprV2S64}}});
930
931 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_CMPSWAP}, Standard)
932 .Div(S32, {{Vgpr32},
934 .Div(S64, {{Vgpr64},
936
937 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_ADD, G_AMDGPU_BUFFER_ATOMIC_AND,
938 G_AMDGPU_BUFFER_ATOMIC_DEC, G_AMDGPU_BUFFER_ATOMIC_FMAX,
939 G_AMDGPU_BUFFER_ATOMIC_FMIN, G_AMDGPU_BUFFER_ATOMIC_INC,
940 G_AMDGPU_BUFFER_ATOMIC_OR, G_AMDGPU_BUFFER_ATOMIC_SMAX,
941 G_AMDGPU_BUFFER_ATOMIC_SMIN, G_AMDGPU_BUFFER_ATOMIC_SUB,
942 G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX,
943 G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_XOR},
944 Standard)
947
948 bool hasSMRDx3 = ST->hasScalarDwordx3Loads();
949 bool hasSMRDSmall = ST->hasScalarSubwordLoads();
950 bool usesTrue16 = ST->useRealTrue16Insts();
951
952 Predicate isAlign16([](const MachineInstr &MI) -> bool {
953 return (*MI.memoperands_begin())->getAlign() >= Align(16);
954 });
955
956 Predicate isAlign4([](const MachineInstr &MI) -> bool {
957 return (*MI.memoperands_begin())->getAlign() >= Align(4);
958 });
959
960 Predicate isAtomicMMO([](const MachineInstr &MI) -> bool {
961 return (*MI.memoperands_begin())->isAtomic();
962 });
963
964 Predicate isUniMMO([](const MachineInstr &MI) -> bool {
965 return AMDGPU::isUniformMMO(*MI.memoperands_begin());
966 });
967
968 Predicate isConst([](const MachineInstr &MI) -> bool {
969 // Address space in MMO be different then address space on pointer.
970 const MachineMemOperand *MMO = *MI.memoperands_begin();
971 const unsigned AS = MMO->getAddrSpace();
972 return AS == AMDGPUAS::CONSTANT_ADDRESS ||
974 });
975
976 Predicate isVolatileMMO([](const MachineInstr &MI) -> bool {
977 return (*MI.memoperands_begin())->isVolatile();
978 });
979
980 Predicate isInvMMO([](const MachineInstr &MI) -> bool {
981 return (*MI.memoperands_begin())->isInvariant();
982 });
983
984 Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool {
985 return (*MI.memoperands_begin())->getFlags() & MONoClobber;
986 });
987
988 Predicate isNaturalAligned([](const MachineInstr &MI) -> bool {
989 const MachineMemOperand *MMO = *MI.memoperands_begin();
990 return MMO->getAlign() >= Align(MMO->getSize().getValue());
991 });
992
993 Predicate is8Or16BitMMO([](const MachineInstr &MI) -> bool {
994 const MachineMemOperand *MMO = *MI.memoperands_begin();
995 const unsigned MemSize = 8 * MMO->getSize().getValue();
996 return MemSize == 16 || MemSize == 8;
997 });
998
999 Predicate is32BitMMO([](const MachineInstr &MI) -> bool {
1000 const MachineMemOperand *MMO = *MI.memoperands_begin();
1001 return 8 * MMO->getSize().getValue() == 32;
1002 });
1003
1004 auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
1005 (isConst || isInvMMO || isNoClobberMMO);
1006
1007 // clang-format off
1008 // TODO: S32Dst, 16-bit any-extending load should not appear on True16 targets
1009 addRulesForGOpcs({G_LOAD})
1010 // flat, addrspace(0), never uniform - flat_load
1011 .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
1012 .Any({{DivB32, P0}, {{VgprB32}, {VgprP0}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1013 .Any({{DivB64, P0}, {{VgprB64}, {VgprP0}}})
1014 .Any({{DivB96, P0}, {{VgprB96}, {VgprP0}}})
1015 .Any({{DivB128, P0}, {{VgprB128}, {VgprP0}}})
1016
1017 // global, addrspace(1)
1018 // divergent - global_load
1019 .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
1020 .Any({{DivB32, P1}, {{VgprB32}, {VgprP1}}}) //32-bit load, 8-bit and 16-bit any-extending load
1021 .Any({{DivB64, P1}, {{VgprB64}, {VgprP1}}})
1022 .Any({{DivB96, P1}, {{VgprB96}, {VgprP1}}})
1023 .Any({{DivB128, P1}, {{VgprB128}, {VgprP1}}})
1024 .Any({{DivB256, P1}, {{VgprB256}, {VgprP1}, SplitLoad}})
1025 .Any({{DivB512, P1}, {{VgprB512}, {VgprP1}, SplitLoad}})
1026
1027 // uniform - s_load
1028 .Any({{{UniS16, P1}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
1029 .Any({{{UniS16, P1}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP1}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
1030 .Any({{{UniB32, P1}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1031 // TODO: SplitLoad when !isNaturalAligned && isUL and target hasSMRDSmall
1032 .Any({{{UniB32, P1}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
1033 .Any({{{UniB32, P1}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}}}) //32-bit load
1034 .Any({{{UniB64, P1}, isAlign4 && isUL}, {{SgprB64}, {SgprP1}}})
1035 .Any({{{UniB96, P1}, isAlign16 && isUL}, {{SgprB96}, {SgprP1}, WidenLoad}}, !hasSMRDx3)
1036 .Any({{{UniB96, P1}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP1}, SplitLoad}}, !hasSMRDx3)
1037 .Any({{{UniB96, P1}, isAlign4 && isUL}, {{SgprB96}, {SgprP1}}}, hasSMRDx3)
1038 .Any({{{UniB128, P1}, isAlign4 && isUL}, {{SgprB128}, {SgprP1}}})
1039 .Any({{{UniB256, P1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
1040 .Any({{{UniB512, P1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
1041
1042 // Uniform via global or buffer load, for example volatile or non-aligned
1043 // uniform load. Not using standard {{UniInVgprTy}, {VgprP1}} since it is
1044 // selected as global_load, use SgprP1 for pointer instead to match
1045 // patterns without flat-for-global, default for GFX7 and older.
1046 // -> +flat-for-global + {{UniInVgprTy}, {SgprP1}} - global_load
1047 // -> -flat-for-global + {{UniInVgprTy}, {SgprP1}} - buffer_load
1048 .Any({{{UniS16, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
1049 .Any({{{UniS16, P1}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && !hasSMRDSmall) // s16 load
1050 .Any({{{UniB32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1051 .Any({{{UniB32, P1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1052 .Any({{{UniB64, P1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
1053 .Any({{{UniB96, P1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
1054 .Any({{{UniB128, P1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
1055 .Any({{{UniB256, P1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP1}, SplitLoad}})
1056 .Any({{{UniB512, P1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP1}, SplitLoad}})
1057
1058 // local, addrspace(3) - ds_load
1059 .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
1060 .Any({{DivB32, P3}, {{VgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1061 .Any({{DivB64, P3}, {{VgprB64}, {VgprP3}}})
1062 .Any({{DivB96, P3}, {{VgprB96}, {VgprP3}}})
1063 .Any({{DivB128, P3}, {{VgprB128}, {VgprP3}}})
1064
1065 .Any({{UniS16, P3}, {{UniInVgprS16}, {SgprP3}}}, usesTrue16) // 16-bit load
1066 .Any({{UniB32, P3}, {{UniInVgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1067 .Any({{UniB64, P3}, {{UniInVgprB64}, {VgprP3}}})
1068 .Any({{UniB96, P3}, {{UniInVgprB96}, {VgprP3}}})
1069 .Any({{UniB128, P3}, {{UniInVgprB128}, {VgprP3}}})
1070
1071 // constant, addrspace(4)
1072 // divergent - global_load
1073 .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
1074 .Any({{DivB32, P4}, {{VgprB32}, {VgprP4}}}) //32-bit load, 8-bit and 16-bit any-extending load
1075 .Any({{DivB64, P4}, {{VgprB64}, {VgprP4}}})
1076 .Any({{DivB96, P4}, {{VgprB96}, {VgprP4}}})
1077 .Any({{DivB128, P4}, {{VgprB128}, {VgprP4}}})
1078 .Any({{DivB256, P4}, {{VgprB256}, {VgprP4}, SplitLoad}})
1079 .Any({{DivB512, P4}, {{VgprB512}, {VgprP4}, SplitLoad}})
1080
1081 // uniform - s_load
1082 .Any({{{UniS16, P4}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
1083 .Any({{{UniS16, P4}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP4}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
1084 .Any({{{UniB32, P4}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1085 .Any({{{UniB32, P4}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
1086 .Any({{{UniB32, P4}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}}}) //32-bit load
1087 .Any({{{UniB64, P4}, isAlign4 && isUL}, {{SgprB64}, {SgprP4}}})
1088 .Any({{{UniB96, P4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasSMRDx3)
1089 .Any({{{UniB96, P4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasSMRDx3)
1090 .Any({{{UniB96, P4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasSMRDx3)
1091 .Any({{{UniB128, P4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
1092 .Any({{{UniB256, P4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
1093 .Any({{{UniB512, P4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
1094
1095 // uniform in vgpr - global_load or buffer_load
1096 .Any({{{UniS16, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
1097 .Any({{{UniS16, P4}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && !hasSMRDSmall) // s16 load
1098 .Any({{{UniB32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1099 .Any({{{UniB32, P4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP4}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1100 .Any({{{UniB64, P4}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP4}}})
1101 .Any({{{UniB96, P4}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP4}}})
1102 .Any({{{UniB128, P4}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP4}}})
1103 .Any({{{UniB256, P4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP4}, SplitLoad}})
1104 .Any({{{UniB512, P4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP4}, SplitLoad}})
1105
1106 // private, addrspace(5), never uniform - scratch_load
1107 .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16)
1108 .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1109 .Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}})
1110 .Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}})
1111 .Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}})
1112
1113 .Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}});
1114
1115
1116 addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zeroextending loads
1117 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}})
1118
1119 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}})
1120 .Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall)
1121 .Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall)
1122 .Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall)
1123 .Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall)
1124
1125 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}})
1126 .Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}})
1127
1128 .Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}})
1129 .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall)
1130 .Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall)
1131 .Any({{{UniS32, P4}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall)
1132 .Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall)
1133
1134 .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}});
1135
1136 addRulesForGOpcs({G_STORE})
1137 // addrspace(0)
1138 .Any({{S16, P0}, {{}, {Vgpr16, VgprP0}}}, usesTrue16) // 16-bit store
1139 .Any({{B32, P0}, {{}, {VgprB32, VgprP0}}}) // 32-bit store, 8-bit and 16-bit truncating store
1140 .Any({{B64, P0}, {{}, {VgprB64, VgprP0}}})
1141 .Any({{B96, P0}, {{}, {VgprB96, VgprP0}}})
1142 .Any({{B128, P0}, {{}, {VgprB128, VgprP0}}})
1143
1144 // addrspace(1), there are no stores to addrspace(4)
1145 // For targets:
1146 // - with "+flat-for-global" - global_store
1147 // - without(-flat-for-global) - buffer_store addr64
1148 .Any({{S16, DivP1}, {{}, {Vgpr16, VgprP1}}}, usesTrue16) // 16-bit store
1149 .Any({{B32, DivP1}, {{}, {VgprB32, VgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
1150 .Any({{B64, DivP1}, {{}, {VgprB64, VgprP1}}})
1151 .Any({{B96, DivP1}, {{}, {VgprB96, VgprP1}}})
1152 .Any({{B128, DivP1}, {{}, {VgprB128, VgprP1}}})
1153
1154 // For UniP1, use sgpr ptr to match flat-for-global patterns. Targets:
1155 // - with "+flat-for-global" - global_store for both sgpr and vgpr ptr
1156 // - without(-flat-for-global) - need sgpr ptr to select buffer_store
1157 .Any({{S16, UniP1}, {{}, {Vgpr16, SgprP1}}}, usesTrue16) // 16-bit store
1158 .Any({{B32, UniP1}, {{}, {VgprB32, SgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
1159 .Any({{B64, UniP1}, {{}, {VgprB64, SgprP1}}})
1160 .Any({{B96, UniP1}, {{}, {VgprB96, SgprP1}}})
1161 .Any({{B128, UniP1}, {{}, {VgprB128, SgprP1}}})
1162
1163 // addrspace(3) and addrspace(5)
1164 .Any({{S16, Ptr32}, {{}, {Vgpr16, VgprPtr32}}}, usesTrue16) // 16-bit store
1165 .Any({{B32, Ptr32}, {{}, {VgprB32, VgprPtr32}}}) // 32-bit store, 8-bit and 16-bit truncating store
1166 .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}})
1167 .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}})
1168 .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}});
1169
1170 // clang-format on
1171
1172 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT,
1173 G_AMDGPU_TBUFFER_LOAD_FORMAT},
1174 StandardB)
1183
1184 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_USHORT, G_AMDGPU_BUFFER_LOAD_UBYTE,
1185 G_AMDGPU_BUFFER_LOAD_SSHORT, G_AMDGPU_BUFFER_LOAD_SBYTE},
1186 StandardB)
1189
1190 addRulesForGOpcs(
1191 {G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, G_AMDGPU_BUFFER_LOAD_USHORT_TFE},
1192 StandardB)
1195
1196 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_TFE, G_AMDGPU_BUFFER_LOAD_FORMAT_TFE},
1197 StandardB)
1205 .Any({{UniB160},
1207
1208 addRulesForGOpcs(
1209 {G_AMDGPU_BUFFER_LOAD_FORMAT_D16, G_AMDGPU_TBUFFER_LOAD_FORMAT_D16},
1210 StandardB)
1217
1218 addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE,
1219 G_AMDGPU_BUFFER_STORE_SHORT, G_AMDGPU_BUFFER_STORE_FORMAT,
1220 G_AMDGPU_BUFFER_STORE_FORMAT_D16,
1221 G_AMDGPU_TBUFFER_STORE_FORMAT,
1222 G_AMDGPU_TBUFFER_STORE_FORMAT_D16})
1223 .Any({{B32}, {{}, {VgprB32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1224 .Any({{B64}, {{}, {VgprB64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1225 .Any({{B96}, {{}, {VgprB96, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1226 .Any({{B128}, {{}, {VgprB128, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});
1227
1228 // Buffer atomics: resource descriptor + scalar offset are SGPR, data and
1229 // address components are VGPR.
1230 //
1231 // Operand order (SIInstructions.td BufferAtomicGenericInstruction):
1232 // dst = op vdata, rsrc, vindex, voffset, soffset, offset_imm, cachepolicy,
1233 // idxen_imm
1234 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_FADD})
1235 .Any({{S32, S32, V4S32, S32, S32, S32},
1237 .Any({{S64, S64, V4S32, S32, S32, S32},
1239 .Any({{V2S16, V2S16, V4S32, S32, S32, S32},
1240 {{VgprV2S16},
1242
1243 addRulesForGOpcs({G_PTR_ADD})
1244 .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
1245 .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}})
1246 .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}})
1247 .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}});
1248
1249 addRulesForGOpcs({G_INTTOPTR})
1250 .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
1251 .Any({{DivPtr32}, {{VgprPtr32}, {Vgpr32}}})
1252 .Any({{UniPtr64}, {{SgprPtr64}, {Sgpr64}}})
1253 .Any({{DivPtr64}, {{VgprPtr64}, {Vgpr64}}})
1254 .Any({{UniPtr128}, {{SgprPtr128}, {Sgpr128}}})
1255 .Any({{DivPtr128}, {{VgprPtr128}, {Vgpr128}}});
1256
1257 addRulesForGOpcs({G_PTRTOINT})
1258 .Any({{UniS32}, {{Sgpr32}, {SgprPtr32}}})
1259 .Any({{DivS32}, {{Vgpr32}, {VgprPtr32}}})
1260 .Any({{UniS64}, {{Sgpr64}, {SgprPtr64}}})
1261 .Any({{DivS64}, {{Vgpr64}, {VgprPtr64}}})
1262 .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}})
1263 .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}});
1264
1265 // FIXME: Update llvm/test/CodeGen/AMDGPU/ptrmask.ll to use GlobalISel.
1266 // Currently crashes on P8 (buffer resource) tests due to legalizer issue.
1267 addRulesForGOpcs({G_PTRMASK})
1268 .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
1269 .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
1270 .Any({{UniP3}, {{SgprP3}, {SgprP3, Sgpr32}}})
1271 .Any({{DivP3}, {{VgprP3}, {VgprP3, Vgpr32}}});
1272
1273 addRulesForGOpcs({G_ABS}, Standard)
1274 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}})
1275 .Div(S16, {{Vgpr16}, {Vgpr16}, AbsToNegMax})
1276 .Uni(S32, {{Sgpr32}, {Sgpr32}})
1277 .Div(S32, {{Vgpr32}, {Vgpr32}, AbsToNegMax})
1278 .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, AbsToS32})
1279 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}, AbsToNegMax});
1280
1281 addRulesForGOpcs({G_BITREVERSE}, Standard)
1282 .Uni(S32, {{Sgpr32}, {Sgpr32}})
1283 .Div(S32, {{Vgpr32}, {Vgpr32}})
1284 .Uni(S64, {{Sgpr64}, {Sgpr64}})
1285 .Div(S64, {{Vgpr64}, {Vgpr64}});
1286
1287 addRulesForGOpcs({G_AMDGPU_FFBH_U32, G_AMDGPU_FFBL_B32, G_CTLZ_ZERO_UNDEF,
1288 G_CTTZ_ZERO_UNDEF})
1289 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
1290 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1291 .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
1293
1294 addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});
1295
1296 addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
1297 .Uni(S64, {{Sgpr64}, {}});
1298
1299 addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}});
1300
1301 addRulesForGOpcs({G_GLOBAL_VALUE})
1302 .Any({{UniP0}, {{SgprP0}, {}}})
1303 .Any({{UniP1}, {{SgprP1}, {}}})
1304 .Any({{UniP3}, {{SgprP3}, {}}})
1305 .Any({{UniP4}, {{SgprP4}, {}}})
1306 .Any({{UniP8}, {{SgprP8}, {}}});
1307
1308 addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}});
1309
1310 addRulesForGOpcs({G_SI_CALL})
1311 .Any({{_, UniP0}, {{None}, {SgprP0}}})
1312 .Any({{_, DivP0}, {{None}, {SgprP0Call_WF}}})
1313 .Any({{_, UniP4}, {{None}, {SgprP4}}})
1314 .Any({{_, DivP4}, {{None}, {SgprP4Call_WF}}});
1315
1316 bool hasSALUFloat = ST->hasSALUFloatInsts();
1317
1318 addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard)
1319 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1320 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1321 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1322 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1323 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
1324 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1325 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1326 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1327 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
1329 hasSALUFloat)
1330 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
1331
1332 addRulesForGOpcs({G_FSUB, G_STRICT_FSUB}, Standard)
1333 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1334 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1335 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1336 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1337 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1338 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1339
1340 addRulesForGOpcs({G_FMAD}, Standard)
1341 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
1342 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1343 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1344 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1345
1346 addRulesForGOpcs({G_FLDEXP, G_STRICT_FLDEXP}, Standard)
1347 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1348 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1349 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
1350 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1351 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr32}})
1352 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
1353
1354 addRulesForGOpcs({G_FMA, G_STRICT_FMA}, Standard)
1355 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1356 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
1357 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}})
1358 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}})
1362 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat)
1363 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat)
1364 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat)
1365 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat)
1366 .Uni(V2S16,
1368 hasSALUFloat)
1370 !hasSALUFloat);
1371
1372 addRulesForGOpcs({G_AMDGPU_FMED3}, Standard)
1373 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
1374 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1375 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1376 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1377
1378 // TODO: This opcode is generated from the i64->i16 signed clamped pattern in
1379 // the PreLegalizerCombiner. Move the combine to RegBankCombiner to keep more
1380 // instructions on SALU.
1381 addRulesForGOpcs({G_AMDGPU_SMED3}, Standard)
1382 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1383 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1384
1385 // FNEG and FABS are either folded as source modifiers or can be selected as
1386 // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for
1387 // targets without SALU float we still select them as VGPR since there would
1388 // be no real sgpr use.
1389 addRulesForGOpcs({G_FNEG, G_FABS}, Standard)
1390 .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat)
1391 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
1392 .Div(S16, {{Vgpr16}, {Vgpr16}})
1393 .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat)
1394 .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
1395 .Div(S32, {{Vgpr32}, {Vgpr32}})
1396 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1397 .Div(S64, {{Vgpr64}, {Vgpr64}})
1398 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat)
1399 .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat)
1400 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
1401 .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
1402 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
1403
1404 addRulesForGOpcs({G_FCANONICALIZE}, Standard)
1405 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1406 .Div(S32, {{Vgpr32}, {Vgpr32}})
1407 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1408 .Div(S16, {{Vgpr16}, {Vgpr16}})
1409 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1410 .Div(S64, {{Vgpr64}, {Vgpr64}})
1411 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
1412 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
1413 .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
1414 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
1415
1416 bool hasPST = ST->hasPseudoScalarTrans();
1417 addRulesForGOpcs({G_FSQRT}, Standard)
1418 .Div(S16, {{Vgpr16}, {Vgpr16}})
1419 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasPST)
1420 .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasPST);
1421
1422 addRulesForGOpcs({G_FPTOUI, G_FPTOSI})
1423 .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
1424 .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
1425 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1426 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat)
1427 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1428 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
1429 .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
1430 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1431 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1432 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}});
1433
1434 addRulesForGOpcs({G_UITOFP, G_SITOFP})
1435 .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
1436 .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
1437 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1438 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
1439 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1440 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
1441 .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
1442 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1443 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1444 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}});
1445
1446 addRulesForGOpcs({G_AMDGPU_S_BUFFER_PREFETCH})
1448
1449 addRulesForGOpcs({G_FPEXT})
1450 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1451 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1452 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}})
1453 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1454 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat);
1455
1456 addRulesForGOpcs({G_AMDGPU_CVT_PK_I16_I32}, Standard)
1457 .Uni(V2S16, {{UniInVgprV2S16}, {Vgpr32, Vgpr32}})
1458 .Div(V2S16, {{VgprV2S16}, {Vgpr32, Vgpr32}});
1459
1460 addRulesForGOpcs({G_AMDGPU_FMIN_LEGACY, G_AMDGPU_FMAX_LEGACY}, Standard)
1461 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1462 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
1463
1464 bool hasSALUMinimumMaximumInsts = ST->hasSALUMinimumMaximumInsts();
1465
1466 addRulesForGOpcs({G_FMINIMUM, G_FMAXIMUM}, Standard)
1467 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUMinimumMaximumInsts)
1468 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUMinimumMaximumInsts)
1469 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1470 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUMinimumMaximumInsts)
1471 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUMinimumMaximumInsts)
1472 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1473 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1474 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1476 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
1477
1478 addRulesForGOpcs({G_FMINNUM_IEEE, G_FMAXNUM_IEEE, G_FMINNUM, G_FMAXNUM,
1479 G_FMINIMUMNUM, G_FMAXIMUMNUM},
1480 Standard)
1481 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1482 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1483 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1484 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1486 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
1487 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1488 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1489 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1490 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1491
1492 addRulesForGOpcs({G_FPTRUNC})
1493 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1494 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1495 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
1497 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
1498 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1499 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat);
1500
1501 addRulesForGOpcs({G_IS_FPCLASS})
1502 .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}})
1503 .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}})
1504 .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}})
1505 .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}})
1506 .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}})
1507 .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}});
1508
1509 addRulesForGOpcs({G_FCMP}, Standard)
1510 .Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr16, Sgpr16}}},
1511 hasSALUFloat)
1512 .Any({{UniS1, _, S16}, {{UniInVcc}, {None, Vgpr16, Vgpr16}}},
1513 !hasSALUFloat)
1514 .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
1515 .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}},
1516 hasSALUFloat)
1517 .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}},
1518 !hasSALUFloat)
1519 .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
1520 .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
1521 .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
1522
1523 addRulesForGOpcs({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUNDEVEN, G_FFLOOR, G_FCEIL,
1524 G_FEXP2, G_FLOG2},
1525 Standard)
1526 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1527 .Div(S16, {{Vgpr16}, {Vgpr16}})
1528 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1529 .Div(S32, {{Vgpr32}, {Vgpr32}})
1530 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1531 .Div(S64, {{Vgpr64}, {Vgpr64}});
1532
1533 using namespace Intrinsic;
1534
1535 addRulesForIOpcs({returnaddress}).Any({{UniP0}, {{SgprP0}, {}}});
1536
1537 addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
1538
1539 addRulesForIOpcs({amdgcn_s_getreg}).Any({{}, {{Sgpr32}, {IntrId, Imm}}});
1540
1541 addRulesForIOpcs({amdgcn_s_setreg})
1542 .Any({{_, _, S32}, {{}, {IntrId, Imm, SgprB32_ReadFirstLane}}});
1543
1544 addRulesForIOpcs({amdgcn_s_sendmsg, amdgcn_s_sendmsghalt})
1545 .Any({{}, {{}, {IntrId, Imm, SgprB32_M0}}});
1546
1547 addRulesForIOpcs({amdgcn_s_sendmsg_rtn})
1548 .Any({{S32}, {{Sgpr32}, {}}})
1549 .Any({{S64}, {{Sgpr64}, {}}});
1550
1551 addRulesForIOpcs({amdgcn_s_memrealtime, amdgcn_s_memtime}, Standard)
1552 .Uni(S64, {{Sgpr64}, {IntrId}});
1553
1554 addRulesForIOpcs({amdgcn_groupstaticsize, amdgcn_pops_exiting_wave_id,
1555 amdgcn_reloc_constant, amdgcn_s_get_waveid_in_workgroup},
1556 Standard)
1557 .Uni(S32, {{Sgpr32}, {IntrId}});
1558
1559 // Intrinsics with no register operands.
1560 addRulesForIOpcs({amdgcn_asyncmark,
1561 amdgcn_endpgm,
1562 amdgcn_init_exec,
1563 amdgcn_s_barrier,
1564 amdgcn_s_barrier_leave,
1565 amdgcn_s_barrier_signal,
1566 amdgcn_s_barrier_wait,
1567 amdgcn_s_monitor_sleep,
1568 amdgcn_s_nop,
1569 amdgcn_s_sethalt,
1570 amdgcn_s_setprio,
1571 amdgcn_s_setprio_inc_wg,
1572 amdgcn_s_sleep,
1573 amdgcn_s_ttracedata_imm,
1574 amdgcn_s_wait_asynccnt,
1575 amdgcn_s_wait_bvhcnt,
1576 amdgcn_s_wait_dscnt,
1577 amdgcn_s_wait_event,
1578 amdgcn_s_wait_event_export_ready,
1579 amdgcn_s_wait_expcnt,
1580 amdgcn_s_wait_kmcnt,
1581 amdgcn_s_wait_loadcnt,
1582 amdgcn_s_wait_samplecnt,
1583 amdgcn_s_wait_storecnt,
1584 amdgcn_s_wait_tensorcnt,
1585 amdgcn_s_waitcnt,
1586 amdgcn_unreachable,
1587 amdgcn_wait_asyncmark,
1588 amdgcn_wave_barrier})
1589 .Any({{}, {{}, {}}});
1590
1591 addRulesForIOpcs({amdgcn_init_exec_from_input})
1592 .Any({{}, {{}, {IntrId, Sgpr32, Imm}}});
1593
1594 addRulesForIOpcs({amdgcn_s_ttracedata}).Any({{}, {{}, {IntrId, SgprB32_M0}}});
1595
1596 addRulesForIOpcs({amdgcn_s_sleep_var})
1597 .Any({{}, {{}, {IntrId, SgprB32_ReadFirstLane}}});
1598
1599 addRulesForIOpcs({amdgcn_s_barrier_join, amdgcn_s_wakeup_barrier})
1600 .Any({{}, {{}, {IntrId, SgprB32_M0}}});
1601
1602 addRulesForIOpcs({amdgcn_s_barrier_signal_var, amdgcn_s_barrier_init})
1603 .Any({{}, {{}, {IntrId, SgprB32_M0, SgprB32_M0}}});
1604
1605 addRulesForIOpcs({amdgcn_s_barrier_signal_isfirst})
1606 .Any({{UniS1}, {{Sgpr32Trunc}, {}}});
1607
1608 addRulesForIOpcs(
1609 {amdgcn_s_get_named_barrier_state, amdgcn_s_get_barrier_state}, Standard)
1610 .Uni(S32, {{Sgpr32}, {IntrId, SgprB32_M0}});
1611
1612 addRulesForIOpcs({amdgcn_flat_prefetch}).Any({{}, {{}, {IntrId, VgprP0}}});
1613
1614 addRulesForIOpcs({amdgcn_global_prefetch}).Any({{}, {{}, {IntrId, VgprP1}}});
1615
1616 addRulesForIOpcs({amdgcn_s_prefetch_data})
1618
1619 addRulesForIOpcs({amdgcn_class})
1620 .Any({{UniS1, _, S16}, {{UniInVcc}, {IntrId, Vgpr16, Vgpr32}}})
1621 .Any({{DivS1, _, S16}, {{Vcc}, {IntrId, Vgpr16, Vgpr32}}})
1622 .Any({{UniS1, _, S32}, {{UniInVcc}, {IntrId, Vgpr32, Vgpr32}}})
1623 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, Vgpr32, Vgpr32}}})
1624 .Any({{UniS1, _, S64}, {{UniInVcc}, {IntrId, Vgpr64, Vgpr32}}})
1625 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, Vgpr64, Vgpr32}}});
1626
1627 // This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir.
1628 addRulesForIOpcs({amdgcn_end_cf})
1629 .Any({{_, UniS32}, {{}, {IntrId, Sgpr32}}})
1630 .Any({{_, UniS64}, {{}, {IntrId, Sgpr64}}});
1631
1632 addRulesForIOpcs({amdgcn_if_break}, Standard)
1633 .Uni(S64, {{Sgpr64}, {IntrId, Vcc, Sgpr64}})
1634 .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
1635
1636 addRulesForIOpcs({amdgcn_exp})
1637 .Any({{_, _, _, S32, S32, S32, S32},
1638 {{}, {IntrId, Imm, Imm, Vgpr32, Vgpr32, Vgpr32, Vgpr32}}});
1639
1640 addRulesForIOpcs({amdgcn_exp_compr})
1641 .Any({{_, _, _, V2S16}, {{}, {IntrId, Imm, Imm, VgprV2S16, VgprV2S16}}});
1642
1643 addRulesForIOpcs({amdgcn_exp_row})
1644 .Any({{_, _, _, S32, S32, S32, S32, _, S32},
1645 {{},
1647 SgprB32_M0}}});
1648
1649 addRulesForIOpcs({amdgcn_lds_direct_load}, StandardB)
1650 .Div(B32, {{VgprB32}, {IntrId, SgprB32_M0}});
1651
1652 addRulesForIOpcs({amdgcn_lds_param_load}, Standard)
1653 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, SgprB32_M0}});
1654
1655 addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
1656 .Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});
1657
1658 addRulesForIOpcs({amdgcn_readfirstlane})
1659 .Any({{UniB32, _, DivB32}, {{}, {SgprB32, None, VgprB32}}})
1660 // this should not exist in the first place, it is from call lowering
1661 // readfirstlaning just in case register is not in sgpr.
1662 .Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}});
1663
1664 addRulesForIOpcs({amdgcn_readlane}, StandardB)
1666
1667 addRulesForIOpcs({amdgcn_writelane}, StandardB)
1668 .Div(B32,
1669 {{VgprB32},
1671
1672 addRulesForIOpcs({amdgcn_add_max_i32, amdgcn_add_max_u32, amdgcn_add_min_i32,
1673 amdgcn_add_min_u32},
1674 Standard)
1675 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1676 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1677
1678 addRulesForIOpcs({amdgcn_pk_add_max_i16, amdgcn_pk_add_max_u16,
1679 amdgcn_pk_add_min_i16, amdgcn_pk_add_min_u16},
1680 Standard)
1683
1684 addRulesForIOpcs({amdgcn_permlane16, amdgcn_permlanex16}, Standard)
1685 .Div(S32, {{Vgpr32},
1688
1689 addRulesForIOpcs({amdgcn_perm}, Standard)
1690 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1691 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1692
1693 addRulesForIOpcs(
1694 {amdgcn_wave_reduce_add, amdgcn_wave_reduce_and, amdgcn_wave_reduce_fadd,
1695 amdgcn_wave_reduce_fmax, amdgcn_wave_reduce_fmin,
1696 amdgcn_wave_reduce_fsub, amdgcn_wave_reduce_max, amdgcn_wave_reduce_min,
1697 amdgcn_wave_reduce_or, amdgcn_wave_reduce_sub, amdgcn_wave_reduce_umax,
1698 amdgcn_wave_reduce_umin, amdgcn_wave_reduce_xor},
1699 Standard)
1700 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
1701 .Div(S32, {{Sgpr32ToVgprDst}, {IntrId, VgprB32}})
1702 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64}})
1703 .Div(S64, {{Sgpr64ToVgprDst}, {IntrId, VgprB64}});
1704
1705 addRulesForIOpcs({amdgcn_bitop3, amdgcn_fmad_ftz}, Standard)
1706 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1707 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1708 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1709 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1710
1711 addRulesForIOpcs({amdgcn_udot4, amdgcn_sdot4, amdgcn_udot8, amdgcn_sdot8,
1712 amdgcn_dot4_f32_bf8_bf8, amdgcn_dot4_f32_bf8_fp8,
1713 amdgcn_dot4_f32_fp8_fp8, amdgcn_dot4_f32_fp8_bf8},
1714 Standard)
1715 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1716 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1717
1718 addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard)
1719 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1720 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
1721 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr32, Vgpr32}})
1722 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32, Vgpr32}});
1723
1724 addRulesForIOpcs({amdgcn_ds_bpermute, amdgcn_ds_bpermute_fi_b32,
1725 amdgcn_ds_permute, amdgcn_fmul_legacy, amdgcn_mulhi_i24,
1726 amdgcn_mulhi_u24},
1727 Standard)
1728 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1729 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1730
1731 addRulesForIOpcs({amdgcn_cubesc, amdgcn_cubetc, amdgcn_cubema, amdgcn_cubeid,
1732 amdgcn_fma_legacy},
1733 Standard)
1734 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1735 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1736
1737 addRulesForIOpcs({amdgcn_frexp_mant, amdgcn_fract}, Standard)
1738 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
1739 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1740 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
1741 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1742 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}})
1743 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}});
1744
1745 addRulesForIOpcs({amdgcn_prng_b32})
1746 .Any({{UniS32}, {{UniInVgprS32}, {IntrId, Vgpr32}}})
1747 .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32}}});
1748
1749 addRulesForIOpcs({amdgcn_sffbh}, Standard)
1750 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
1751 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
1752
1753 addRulesForIOpcs({amdgcn_ubfe, amdgcn_sbfe}, Standard)
1754 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1755 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32, Sgpr32, Sgpr32}, S_BFE})
1756 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64, Sgpr32, Sgpr32}, S_BFE})
1757 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32, Vgpr32}, V_BFE});
1758
1759 addRulesForIOpcs({amdgcn_cvt_pk_i16, amdgcn_cvt_pk_u16, amdgcn_cvt_pknorm_i16,
1760 amdgcn_cvt_pknorm_u16, amdgcn_cvt_pkrtz},
1761 Standard)
1762 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32}})
1763 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32}});
1764
1765 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk32_bf6_f16,
1766 amdgcn_cvt_scalef32_sr_pk32_fp6_f16,
1767 amdgcn_cvt_scalef32_sr_pk32_bf6_bf16,
1768 amdgcn_cvt_scalef32_sr_pk32_fp6_bf16},
1769 Standard)
1771
1772 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk32_bf6_f32,
1773 amdgcn_cvt_scalef32_sr_pk32_fp6_f32},
1774 Standard)
1776
1777 addRulesForIOpcs({amdgcn_global_load_tr_b64})
1778 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
1779 .Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
1780 .Any({{DivB32, _, UniP1}, {{VgprB32}, {IntrId, SgprP1}}})
1781 .Any({{DivB32, _, DivP1}, {{VgprB32}, {IntrId, VgprP1}}});
1782
1783 addRulesForIOpcs({amdgcn_global_load_tr_b128})
1784 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
1785 .Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
1786 .Any({{DivB128, _, UniP1}, {{VgprB128}, {IntrId, SgprP1}}})
1787 .Any({{DivB128, _, DivP1}, {{VgprB128}, {IntrId, VgprP1}}});
1788
1789 addRulesForIOpcs({amdgcn_global_load_tr4_b64})
1790 .Any({{DivV2S32, _, UniP1}, {{VgprV2S32}, {IntrId, SgprP1}}})
1791 .Any({{DivV2S32, _, DivP1}, {{VgprV2S32}, {IntrId, VgprP1}}});
1792
1793 addRulesForIOpcs({amdgcn_global_load_tr6_b96})
1794 .Any({{DivV3S32, _, UniP1}, {{VgprV3S32}, {IntrId, SgprP1}}})
1795 .Any({{DivV3S32, _, DivP1}, {{VgprV3S32}, {IntrId, VgprP1}}});
1796
1797 addRulesForIOpcs({amdgcn_ds_load_tr4_b64, amdgcn_ds_load_tr8_b64})
1798 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
1799
1800 addRulesForIOpcs({amdgcn_ds_load_tr6_b96})
1801 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
1802
1803 addRulesForIOpcs({amdgcn_ds_load_tr16_b128})
1804 .Any({{DivB128}, {{VgprB128}, {IntrId, VgprP3}}});
1805
1806 addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64})
1807 .Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}});
1808
1809 addRulesForIOpcs(
1810 {amdgcn_global_atomic_fmin_num, amdgcn_global_atomic_fmax_num}, Standard)
1811 .Div(S32, {{Vgpr32}, {IntrId, VgprP1, Vgpr32}});
1812
1813 addRulesForIOpcs({amdgcn_flat_atomic_fmin_num, amdgcn_flat_atomic_fmax_num},
1814 Standard)
1815 .Div(S32, {{Vgpr32}, {IntrId, VgprP0, Vgpr32}});
1816
1817 addRulesForIOpcs({amdgcn_raw_buffer_load_lds})
1818 .Any({{_}, {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Sgpr32}}});
1819
1820 addRulesForIOpcs({amdgcn_struct_buffer_load_lds})
1821 .Any({{_},
1822 {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
1823
1824 addRulesForIOpcs({amdgcn_raw_ptr_buffer_load_lds})
1825 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Sgpr32}}});
1826
1827 addRulesForIOpcs({amdgcn_struct_ptr_buffer_load_lds})
1828 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
1829
1830 addRulesForIOpcs({amdgcn_global_load_lds})
1831 .Any({{}, {{}, {IntrId, VgprP1, SgprB32_M0}}});
1832
1833 addRulesForIOpcs({amdgcn_global_load_async_to_lds_b8,
1834 amdgcn_global_load_async_to_lds_b32,
1835 amdgcn_global_load_async_to_lds_b64,
1836 amdgcn_global_load_async_to_lds_b128,
1837 amdgcn_global_store_async_from_lds_b8,
1838 amdgcn_global_store_async_from_lds_b32,
1839 amdgcn_global_store_async_from_lds_b64,
1840 amdgcn_global_store_async_from_lds_b128})
1841 .Any({{}, {{}, {IntrId, VgprP1, VgprP3}}});
1842
1843 addRulesForIOpcs({amdgcn_perm_pk16_b4_u4}, StandardB)
1844 .Uni(B64, {{UniInVgprB64}, {IntrId, Vgpr32, Vgpr32, VgprV2S32}})
1845 .Div(B64, {{VgprB64}, {IntrId, Vgpr32, Vgpr32, VgprV2S32}});
1846
1847 addRulesForIOpcs({amdgcn_perm_pk16_b6_u4}, StandardB)
1849 .Div(B96, {{VgprB96}, {IntrId, Vgpr32, VgprB64, VgprV2S32}});
1850
1851 addRulesForIOpcs({amdgcn_perm_pk16_b8_u4}, StandardB)
1853 .Div(B128, {{VgprB128}, {IntrId, VgprB64, VgprB64, VgprV2S32}});
1854
1855 addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm, amdgcn_wqm, amdgcn_softwqm,
1856 amdgcn_strict_wqm},
1857 StandardB)
1858 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
1859 .Uni(B32, {{SgprB32}, {IntrId, SgprB32}})
1860 .Div(B64, {{VgprB64}, {IntrId, VgprB64}})
1861 .Uni(B64, {{SgprB64}, {IntrId, SgprB64}})
1862 .Div(B96, {{VgprB96}, {IntrId, VgprB96}})
1863 .Uni(B96, {{SgprB96}, {IntrId, SgprB96}})
1864 .Div(B128, {{VgprB128}, {IntrId, VgprB128}})
1865 .Uni(B128, {{SgprB128}, {IntrId, SgprB128}})
1866 .Any({{UniB256}, {{SgprB256}, {IntrId, SgprB256}}})
1867 .Any({{DivB256}, {{VgprB256}, {IntrId, VgprB256}}})
1868 .Any({{UniB512}, {{SgprB512}, {IntrId, SgprB512}}})
1869 .Any({{DivB512}, {{VgprB512}, {IntrId, VgprB512}}});
1870
1871 addRulesForIOpcs({amdgcn_wqm_demote}).Any({{}, {{}, {IntrId, Vcc}}});
1872
1873 addRulesForIOpcs({amdgcn_ballot}, Standard)
1874 .Uni(S64, {{Sgpr64}, {IntrId, Vcc}})
1875 .Uni(S32, {{Sgpr32}, {IntrId, Vcc}});
1876
1877 addRulesForIOpcs({amdgcn_inverse_ballot})
1878 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, SgprB32_ReadFirstLane}}})
1879 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, SgprB64_ReadFirstLane}}});
1880
1881 addRulesForIOpcs({amdgcn_live_mask, amdgcn_ps_live})
1882 .Any({{DivS1}, {{Vcc}, {}}});
1883
1884 addRulesForIOpcs({amdgcn_mov_dpp, amdgcn_mov_dpp8}, StandardB)
1885 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
1886 .Div(B64, {{VgprB64}, {IntrId, VgprB64}});
1887
1888 addRulesForIOpcs({amdgcn_update_dpp}, StandardB)
1889 .Div(B32, {{VgprB32}, {IntrId, VgprB32, VgprB32}})
1890 .Div(B64, {{VgprB64}, {IntrId, VgprB64, VgprB64}});
1891
1892 addRulesForIOpcs({amdgcn_sin, amdgcn_cos}, Standard)
1893 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1894 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
1895 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1896 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}});
1897
1898 addRulesForIOpcs({amdgcn_trig_preop}, Standard)
1899 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32}})
1900 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr32}});
1901
1902 addRulesForIOpcs({amdgcn_exp2}, Standard)
1903 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1904 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
1905 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
1906 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1907 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
1908 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST);
1909
1910 addRulesForIOpcs({amdgcn_ds_atomic_async_barrier_arrive_b64})
1911 .Any({{}, {{}, {IntrId, VgprP3}}});
1912
1913 addRulesForIOpcs({amdgcn_ds_atomic_barrier_arrive_rtn_b64}, Standard)
1914 .Div(S64, {{Vgpr64}, {IntrId, VgprP3, Vgpr64}});
1915
1916 addRulesForIOpcs({amdgcn_ds_add_gs_reg_rtn, amdgcn_ds_sub_gs_reg_rtn},
1917 Standard)
1918 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1919 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32}});
1920
1921 addRulesForIOpcs({amdgcn_ds_append, amdgcn_ds_consume}, Standard)
1922 .Uni(S32, {{UniInVgprS32}, {IntrId, SgprB32_M0}})
1923 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0}});
1924
1925 addRulesForIOpcs(
1926 {amdgcn_ds_bvh_stack_rtn, amdgcn_ds_bvh_stack_push4_pop1_rtn}, Standard)
1927 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV4S32}});
1928
1929 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop1_rtn}, Standard)
1930 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
1931
1932 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop2_rtn}, Standard)
1933 .Div(S64, {{Vgpr64, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
1934
1935 addRulesForIOpcs({amdgcn_ds_gws_sema_p, amdgcn_ds_gws_sema_v,
1936 amdgcn_ds_gws_sema_release_all})
1937 .Any({{}, {{}, {IntrId, SgprB32_M0}}});
1938
1939 addRulesForIOpcs(
1940 {amdgcn_ds_gws_barrier, amdgcn_ds_gws_init, amdgcn_ds_gws_sema_br})
1941 .Any({{}, {{}, {IntrId, Vgpr32, SgprB32_M0}}});
1942
1943 addRulesForIOpcs({amdgcn_ds_ordered_add, amdgcn_ds_ordered_swap}, Standard)
1944 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0, Vgpr32}});
1945
1946 addRulesForIOpcs({amdgcn_ds_swizzle}, Standard)
1947 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
1948 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
1949
1950 addRulesForIOpcs({amdgcn_permlane16_var, amdgcn_permlanex16_var}, Standard)
1951 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1952
1953 addRulesForIOpcs({amdgcn_permlane16_swap, amdgcn_permlane32_swap}, Standard)
1954 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1955
1956 addRulesForIOpcs({amdgcn_permlane64}, StandardB)
1957 .Div(B32, {{VgprB32}, {IntrId, VgprB32}});
1958
1959 addRulesForIOpcs({amdgcn_ds_read_tr4_b64, amdgcn_ds_read_tr8_b64})
1960 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
1961
1962 addRulesForIOpcs({amdgcn_ds_read_tr6_b96})
1963 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
1964
1965 addRulesForIOpcs({amdgcn_ds_read_tr16_b64})
1966 .Any({{DivV4S16}, {{VgprV4S16}, {IntrId, VgprP3}}});
1967
1968 addRulesForIOpcs({amdgcn_interp_p1}, Standard)
1969 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Imm, Imm, SgprB32_M0}});
1970
1971 addRulesForIOpcs({amdgcn_interp_p1_f16}, Standard)
1972 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Imm, Imm, Imm, SgprB32_M0}});
1973
1974 addRulesForIOpcs({amdgcn_interp_p2}, Standard)
1975 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Imm, Imm, SgprB32_M0}});
1976
1977 addRulesForIOpcs({amdgcn_interp_p2_f16}, Standard)
1978 .Div(S16,
1980
1981 addRulesForIOpcs({amdgcn_interp_mov}, Standard)
1982 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, Imm, SgprB32_M0}});
1983
1984 addRulesForIOpcs({amdgcn_interp_inreg_p10, amdgcn_interp_inreg_p2,
1985 amdgcn_interp_inreg_p10_f16, amdgcn_interp_p10_rtz_f16},
1986 Standard)
1987 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1988 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1989
1990 addRulesForIOpcs({amdgcn_interp_inreg_p2_f16, amdgcn_interp_p2_rtz_f16},
1991 Standard)
1992 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1993 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1994
1995 addRulesForIOpcs({amdgcn_div_fmas}, Standard)
1996 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
1997 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
1998 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}})
1999 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}});
2000
2001 addRulesForIOpcs({amdgcn_div_fixup}, Standard)
2002 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
2003 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
2004 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2005 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2006 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}})
2007 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}});
2008
2009 addRulesForIOpcs({amdgcn_div_scale}, Standard)
2010 .Div(S32, {{Vgpr32, Vcc}, {IntrId, Vgpr32, Vgpr32}})
2011 .Uni(S32, {{UniInVgprS32, UniInVcc}, {IntrId, Vgpr32, Vgpr32}})
2012 .Div(S64, {{Vgpr64, Vcc}, {IntrId, Vgpr64, Vgpr64}})
2013 .Uni(S64, {{UniInVgprS64, UniInVcc}, {IntrId, Vgpr64, Vgpr64}});
2014
2015 addRulesForIOpcs({amdgcn_fdot2, amdgcn_sdot2, amdgcn_udot2}, Standard)
2017 .Div(S32, {{Vgpr32}, {IntrId, VgprV2S16, VgprV2S16, Vgpr32}});
2018
2019 addRulesForIOpcs({amdgcn_fdot2_f16_f16}, Standard)
2021 .Div(S16, {{Vgpr16}, {IntrId, VgprV2S16, VgprV2S16, Vgpr16}});
2022
2023 addRulesForIOpcs({amdgcn_sudot4, amdgcn_sudot8}, Standard)
2024 .Uni(S32, {{UniInVgprS32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}})
2025 .Div(S32, {{Vgpr32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}});
2026
2027 addRulesForIOpcs({amdgcn_s_alloc_vgpr})
2029
2030 addRulesForIOpcs({amdgcn_sat_pk4_i4_i8, amdgcn_sat_pk4_u4_u8}, Standard)
2031 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32}})
2032 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32}});
2033
2034 // TODO: Add handling for GFX90A+ which should use VGPRs instead of AGPRs.
2035 bool HasGFX90AInsts = ST->hasGFX90AInsts();
2036 addRulesForIOpcs({amdgcn_mfma_f32_32x32x1f32, amdgcn_mfma_f32_16x16x1f32,
2037 amdgcn_mfma_f32_4x4x1f32, amdgcn_mfma_f32_32x32x2f32,
2038 amdgcn_mfma_f32_16x16x4f32, amdgcn_mfma_f32_32x32x4f16,
2039 amdgcn_mfma_f32_16x16x4f16, amdgcn_mfma_f32_4x4x4f16,
2040 amdgcn_mfma_f32_32x32x8f16, amdgcn_mfma_f32_16x16x16f16,
2041 amdgcn_mfma_i32_32x32x4i8, amdgcn_mfma_i32_16x16x4i8,
2042 amdgcn_mfma_i32_4x4x4i8, amdgcn_mfma_i32_32x32x8i8,
2043 amdgcn_mfma_i32_16x16x16i8, amdgcn_mfma_f32_32x32x2bf16,
2044 amdgcn_mfma_f32_16x16x2bf16, amdgcn_mfma_f32_4x4x2bf16,
2045 amdgcn_mfma_f32_32x32x4bf16, amdgcn_mfma_f32_16x16x8bf16})
2046 .Any({{DivAnyTy},
2048 !HasGFX90AInsts);
2049
2050 // WMMA/SWMMAC intrinsics: all register operands map to VGPR.
2051 addRulesForIOpcs(
2052 {// WMMA GFX11+
2053 amdgcn_wmma_f32_16x16x16_f16, amdgcn_wmma_f32_16x16x16_bf16,
2054 amdgcn_wmma_f16_16x16x16_f16, amdgcn_wmma_bf16_16x16x16_bf16,
2055 amdgcn_wmma_f16_16x16x16_f16_tied, amdgcn_wmma_bf16_16x16x16_bf16_tied,
2056 amdgcn_wmma_i32_16x16x16_iu8, amdgcn_wmma_i32_16x16x16_iu4,
2057 // WMMA GFX12
2058 amdgcn_wmma_f32_16x16x16_fp8_fp8, amdgcn_wmma_f32_16x16x16_fp8_bf8,
2059 amdgcn_wmma_f32_16x16x16_bf8_fp8, amdgcn_wmma_f32_16x16x16_bf8_bf8,
2060 amdgcn_wmma_i32_16x16x32_iu4,
2061 // WMMA GFX1250
2062 amdgcn_wmma_f32_16x16x4_f32, amdgcn_wmma_f32_16x16x32_bf16,
2063 amdgcn_wmma_f32_16x16x32_f16, amdgcn_wmma_f16_16x16x32_f16,
2064 amdgcn_wmma_bf16_16x16x32_bf16, amdgcn_wmma_bf16f32_16x16x32_bf16,
2065 amdgcn_wmma_f32_16x16x64_fp8_fp8, amdgcn_wmma_f32_16x16x64_fp8_bf8,
2066 amdgcn_wmma_f32_16x16x64_bf8_fp8, amdgcn_wmma_f32_16x16x64_bf8_bf8,
2067 amdgcn_wmma_f16_16x16x64_fp8_fp8, amdgcn_wmma_f16_16x16x64_fp8_bf8,
2068 amdgcn_wmma_f16_16x16x64_bf8_fp8, amdgcn_wmma_f16_16x16x64_bf8_bf8,
2069 amdgcn_wmma_f16_16x16x128_fp8_fp8, amdgcn_wmma_f16_16x16x128_fp8_bf8,
2070 amdgcn_wmma_f16_16x16x128_bf8_fp8, amdgcn_wmma_f16_16x16x128_bf8_bf8,
2071 amdgcn_wmma_f32_16x16x128_fp8_fp8, amdgcn_wmma_f32_16x16x128_fp8_bf8,
2072 amdgcn_wmma_f32_16x16x128_bf8_fp8, amdgcn_wmma_f32_16x16x128_bf8_bf8,
2073 amdgcn_wmma_i32_16x16x64_iu8, amdgcn_wmma_f32_16x16x128_f8f6f4,
2074 amdgcn_wmma_scale_f32_16x16x128_f8f6f4,
2075 amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, amdgcn_wmma_f32_32x16x128_f4,
2076 amdgcn_wmma_scale_f32_32x16x128_f4, amdgcn_wmma_scale16_f32_32x16x128_f4,
2077 // SWMMAC GFX12
2078 amdgcn_swmmac_f32_16x16x32_f16, amdgcn_swmmac_f32_16x16x32_bf16,
2079 amdgcn_swmmac_f16_16x16x32_f16, amdgcn_swmmac_bf16_16x16x32_bf16,
2080 amdgcn_swmmac_i32_16x16x32_iu8, amdgcn_swmmac_i32_16x16x32_iu4,
2081 amdgcn_swmmac_i32_16x16x64_iu4, amdgcn_swmmac_f32_16x16x32_fp8_fp8,
2082 amdgcn_swmmac_f32_16x16x32_fp8_bf8, amdgcn_swmmac_f32_16x16x32_bf8_fp8,
2083 amdgcn_swmmac_f32_16x16x32_bf8_bf8,
2084 // SWMMAC GFX1250
2085 amdgcn_swmmac_f32_16x16x64_f16, amdgcn_swmmac_f32_16x16x64_bf16,
2086 amdgcn_swmmac_f16_16x16x64_f16, amdgcn_swmmac_bf16_16x16x64_bf16,
2087 amdgcn_swmmac_bf16f32_16x16x64_bf16, amdgcn_swmmac_f32_16x16x128_fp8_fp8,
2088 amdgcn_swmmac_f32_16x16x128_fp8_bf8, amdgcn_swmmac_f32_16x16x128_bf8_fp8,
2089 amdgcn_swmmac_f32_16x16x128_bf8_bf8, amdgcn_swmmac_f16_16x16x128_fp8_fp8,
2090 amdgcn_swmmac_f16_16x16x128_fp8_bf8, amdgcn_swmmac_f16_16x16x128_bf8_fp8,
2091 amdgcn_swmmac_f16_16x16x128_bf8_bf8, amdgcn_swmmac_i32_16x16x128_iu8})
2092 .Any({{}, {{}, {}, ApplyAllVgpr}});
2093
2094} // end initialize rules
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU address space definition.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
constexpr LLT S16
constexpr LLT S1
constexpr LLT V2S16
constexpr LLT S32
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT S64
constexpr LLT V2S32
constexpr LLT S128
UniformityLLTOpPredicateID LLTToBId(LLT Ty)
bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, const MachineUniformityInfo &MUI, const MachineRegisterInfo &MRI)
UniformityLLTOpPredicateID LLTToId(LLT Ty)
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define _
IRTranslator LLVM IR MI
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
bool operator()(const MachineInstr &MI) const
Predicate operator||(const Predicate &RHS) const
Predicate operator&&(const Predicate &RHS) const
Predicate(std::function< bool(const MachineInstr &)> Pred)
Predicate operator!() const
RegBankLegalizeRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
const SetOfRulesForOpcode * getRulesForOpc(MachineInstr &MI) const
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
void addFastRuleDivergent(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs)
void addFastRuleUniform(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs)
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
bool isSigned() const
Definition InstrTypes.h:930
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
bool isUniform(ConstValueRefT V) const
Whether V is uniform/non-divergent.
bool isEquality() const
Return true if this predicate is either EQ or NE.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
TypeSize getValue() const
Representation of each machine instruction.
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const TargetRegisterInfo * getTargetRegisterInfo() const
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
bool isAnyPtr(LLT Ty, unsigned Width)
bool isUniformMMO(const MachineMemOperand *MMO)
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
SmallVector< UniformityLLTOpPredicateID, 4 > OpUniformityAndTypes
PredicateMapping(std::initializer_list< UniformityLLTOpPredicateID > OpList, std::function< bool(const MachineInstr &)> TestFunc=nullptr)
bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI, const MachineRegisterInfo &MRI) const
std::function< bool(const MachineInstr &)> TestFunc
RegBankLLTMapping(std::initializer_list< RegBankLLTMappingApplyID > DstOpMappingList, std::initializer_list< RegBankLLTMappingApplyID > SrcOpMappingList, LoweringMethodID LoweringMethod=DoNotLower)
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39