LLVM 23.0.0git
AMDGPURegBankLegalizeRules.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeRules.cpp ------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Definitions of RegBankLegalize Rules for all opcodes.
10/// Implementation of container for all the Rules and search.
11/// Fast search for most common case when Rule.Predicate checks LLT and
12/// uniformity of register in operand 0.
13//
14//===----------------------------------------------------------------------===//
15
17#include "AMDGPUInstrInfo.h"
18#include "GCNSubtarget.h"
21#include "llvm/IR/IntrinsicsAMDGPU.h"
23
24#define DEBUG_TYPE "amdgpu-regbanklegalize"
25
26using namespace llvm;
27using namespace AMDGPU;
28
29bool AMDGPU::isAnyPtr(LLT Ty, unsigned Width) {
30 return Ty.isPointer() && Ty.getSizeInBits() == Width;
31}
32
34 std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList,
35 std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList,
37 : DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList),
39
41 std::initializer_list<UniformityLLTOpPredicateID> OpList,
42 std::function<bool(const MachineInstr &)> TestFunc)
44
46 const MachineUniformityInfo &MUI,
47 const MachineRegisterInfo &MRI) {
48 switch (UniID) {
49 case S1:
50 return MRI.getType(Reg) == LLT::scalar(1);
51 case S16:
52 return MRI.getType(Reg) == LLT::scalar(16);
53 case S32:
54 return MRI.getType(Reg) == LLT::scalar(32);
55 case S64:
56 return MRI.getType(Reg) == LLT::scalar(64);
57 case S128:
58 return MRI.getType(Reg) == LLT::scalar(128);
59 case P0:
60 return MRI.getType(Reg) == LLT::pointer(0, 64);
61 case P1:
62 return MRI.getType(Reg) == LLT::pointer(1, 64);
63 case P2:
64 return MRI.getType(Reg) == LLT::pointer(2, 32);
65 case P3:
66 return MRI.getType(Reg) == LLT::pointer(3, 32);
67 case P4:
68 return MRI.getType(Reg) == LLT::pointer(4, 64);
69 case P5:
70 return MRI.getType(Reg) == LLT::pointer(5, 32);
71 case P8:
72 return MRI.getType(Reg) == LLT::pointer(8, 128);
73 case Ptr32:
74 return isAnyPtr(MRI.getType(Reg), 32);
75 case Ptr64:
76 return isAnyPtr(MRI.getType(Reg), 64);
77 case Ptr128:
78 return isAnyPtr(MRI.getType(Reg), 128);
79 case V2S16:
80 return MRI.getType(Reg) == LLT::fixed_vector(2, 16);
81 case V2S32:
82 return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
83 case V3S32:
84 return MRI.getType(Reg) == LLT::fixed_vector(3, 32);
85 case V4S32:
86 return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
87 case B32:
88 return MRI.getType(Reg).getSizeInBits() == 32;
89 case B64:
90 return MRI.getType(Reg).getSizeInBits() == 64;
91 case B96:
92 return MRI.getType(Reg).getSizeInBits() == 96;
93 case B128:
94 return MRI.getType(Reg).getSizeInBits() == 128;
95 case B160:
96 return MRI.getType(Reg).getSizeInBits() == 160;
97 case B256:
98 return MRI.getType(Reg).getSizeInBits() == 256;
99 case B512:
100 return MRI.getType(Reg).getSizeInBits() == 512;
101 case DivAnyTy:
102 return MUI.isDivergentAtDef(Reg);
103 case UniS1:
104 return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniformAtDef(Reg);
105 case UniS16:
106 return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniformAtDef(Reg);
107 case UniS32:
108 return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniformAtDef(Reg);
109 case UniS64:
110 return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniformAtDef(Reg);
111 case UniS128:
112 return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniformAtDef(Reg);
113 case UniP0:
114 return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniformAtDef(Reg);
115 case UniP1:
116 return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniformAtDef(Reg);
117 case UniP2:
118 return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniformAtDef(Reg);
119 case UniP3:
120 return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniformAtDef(Reg);
121 case UniP4:
122 return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniformAtDef(Reg);
123 case UniP5:
124 return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniformAtDef(Reg);
125 case UniP6:
126 return MRI.getType(Reg) == LLT::pointer(6, 32) && MUI.isUniformAtDef(Reg);
127 case UniP8:
128 return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniformAtDef(Reg);
129 case UniPtr32:
130 return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniformAtDef(Reg);
131 case UniPtr64:
132 return isAnyPtr(MRI.getType(Reg), 64) && MUI.isUniformAtDef(Reg);
133 case UniPtr128:
134 return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniformAtDef(Reg);
135 case UniV2S16:
136 return MRI.getType(Reg) == LLT::fixed_vector(2, 16) &&
137 MUI.isUniformAtDef(Reg);
138 case UniV2S32:
139 return MRI.getType(Reg) == LLT::fixed_vector(2, 32) &&
140 MUI.isUniformAtDef(Reg);
141 case UniV3S32:
142 return MRI.getType(Reg) == LLT::fixed_vector(3, 32) &&
143 MUI.isUniformAtDef(Reg);
144 case UniV4S32:
145 return MRI.getType(Reg) == LLT::fixed_vector(4, 32) &&
146 MUI.isUniformAtDef(Reg);
147 case UniV6S32:
148 return MRI.getType(Reg) == LLT::fixed_vector(6, 32) &&
149 MUI.isUniformAtDef(Reg);
150 case UniV8S16:
151 return MRI.getType(Reg) == LLT::fixed_vector(8, 16) &&
152 MUI.isUniformAtDef(Reg);
153 case UniV8S32:
154 return MRI.getType(Reg) == LLT::fixed_vector(8, 32) &&
155 MUI.isUniformAtDef(Reg);
156 case UniV16S16:
157 return MRI.getType(Reg) == LLT::fixed_vector(16, 16) &&
158 MUI.isUniformAtDef(Reg);
159 case UniV16S32:
160 return MRI.getType(Reg) == LLT::fixed_vector(16, 32) &&
161 MUI.isUniformAtDef(Reg);
162 case UniV32S16:
163 return MRI.getType(Reg) == LLT::fixed_vector(32, 16) &&
164 MUI.isUniformAtDef(Reg);
165 case UniV32S32:
166 return MRI.getType(Reg) == LLT::fixed_vector(32, 32) &&
167 MUI.isUniformAtDef(Reg);
168 case UniV2S64:
169 return MRI.getType(Reg) == LLT::fixed_vector(2, 64) &&
170 MUI.isUniformAtDef(Reg);
171 case UniB32:
172 return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniformAtDef(Reg);
173 case UniB64:
174 return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniformAtDef(Reg);
175 case UniB96:
176 return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniformAtDef(Reg);
177 case UniB128:
178 return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniformAtDef(Reg);
179 case UniB160:
180 return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniformAtDef(Reg);
181 case UniB256:
182 return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniformAtDef(Reg);
183 case UniB512:
184 return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniformAtDef(Reg);
185 case UniBRC: {
186 if (MUI.isDivergentAtDef(Reg))
187 return false;
188 // Check if there is SGPR register class of same size as the LLT.
189 const SIRegisterInfo *TRI =
190 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
191 // There is no 16 bit SGPR register class. Extra size check is required
192 // since getSGPRClassForBitWidth returns SReg_32RegClass for Size 16.
193 unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
194 return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize);
195 }
196 case DivS1:
197 return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergentAtDef(Reg);
198 case DivS16:
199 return MRI.getType(Reg) == LLT::scalar(16) && MUI.isDivergentAtDef(Reg);
200 case DivS32:
201 return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergentAtDef(Reg);
202 case DivS64:
203 return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergentAtDef(Reg);
204 case DivS128:
205 return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergentAtDef(Reg);
206 case DivP0:
207 return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergentAtDef(Reg);
208 case DivP1:
209 return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergentAtDef(Reg);
210 case DivP2:
211 return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergentAtDef(Reg);
212 case DivP3:
213 return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergentAtDef(Reg);
214 case DivP4:
215 return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergentAtDef(Reg);
216 case DivP5:
217 return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergentAtDef(Reg);
218 case DivPtr32:
219 return isAnyPtr(MRI.getType(Reg), 32) && MUI.isDivergentAtDef(Reg);
220 case DivPtr64:
221 return isAnyPtr(MRI.getType(Reg), 64) && MUI.isDivergentAtDef(Reg);
222 case DivPtr128:
223 return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergentAtDef(Reg);
224 case DivV2S16:
225 return MRI.getType(Reg) == LLT::fixed_vector(2, 16) &&
227 case DivV2S32:
228 return MRI.getType(Reg) == LLT::fixed_vector(2, 32) &&
230 case DivV4S32:
231 return MRI.getType(Reg) == LLT::fixed_vector(4, 32) &&
233 case DivV2S64:
234 return MRI.getType(Reg) == LLT::fixed_vector(2, 64) &&
236 case DivV3S32:
237 return MRI.getType(Reg) == LLT::fixed_vector(3, 32) &&
239 case DivV4S16:
240 return MRI.getType(Reg) == LLT::fixed_vector(4, 16) &&
242 case DivV8S16:
243 return MRI.getType(Reg) == LLT::fixed_vector(8, 16) &&
245 case DivV8S32:
246 return MRI.getType(Reg) == LLT::fixed_vector(8, 32) &&
248 case DivV16S16:
249 return MRI.getType(Reg) == LLT::fixed_vector(16, 16) &&
251 case DivV16S32:
252 return MRI.getType(Reg) == LLT::fixed_vector(16, 32) &&
254 case DivV6S32:
255 return MRI.getType(Reg) == LLT::fixed_vector(6, 32) &&
257 case DivV32S16:
258 return MRI.getType(Reg) == LLT::fixed_vector(32, 16) &&
260 case DivV32S32:
261 return MRI.getType(Reg) == LLT::fixed_vector(32, 32) &&
263 case DivB32:
264 return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergentAtDef(Reg);
265 case DivB64:
266 return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergentAtDef(Reg);
267 case DivB96:
268 return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergentAtDef(Reg);
269 case DivB128:
270 return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergentAtDef(Reg);
271 case DivB160:
272 return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergentAtDef(Reg);
273 case DivB256:
274 return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergentAtDef(Reg);
275 case DivB512:
276 return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergentAtDef(Reg);
277 case DivBRC: {
278 if (MUI.isUniformAtDef(Reg))
279 return false;
280 // Check if there is VGPR register class of same size as the LLT.
281 const SIRegisterInfo *TRI =
282 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
283 return TRI->getSGPRClassForBitWidth(MRI.getType(Reg).getSizeInBits());
284 }
285 case BRC: {
286 // Check if there is SGPR and VGPR register class of same size as the LLT.
287 const SIRegisterInfo *TRI =
288 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
289 unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
290 return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize) &&
291 TRI->getVGPRClassForBitWidth(LLTSize);
292 }
293 case _:
294 return true;
295 default:
296 llvm_unreachable("missing matchUniformityAndLLT");
297 }
298}
299
301 const MachineUniformityInfo &MUI,
302 const MachineRegisterInfo &MRI) const {
303 // Check LLT signature.
304 for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) {
305 const MachineOperand &MO = MI.getOperand(i);
306 if (OpUniformityAndTypes[i] == _) {
307 assert((!MI.getOperand(i).isReg() ||
308 !MI.getOperand(i).getReg().isVirtual()) &&
309 "_ is for non-register and physical register operands only");
310 continue;
311 }
312
313 // Remaining IDs check registers.
314 if (!MO.isReg())
315 return false;
316
317 if (!matchUniformityAndLLT(MO.getReg(), OpUniformityAndTypes[i], MUI, MRI))
318 return false;
319 }
320
321 // More complex check.
322 if (TestFunc)
323 return TestFunc(MI);
324
325 return true;
326}
327
329
331 : FastTypes(FastTypes) {}
332
334 if (Ty == LLT::scalar(16))
335 return S16;
336 if (Ty == LLT::scalar(32))
337 return S32;
338 if (Ty == LLT::scalar(64))
339 return S64;
340 if (Ty == LLT::fixed_vector(2, 16))
341 return V2S16;
342 if (Ty == LLT::fixed_vector(2, 32))
343 return V2S32;
344 if (Ty == LLT::fixed_vector(3, 32))
345 return V3S32;
346 if (Ty == LLT::fixed_vector(4, 32))
347 return V4S32;
348 return _;
349}
350
352 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
353 isAnyPtr(Ty, 32))
354 return B32;
355 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
356 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
357 return B64;
358 if (Ty == LLT::fixed_vector(3, 32))
359 return B96;
360 if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) ||
361 Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128))
362 return B128;
363 return _;
364}
365
366const RegBankLLTMapping *
368 const MachineRegisterInfo &MRI,
369 const MachineUniformityInfo &MUI) const {
370 // Search in "Fast Rules".
371 // Note: if fast rules are enabled, RegBankLLTMapping must be added in each
372 // slot that could "match fast Predicate". If not, InvalidMapping is
373 // returned which results in failure, does not search "Slow Rules".
374 if (FastTypes != NoFastRules) {
375 Register Reg = MI.getOperand(0).getReg();
376 int Slot;
377 if (FastTypes == StandardB)
378 Slot = getFastPredicateSlot(LLTToBId(MRI.getType(Reg)));
379 else
380 Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));
381
382 if (Slot != -1)
383 return MUI.isUniformAtDef(Reg) ? &Uni[Slot] : &Div[Slot];
384 }
385
386 // Slow search for more complex rules.
387 for (const RegBankLegalizeRule &Rule : Rules) {
388 if (Rule.Predicate.match(MI, MUI, MRI))
389 return &Rule.OperandMapping;
390 }
391
392 return nullptr;
393}
394
396 Rules.push_back(Rule);
397}
398
400 RegBankLLTMapping RuleApplyIDs) {
401 int Slot = getFastPredicateSlot(Ty);
402 assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
403 Div[Slot] = std::move(RuleApplyIDs);
404}
405
407 RegBankLLTMapping RuleApplyIDs) {
408 int Slot = getFastPredicateSlot(Ty);
409 assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
410 Uni[Slot] = std::move(RuleApplyIDs);
411}
412
413int SetOfRulesForOpcode::getFastPredicateSlot(
415 switch (FastTypes) {
416 case Standard: {
417 switch (Ty) {
418 case S32:
419 return 0;
420 case S16:
421 return 1;
422 case S64:
423 return 2;
424 case V2S16:
425 return 3;
426 default:
427 return -1;
428 }
429 }
430 case StandardB: {
431 switch (Ty) {
432 case B32:
433 return 0;
434 case B64:
435 return 1;
436 case B96:
437 return 2;
438 case B128:
439 return 3;
440 default:
441 return -1;
442 }
443 }
444 case Vector: {
445 switch (Ty) {
446 case S32:
447 return 0;
448 case V2S32:
449 return 1;
450 case V3S32:
451 return 2;
452 case V4S32:
453 return 3;
454 default:
455 return -1;
456 }
457 }
458 default:
459 return -1;
460 }
461}
462
463RegBankLegalizeRules::RuleSetInitializer
464RegBankLegalizeRules::addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
465 FastRulesTypes FastTypes) {
466 return RuleSetInitializer(OpcList, GRulesAlias, GRules, FastTypes);
467}
468
469RegBankLegalizeRules::RuleSetInitializer
470RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
471 FastRulesTypes FastTypes) {
472 return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes);
473}
474
477 unsigned Opc = MI.getOpcode();
478 if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT ||
479 Opc == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS ||
480 Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
481 unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
482 auto IRAIt = IRulesAlias.find(IntrID);
483 if (IRAIt == IRulesAlias.end())
484 return nullptr;
485 return &IRules.at(IRAIt->second);
486 }
487
488 auto GRAIt = GRulesAlias.find(Opc);
489 if (GRAIt == GRulesAlias.end())
490 return nullptr;
491 return &GRules.at(GRAIt->second);
492}
493
494// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'.
495class Predicate {
496private:
497 struct Elt {
498 // Save formula composed of Pred, '&&', '||' and '!' as a jump table.
499 // Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
500 // Sequences of && and || will be represented by jumps, for example:
501 // (A && B && ... X) or (A && B && ... X) || Y
502 // A == true jump to B
503 // A == false jump to end or Y, result is A(false) or Y
504 // (A || B || ... X) or (A || B || ... X) && Y
505 // A == true jump to end or Y, result is A(true) or Y
506 // A == false jump to B
507 // Notice that when negating expression, we simply flip Neg on each Pred
508 // and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
509 std::function<bool(const MachineInstr &)> Pred;
510 bool Neg; // Neg of Pred is calculated before jump
511 unsigned TJumpOffset;
512 unsigned FJumpOffset;
513 };
514
515 SmallVector<Elt, 8> Expression;
516
517 Predicate(SmallVectorImpl<Elt> &&Expr) { Expression.swap(Expr); };
518
519public:
520 Predicate(std::function<bool(const MachineInstr &)> Pred) {
521 Expression.push_back({Pred, false, 1, 1});
522 };
523
524 bool operator()(const MachineInstr &MI) const {
525 unsigned Idx = 0;
526 unsigned ResultIdx = Expression.size();
527 bool Result;
528 do {
529 Result = Expression[Idx].Pred(MI);
530 Result = Expression[Idx].Neg ? !Result : Result;
531 if (Result) {
532 Idx += Expression[Idx].TJumpOffset;
533 } else {
534 Idx += Expression[Idx].FJumpOffset;
535 }
536 } while ((Idx != ResultIdx));
537
538 return Result;
539 };
540
541 Predicate operator!() const {
542 SmallVector<Elt, 8> NegExpression;
543 for (const Elt &ExprElt : Expression) {
544 NegExpression.push_back({ExprElt.Pred, !ExprElt.Neg, ExprElt.FJumpOffset,
545 ExprElt.TJumpOffset});
546 }
547 return Predicate(std::move(NegExpression));
548 };
549
550 Predicate operator&&(const Predicate &RHS) const {
551 SmallVector<Elt, 8> AndExpression = Expression;
552
553 unsigned RHSSize = RHS.Expression.size();
554 unsigned ResultIdx = Expression.size();
555 for (unsigned i = 0; i < ResultIdx; ++i) {
556 // LHS results in false, whole expression results in false.
557 if (i + AndExpression[i].FJumpOffset == ResultIdx)
558 AndExpression[i].FJumpOffset += RHSSize;
559 }
560
561 AndExpression.append(RHS.Expression);
562
563 return Predicate(std::move(AndExpression));
564 }
565
566 Predicate operator||(const Predicate &RHS) const {
567 SmallVector<Elt, 8> OrExpression = Expression;
568
569 unsigned RHSSize = RHS.Expression.size();
570 unsigned ResultIdx = Expression.size();
571 for (unsigned i = 0; i < ResultIdx; ++i) {
572 // LHS results in true, whole expression results in true.
573 if (i + OrExpression[i].TJumpOffset == ResultIdx)
574 OrExpression[i].TJumpOffset += RHSSize;
575 }
576
577 OrExpression.append(RHS.Expression);
578
579 return Predicate(std::move(OrExpression));
580 }
581};
582
583// Initialize rules
586 : ST(&_ST), MRI(&_MRI) {
587
588 addRulesForGOpcs({G_ADD, G_SUB}, Standard)
589 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
590 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
591 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
592 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
594 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
595 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
596 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
597
598 addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
599 .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
600 .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});
601
602 addRulesForGOpcs({G_UADDE, G_USUBE, G_SADDE, G_SSUBE}, Standard)
604 .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
605
606 addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard)
607 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
608 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
609 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
610 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
612 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
613
614 bool HasVecMulU64 = ST->hasVMulU64Inst();
615 addRulesForGOpcs({G_MUL}, Standard)
616 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
617 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
618 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
619 .Uni(S64, {{SgprB64}, {SgprB64, SgprB64}})
621 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
622 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
623 .Div(S64, {{VgprB64}, {VgprB64, VgprB64}}, HasVecMulU64)
624 .Div(S64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32Mul}, !HasVecMulU64);
625
626 bool hasMulHi = ST->hasScalarMulHiInsts();
627 addRulesForGOpcs({G_UMULH, G_SMULH}, Standard)
628 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
629 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasMulHi)
630 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasMulHi);
631
632 addRulesForGOpcs({G_AMDGPU_MAD_U64_U32}, Standard)
633 .Div(S64, {{Vgpr64, Vcc}, {Vgpr32, Vgpr32, Vgpr64}})
635
636 bool HasScalarSMulU64 = ST->hasScalarSMulU64();
637 addRulesForGOpcs({G_AMDGPU_S_MUL_U64_U32, G_AMDGPU_S_MUL_I64_I32}, Standard)
638 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}, UniMul64}, HasScalarSMulU64)
639 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, DivSMulToMAD});
640
641 addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
643 .Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
644 .Any({{UniS16}, {{Sgpr16}, {Sgpr16, Sgpr16}}})
645 .Any({{DivS16}, {{Vgpr16}, {Vgpr16, Vgpr16}}})
646 .Uni(B32, {{SgprB32}, {SgprB32, SgprB32}})
647 .Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
648 .Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
649 .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
650
651 addRulesForGOpcs({G_SHL}, Standard)
652 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
653 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
655 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
656 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
657 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
658 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
659 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
660
661 addRulesForGOpcs({G_LSHR}, Standard)
662 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
663 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
665 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
666 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
667 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
668 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
669 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
670
671 addRulesForGOpcs({G_ASHR}, Standard)
672 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
673 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
675 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
676 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
677 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
678 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
679 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
680
681 addRulesForGOpcs({G_FSHR}, Standard)
682 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
683 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
684
685 addRulesForGOpcs({G_BSWAP}, Standard)
686 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
687 .Div(S16, {{Vgpr16}, {Vgpr16}})
688 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
689 .Div(S32, {{Vgpr32}, {Vgpr32}})
690 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
691 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}});
692
693 addRulesForGOpcs({G_AMDGPU_CVT_F32_UBYTE0, G_AMDGPU_CVT_F32_UBYTE1,
694 G_AMDGPU_CVT_F32_UBYTE2, G_AMDGPU_CVT_F32_UBYTE3,
695 G_AMDGPU_RCP_IFLAG},
696 Standard)
697 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
698 .Div(S32, {{Vgpr32}, {Vgpr32}});
699
700 addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
701
702 addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
703 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, S_BFE})
704 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
705 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE})
706 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE});
707
708 addRulesForGOpcs({G_SMIN, G_SMAX}, Standard)
709 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}})
710 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
711 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
712 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
714 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
715 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
716 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
717
718 addRulesForGOpcs({G_UMIN, G_UMAX}, Standard)
719 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
720 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
721 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
722 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
724 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
725 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
726 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
727
728 addRulesForGOpcs({G_IMPLICIT_DEF})
729 .Any({{UniS1}, {{Sgpr32Trunc}, {}}})
730 .Any({{UniS16}, {{Sgpr16}, {}}})
731 .Any({{UniBRC}, {{SgprBRC}, {}}});
732
733 addRulesForGOpcs({G_CONSTANT}, Standard)
734 .Any({{UniS1, _}, {{Sgpr32Trunc}, {}, UniCstExt}})
735 .Uni(S16, {{Sgpr16}, {}})
736 .Uni(S32, {{Sgpr32}, {}})
737 .Uni(S64, {{Sgpr64}, {}})
738 .Any({{UniPtr32, _}, {{SgprPtr32}, {}}})
739 .Any({{UniPtr64, _}, {{SgprPtr64}, {}}});
740
741 addRulesForGOpcs({G_FCONSTANT}, Standard)
742 .Uni(S16, {{Sgpr16}, {}})
743 .Uni(S32, {{Sgpr32}, {}})
744 .Uni(S64, {{Sgpr64}, {}});
745
746 addRulesForGOpcs({G_FREEZE})
747 .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt}}})
748 .Any({{DivS1}, {{Vcc}, {Vcc}}})
749 .Any({{UniS16}, {{Sgpr16}, {Sgpr16}}})
750 .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
751 .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});
752
753 addRulesForGOpcs({G_BITCAST})
754 .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
755 .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});
756
757 addRulesForGOpcs({G_UNMERGE_VALUES})
758 .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}})
759 .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
760 .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
761
762 addRulesForGOpcs({G_BUILD_VECTOR, G_MERGE_VALUES})
763 .Any({{UniBRC, S16}, {{}, {}, VerifyAllSgpr}})
764 .Any({{UniBRC, BRC}, {{}, {}, VerifyAllSgpr}})
765 .Any({{DivBRC, S16}, {{}, {}, ApplyAllVgpr}})
766 .Any({{DivBRC, BRC}, {{}, {}, ApplyAllVgpr}});
767
768 addRulesForGOpcs({G_CONCAT_VECTORS})
769 .Any({{UniBRC, BRC}, {{}, {}, VerifyAllSgpr}})
770 .Any({{DivBRC, BRC}, {{}, {}, ApplyAllVgpr}});
771
772 addRulesForGOpcs({G_PHI})
773 .Any({{UniS1}, {{}, {}, AextToS32InIncomingBlockGPHI}})
774 .Any({{UniS16}, {{}, {}, VerifyAllSgprGPHI}})
775 .Any({{UniBRC}, {{}, {}, VerifyAllSgprGPHI}})
776 .Any({{DivBRC}, {{}, {}, VerifyAllSgprOrVgprGPHI}});
777
778 addRulesForGOpcs({G_EXTRACT_VECTOR_ELT})
779 .Any({{UniB32, UniBRC, UniS32}, {{SgprB32}, {SgprBRC, Sgpr32}}})
780 .Any({{DivB32, DivBRC, UniS32}, {{VgprB32}, {VgprBRC, Sgpr32}}})
781 .Any({{DivB32, BRC, DivS32},
783 .Any({{UniB64, UniBRC, UniS32}, {{SgprB64}, {SgprBRC, Sgpr32}}})
784 .Any({{DivB64, DivBRC, UniS32},
786 .Any({{DivB64, BRC, DivS32},
788
789 addRulesForGOpcs({G_INSERT_VECTOR_ELT})
791 {{SgprBRC}, {SgprBRC, SgprB32, Sgpr32}}})
792 .Any(
793 {{DivBRC, BRC, B32, UniS32}, {{VgprBRC}, {VgprBRC, VgprB32, Sgpr32}}})
794 .Any({{DivBRC, BRC, B32, DivS32},
798 .Any({{DivBRC, BRC, B64, UniS32},
800 .Any({{DivBRC, BRC, B64, DivS32},
802
803 // INTERSECT_RAY {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
804 // INTERSECT_RAY {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
805 addRulesForGOpcs({G_AMDGPU_BVH_INTERSECT_RAY, G_AMDGPU_BVH_DUAL_INTERSECT_RAY,
806 G_AMDGPU_BVH8_INTERSECT_RAY})
807 .Any({{}, {{}, {}, ApplyBVH_INTERSECT_RAY}});
808
809 // LOAD {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
810 // LOAD {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
811 // LOAD_NORET {}, {{}, {Imm, VgprSrc, ..., Sgpr_WF_RsrcIdx}}
812 // STORE {}, {{}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
813 addRulesForGOpcs({G_AMDGPU_INTRIN_IMAGE_LOAD, G_AMDGPU_INTRIN_IMAGE_LOAD_D16,
814 G_AMDGPU_INTRIN_IMAGE_LOAD_NORET,
815 G_AMDGPU_INTRIN_IMAGE_STORE,
816 G_AMDGPU_INTRIN_IMAGE_STORE_D16})
817 .Any({{}, {{}, {}, ApplyINTRIN_IMAGE}});
818
819 Predicate isSignedICmp([](const MachineInstr &MI) -> bool {
820 auto Pred =
821 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
822 return CmpInst::isSigned(Pred);
823 });
824
825 Predicate isEqualityICmp([](const MachineInstr &MI) -> bool {
826 auto Pred =
827 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
828 return ICmpInst::isEquality(Pred);
829 });
830
831 bool HasScalarCompareEq64 = ST->hasScalarCompareEq64();
832 // clang-format off
833 addRulesForGOpcs({G_ICMP})
834 .Any({{{UniS1, _, S16}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
835 .Any({{{UniS1, _, S16}, !isEqualityICmp && isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32SExt, Sgpr32SExt}}})
836 .Any({{{UniS1, _, S16}, !isEqualityICmp && !isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
837 .Any({{{DivS1, _, S16}}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
838 .Any({{{UniS1, _, S32}}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
839 .Any({{{DivS1, _, S32}}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
840 .Any({{{UniS1, _, S64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr64, Sgpr64}}}, HasScalarCompareEq64)
841 .Any({{{UniS1, _, S64}, isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}, !HasScalarCompareEq64)
842 .Any({{{UniS1, _, S64}, !isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
843 .Any({{{DivS1, _, S64}}, {{Vcc}, {None, Vgpr64, Vgpr64}}})
844 .Any({{{UniS1, _, Ptr32}}, {{Sgpr32Trunc}, {None, SgprPtr32, SgprPtr32}}})
845 .Any({{{DivS1, _, Ptr32}}, {{Vcc}, {None, VgprPtr32, VgprPtr32}}})
846 .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, SgprPtr64, SgprPtr64}}}, HasScalarCompareEq64)
847 .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}, !HasScalarCompareEq64)
848 .Any({{{UniS1, _, Ptr64}, !isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}})
849 .Any({{{DivS1, _, Ptr64}}, {{Vcc}, {None, VgprPtr64, VgprPtr64}}});
850 // clang-format on
851
852 addRulesForGOpcs({G_BRCOND})
853 .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}})
854 .Any({{DivS1}, {{}, {Vcc}}});
855
856 addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}});
857
858 addRulesForGOpcs({G_SELECT}, StandardB)
859 .Any({{DivS16}, {{Vgpr16}, {Vcc, Vgpr16, Vgpr16}}})
861 .Div(B32, {{VgprB32}, {Vcc, VgprB32, VgprB32}})
865
866 addRulesForGOpcs({G_ANYEXT})
867 .Any({{UniS16, S1}, {{None}, {None}}}) // should be combined away
868 .Any({{UniS32, S1}, {{None}, {None}}}) // should be combined away
869 .Any({{UniS64, S1}, {{None}, {None}}}) // should be combined away
870 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
871 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
872 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
873 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
874 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
875 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
876 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
877
878 bool Has16bitCmp = ST->has16BitInsts();
879
880 // In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY.
881 // It is up to user to deal with truncated bits.
882 // S1, S16, S32 and S64 results are handled with specific rules. Remaining
883 // (result, source) pairs with valid register classes are covered by the
884 // generic UniBRC/DivBRC wildcard rules.
885 addRulesForGOpcs({G_TRUNC})
886 .Any({{UniS1, UniS16}, {{None}, {None}}}) // should be combined away
887 .Any({{UniS1, UniS32}, {{None}, {None}}}) // should be combined away
888 .Any({{UniS1, UniS64}, {{None}, {None}}}) // should be combined away
889 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}})
890 .Any({{UniBRC, UniBRC}, {{SgprBRC}, {SgprBRC}}})
891 .Any({{DivBRC, DivBRC}, {{VgprBRC}, {VgprBRC}}})
892 .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
893 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
894 // This is non-trivial. VgprToVccCopy is done using compare instruction.
895 .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}, Has16bitCmp)
897 !Has16bitCmp)
898 .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
899 .Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}});
900
901 addRulesForGOpcs({G_ZEXT})
905 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
906 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
907 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
908 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
909 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
910 // not extending S16 to S32 is questionable.
911 .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32ZExt}, Ext32To64}})
912 .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32ZExt}, Ext32To64}})
913 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
914 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
915
916 addRulesForGOpcs({G_SEXT})
920 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
921 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
922 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
923 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
924 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
925 // not extending S16 to S32 is questionable.
926 .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32SExt}, Ext32To64}})
927 .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32SExt}, Ext32To64}})
928 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
929 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
930
931 addRulesForGOpcs({G_SEXT_INREG})
932 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
933 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
934 .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
936
937 addRulesForGOpcs({G_ASSERT_ZEXT, G_ASSERT_SEXT}, Standard)
938 .Uni(S32, {{Sgpr32}, {Sgpr32, Imm}})
939 .Div(S32, {{Vgpr32}, {Vgpr32, Imm}})
940 .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}})
941 .Div(S64, {{Vgpr64}, {Vgpr64, Imm}});
942
943 addRulesForGOpcs({G_ASSERT_ALIGN}, Standard)
944 .Uni(S32, {{Sgpr32}, {Sgpr32}})
945 .Div(S32, {{Vgpr32}, {Vgpr32}})
946 .Uni(S64, {{Sgpr64}, {Sgpr64}})
947 .Div(S64, {{Vgpr64}, {Vgpr64}})
948 .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32}}})
949 .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32}}})
950 .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64}}})
951 .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64}}});
952
953 // Atomic read-modify-write operations: result and value are always VGPR,
954 // pointer varies by address space.
955 addRulesForGOpcs({G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_XCHG,
956 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
957 G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN,
958 G_ATOMICRMW_UMAX, G_ATOMICRMW_UINC_WRAP,
959 G_ATOMICRMW_UDEC_WRAP, G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
960 .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
961 .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
962 .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
963 .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
964 .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
965 .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}});
966
967 addRulesForGOpcs({G_ATOMICRMW_USUB_SAT, G_ATOMICRMW_USUB_COND})
968 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, Vgpr32}}})
969 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, Vgpr32}}})
970 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32}}});
971
972 bool HasAtomicFlatPkAdd16Insts = ST->hasAtomicFlatPkAdd16Insts();
973 bool HasAtomicBufferGlobalPkAddF16Insts =
974 ST->hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
975 ST->hasAtomicBufferGlobalPkAddF16Insts();
976 bool HasAtomicDsPkAdd16Insts = ST->hasAtomicDsPkAdd16Insts();
977 addRulesForGOpcs({G_ATOMICRMW_FADD})
978 .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
979 .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
980 .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
981 .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
982 .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
983 .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}})
984 .Any({{DivV2S16, P0, V2S16}, {{VgprV2S16}, {VgprP0, VgprV2S16}}},
985 HasAtomicFlatPkAdd16Insts)
986 .Any({{DivV2S16, P1, V2S16}, {{VgprV2S16}, {VgprP1, VgprV2S16}}},
987 HasAtomicBufferGlobalPkAddF16Insts)
988 .Any({{DivV2S16, P3, V2S16}, {{VgprV2S16}, {VgprP3, VgprV2S16}}},
989 HasAtomicDsPkAdd16Insts);
990
991 addRulesForGOpcs({G_ATOMIC_CMPXCHG})
992 .Any({{DivS32, P2}, {{Vgpr32}, {VgprP2, Vgpr32, Vgpr32}}})
993 .Any({{DivS64, P2}, {{Vgpr64}, {VgprP2, Vgpr64, Vgpr64}}})
994 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32, Vgpr32}}})
995 .Any({{DivS64, P3}, {{Vgpr64}, {VgprP3, Vgpr64, Vgpr64}}});
996
997 addRulesForGOpcs({G_AMDGPU_ATOMIC_CMPXCHG})
998 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, VgprV2S32}}})
999 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, VgprV2S32}}})
1000 .Any({{DivS64, P0}, {{Vgpr64}, {VgprP0, VgprV2S64}}})
1001 .Any({{DivS64, P1}, {{Vgpr64}, {VgprP1, VgprV2S64}}});
1002
1003 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_CMPSWAP}, Standard)
1004 .Div(S32, {{Vgpr32},
1006 .Div(S64, {{Vgpr64},
1008
1009 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_ADD, G_AMDGPU_BUFFER_ATOMIC_AND,
1010 G_AMDGPU_BUFFER_ATOMIC_DEC, G_AMDGPU_BUFFER_ATOMIC_FMAX,
1011 G_AMDGPU_BUFFER_ATOMIC_FMIN, G_AMDGPU_BUFFER_ATOMIC_INC,
1012 G_AMDGPU_BUFFER_ATOMIC_OR, G_AMDGPU_BUFFER_ATOMIC_SMAX,
1013 G_AMDGPU_BUFFER_ATOMIC_SMIN, G_AMDGPU_BUFFER_ATOMIC_SUB,
1014 G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX,
1015 G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_XOR},
1016 Standard)
1019
1020 bool hasSMRDx3 = ST->hasScalarDwordx3Loads();
1021 bool hasSMRDSmall = ST->hasScalarSubwordLoads();
1022 bool usesTrue16 = ST->useRealTrue16Insts();
1023
1024 Predicate isAlign16([](const MachineInstr &MI) -> bool {
1025 return (*MI.memoperands_begin())->getAlign() >= Align(16);
1026 });
1027
1028 Predicate isAlign4([](const MachineInstr &MI) -> bool {
1029 return (*MI.memoperands_begin())->getAlign() >= Align(4);
1030 });
1031
1032 Predicate isAtomicMMO([](const MachineInstr &MI) -> bool {
1033 return (*MI.memoperands_begin())->isAtomic();
1034 });
1035
1036 Predicate isUniMMO([](const MachineInstr &MI) -> bool {
1037 return AMDGPU::isUniformMMO(*MI.memoperands_begin());
1038 });
1039
1040 Predicate isConst([](const MachineInstr &MI) -> bool {
1041 // Address space in MMO can be different than address space on pointer.
1042 const MachineMemOperand *MMO = *MI.memoperands_begin();
1043 const unsigned AS = MMO->getAddrSpace();
1044 return AS == AMDGPUAS::CONSTANT_ADDRESS ||
1046 });
1047
1048 Predicate isVolatileMMO([](const MachineInstr &MI) -> bool {
1049 return (*MI.memoperands_begin())->isVolatile();
1050 });
1051
1052 Predicate isInvMMO([](const MachineInstr &MI) -> bool {
1053 return (*MI.memoperands_begin())->isInvariant();
1054 });
1055
1056 Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool {
1057 return (*MI.memoperands_begin())->getFlags() & MONoClobber;
1058 });
1059
1060 Predicate isNaturalAligned([](const MachineInstr &MI) -> bool {
1061 const MachineMemOperand *MMO = *MI.memoperands_begin();
1062 return MMO->getAlign() >= Align(MMO->getSize().getValue());
1063 });
1064
1065 Predicate is8Or16BitMMO([](const MachineInstr &MI) -> bool {
1066 const MachineMemOperand *MMO = *MI.memoperands_begin();
1067 const unsigned MemSize = 8 * MMO->getSize().getValue();
1068 return MemSize == 16 || MemSize == 8;
1069 });
1070
1071 Predicate is32BitMMO([](const MachineInstr &MI) -> bool {
1072 const MachineMemOperand *MMO = *MI.memoperands_begin();
1073 return 8 * MMO->getSize().getValue() == 32;
1074 });
1075
1076 auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
1077 (isConst || isInvMMO || isNoClobberMMO);
1078
1079 // clang-format off
1080 // TODO: S32Dst, 16-bit any-extending load should not appear on True16 targets
1081 addRulesForGOpcs({G_LOAD})
1082 // flat, addrspace(0), never uniform - flat_load
1083 .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
1084 .Any({{DivB32, P0}, {{VgprB32}, {VgprP0}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1085 .Any({{DivB64, P0}, {{VgprB64}, {VgprP0}}})
1086 .Any({{DivB96, P0}, {{VgprB96}, {VgprP0}}})
1087 .Any({{DivB128, P0}, {{VgprB128}, {VgprP0}}})
1088
1089 // global, addrspace(1)
1090 // divergent - global_load
1091 .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
1092 .Any({{DivB32, P1}, {{VgprB32}, {VgprP1}}}) //32-bit load, 8-bit and 16-bit any-extending load
1093 .Any({{DivB64, P1}, {{VgprB64}, {VgprP1}}})
1094 .Any({{DivB96, P1}, {{VgprB96}, {VgprP1}}})
1095 .Any({{DivB128, P1}, {{VgprB128}, {VgprP1}}})
1096 .Any({{DivB256, P1}, {{VgprB256}, {VgprP1}, SplitLoad}})
1097 .Any({{DivB512, P1}, {{VgprB512}, {VgprP1}, SplitLoad}})
1098
1099 // uniform - s_load
1100 .Any({{{UniS16, P1}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
1101 .Any({{{UniS16, P1}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP1}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
1102 .Any({{{UniB32, P1}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1103 // TODO: SplitLoad when !isNaturalAligned && isUL and target hasSMRDSmall
1104 .Any({{{UniB32, P1}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
1105 .Any({{{UniB32, P1}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}}}) //32-bit load
1106 .Any({{{UniB64, P1}, isAlign4 && isUL}, {{SgprB64}, {SgprP1}}})
1107 .Any({{{UniB96, P1}, isAlign16 && isUL}, {{SgprB96}, {SgprP1}, WidenLoad}}, !hasSMRDx3)
1108 .Any({{{UniB96, P1}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP1}, SplitLoad}}, !hasSMRDx3)
1109 .Any({{{UniB96, P1}, isAlign4 && isUL}, {{SgprB96}, {SgprP1}}}, hasSMRDx3)
1110 .Any({{{UniB128, P1}, isAlign4 && isUL}, {{SgprB128}, {SgprP1}}})
1111 .Any({{{UniB256, P1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
1112 .Any({{{UniB512, P1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
1113
1114 // Uniform via global or buffer load, for example volatile or non-aligned
1115 // uniform load. Not using standard {{UniInVgprTy}, {VgprP1}} since it is
1116 // selected as global_load, use SgprP1 for pointer instead to match
1117 // patterns without flat-for-global, default for GFX7 and older.
1118 // -> +flat-for-global + {{UniInVgprTy}, {SgprP1}} - global_load
1119 // -> -flat-for-global + {{UniInVgprTy}, {SgprP1}} - buffer_load
1120 .Any({{{UniS16, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
1121 .Any({{{UniS16, P1}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && !hasSMRDSmall) // s16 load
1122 .Any({{{UniB32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1123 .Any({{{UniB32, P1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1124 .Any({{{UniB64, P1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
1125 .Any({{{UniB96, P1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
1126 .Any({{{UniB128, P1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
1127 .Any({{{UniB256, P1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP1}, SplitLoad}})
1128 .Any({{{UniB512, P1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP1}, SplitLoad}})
1129
1130 // local, addrspace(3) - ds_load
1131 .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
1132 .Any({{DivB32, P3}, {{VgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1133 .Any({{DivB64, P3}, {{VgprB64}, {VgprP3}}})
1134 .Any({{DivB96, P3}, {{VgprB96}, {VgprP3}}})
1135 .Any({{DivB128, P3}, {{VgprB128}, {VgprP3}}})
1136
1137 .Any({{UniS16, P3}, {{UniInVgprS16}, {SgprP3}}}, usesTrue16) // 16-bit load
1138 .Any({{UniB32, P3}, {{UniInVgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1139 .Any({{UniB64, P3}, {{UniInVgprB64}, {VgprP3}}})
1140 .Any({{UniB96, P3}, {{UniInVgprB96}, {VgprP3}}})
1141 .Any({{UniB128, P3}, {{UniInVgprB128}, {VgprP3}}})
1142
1143 // constant, addrspace(4)
1144 // divergent - global_load
1145 .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
1146 .Any({{DivB32, P4}, {{VgprB32}, {VgprP4}}}) //32-bit load, 8-bit and 16-bit any-extending load
1147 .Any({{DivB64, P4}, {{VgprB64}, {VgprP4}}})
1148 .Any({{DivB96, P4}, {{VgprB96}, {VgprP4}}})
1149 .Any({{DivB128, P4}, {{VgprB128}, {VgprP4}}})
1150 .Any({{DivB256, P4}, {{VgprB256}, {VgprP4}, SplitLoad}})
1151 .Any({{DivB512, P4}, {{VgprB512}, {VgprP4}, SplitLoad}})
1152
1153 // uniform - s_load
1154 .Any({{{UniS16, P4}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
1155 .Any({{{UniS16, P4}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP4}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
1156 .Any({{{UniB32, P4}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1157 .Any({{{UniB32, P4}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
1158 .Any({{{UniB32, P4}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}}}) //32-bit load
1159 .Any({{{UniB64, P4}, isAlign4 && isUL}, {{SgprB64}, {SgprP4}}})
1160 .Any({{{UniB96, P4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasSMRDx3)
1161 .Any({{{UniB96, P4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasSMRDx3)
1162 .Any({{{UniB96, P4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasSMRDx3)
1163 .Any({{{UniB128, P4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
1164 .Any({{{UniB256, P4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
1165 .Any({{{UniB512, P4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
1166
1167 // uniform in vgpr - global_load or buffer_load
1168 .Any({{{UniS16, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
1169 .Any({{{UniS16, P4}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && !hasSMRDSmall) // s16 load
1170 .Any({{{UniB32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1171 .Any({{{UniB32, P4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP4}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1172 .Any({{{UniB64, P4}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP4}}})
1173 .Any({{{UniB96, P4}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP4}}})
1174 .Any({{{UniB128, P4}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP4}}})
1175 .Any({{{UniB256, P4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP4}, SplitLoad}})
1176 .Any({{{UniB512, P4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP4}, SplitLoad}})
1177
1178 // private, addrspace(5), never uniform - scratch_load
1179 .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16)
1180 .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1181 .Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}})
1182 .Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}})
1183 .Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}})
1184
1185 .Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}});
1186
1187
1188 addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zeroextending loads
1189 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}})
1190 .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
1191
1192 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}})
1193 .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
1194 .Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall)
1195 .Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall)
1196 .Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall)
1197 .Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall)
1198
1199 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}})
1200 .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
1201 .Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}})
1202
1203 .Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}})
1204 .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
1205 .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall)
1206 .Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall)
1207 .Any({{{UniS32, P4}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall)
1208 .Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall)
1209
1210 .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}})
1211 .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16);
1212
1213 addRulesForGOpcs({G_STORE})
1214 // addrspace(0)
1215 .Any({{S16, P0}, {{}, {Vgpr16, VgprP0}}}, usesTrue16) // 16-bit store
1216 .Any({{B32, P0}, {{}, {VgprB32, VgprP0}}}) // 32-bit store, 8-bit and 16-bit truncating store
1217 .Any({{B64, P0}, {{}, {VgprB64, VgprP0}}})
1218 .Any({{B96, P0}, {{}, {VgprB96, VgprP0}}})
1219 .Any({{B128, P0}, {{}, {VgprB128, VgprP0}}})
1220
1221 // addrspace(1), there are no stores to addrspace(4)
1222 // For targets:
1223 // - with "+flat-for-global" - global_store
1224 // - without(-flat-for-global) - buffer_store addr64
1225 .Any({{S16, DivP1}, {{}, {Vgpr16, VgprP1}}}, usesTrue16) // 16-bit store
1226 .Any({{B32, DivP1}, {{}, {VgprB32, VgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
1227 .Any({{B64, DivP1}, {{}, {VgprB64, VgprP1}}})
1228 .Any({{B96, DivP1}, {{}, {VgprB96, VgprP1}}})
1229 .Any({{B128, DivP1}, {{}, {VgprB128, VgprP1}}})
1230
1231 // For UniP1, use sgpr ptr to match flat-for-global patterns. Targets:
1232 // - with "+flat-for-global" - global_store for both sgpr and vgpr ptr
1233 // - without(-flat-for-global) - need sgpr ptr to select buffer_store
1234 .Any({{S16, UniP1}, {{}, {Vgpr16, SgprP1}}}, usesTrue16) // 16-bit store
1235 .Any({{B32, UniP1}, {{}, {VgprB32, SgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
1236 .Any({{B64, UniP1}, {{}, {VgprB64, SgprP1}}})
1237 .Any({{B96, UniP1}, {{}, {VgprB96, SgprP1}}})
1238 .Any({{B128, UniP1}, {{}, {VgprB128, SgprP1}}})
1239
1240 // addrspace(3) and addrspace(5)
1241 .Any({{S16, Ptr32}, {{}, {Vgpr16, VgprPtr32}}}, usesTrue16) // 16-bit store
1242 .Any({{B32, Ptr32}, {{}, {VgprB32, VgprPtr32}}}) // 32-bit store, 8-bit and 16-bit truncating store
1243 .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}})
1244 .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}})
1245 .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}});
1246
1247 // clang-format on
1248
1249 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT,
1250 G_AMDGPU_TBUFFER_LOAD_FORMAT},
1251 StandardB)
1260
1261 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_USHORT, G_AMDGPU_BUFFER_LOAD_UBYTE,
1262 G_AMDGPU_BUFFER_LOAD_SSHORT, G_AMDGPU_BUFFER_LOAD_SBYTE},
1263 StandardB)
1266
1267 addRulesForGOpcs(
1268 {G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, G_AMDGPU_BUFFER_LOAD_USHORT_TFE},
1269 StandardB)
1272
1273 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_TFE, G_AMDGPU_BUFFER_LOAD_FORMAT_TFE},
1274 StandardB)
1282 .Any({{UniB160},
1284
1285 addRulesForGOpcs(
1286 {G_AMDGPU_BUFFER_LOAD_FORMAT_D16, G_AMDGPU_TBUFFER_LOAD_FORMAT_D16},
1287 StandardB)
1294
1295 addRulesForGOpcs({G_AMDGPU_S_BUFFER_LOAD})
1296 // waterfall expansion is part of S_BUF_to_BUF
1297 .Any({{UniB32}, {{SgprB32}, {SgprV4S32, Sgpr32}}})
1298 .Any({{DivB32, UniV4S32, DivB32},
1300 .Any({{DivB32, DivV4S32, UniB32},
1302 .Any({{DivB32, DivV4S32, DivB32},
1304
1305 .Any({{UniB64}, {{SgprB64}, {SgprV4S32, Sgpr32}}})
1306 .Any({{DivB64, UniV4S32, DivB32},
1308 .Any({{DivB64, DivV4S32, UniB32},
1310 .Any({{DivB64, DivV4S32, DivB32},
1312
1313 .Any({{UniB96}, {{SgprB96}, {SgprV4S32, Sgpr32}}})
1314 .Any({{DivB96, UniV4S32, DivB32},
1316 .Any({{DivB96, DivV4S32, UniB32},
1318 .Any({{DivB96, DivV4S32, DivB32},
1320
1321 .Any({{UniB128}, {{SgprB128}, {SgprV4S32, Sgpr32}}})
1322 .Any({{DivB128, UniV4S32, DivB32},
1324 .Any({{DivB128, DivV4S32, UniB32},
1326 .Any({{DivB128, DivV4S32, DivB32},
1328
1329 .Any({{UniB256}, {{SgprB256}, {SgprV4S32, Sgpr32}}})
1330 .Any({{DivB256, UniV4S32, DivB32},
1332 .Any({{DivB256, DivV4S32, UniB32},
1334 .Any({{DivB256, DivV4S32, DivB32},
1336
1337 .Any({{UniB512}, {{SgprB512}, {SgprV4S32, Sgpr32}}})
1338 .Any({{DivB512, UniV4S32, DivB32},
1340 .Any({{DivB512, DivV4S32, UniB32},
1342 .Any({{DivB512, DivV4S32, DivB32},
1344
1345 addRulesForGOpcs({G_AMDGPU_S_BUFFER_LOAD_SBYTE, G_AMDGPU_S_BUFFER_LOAD_UBYTE,
1346 G_AMDGPU_S_BUFFER_LOAD_SSHORT,
1347 G_AMDGPU_S_BUFFER_LOAD_USHORT})
1349 .Any({{DivS32, UniV4S32, DivS32},
1351 .Any({{DivS32, DivV4S32, UniS32},
1353 .Any({{DivS32, DivV4S32, DivS32},
1355
1356 addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE,
1357 G_AMDGPU_BUFFER_STORE_SHORT, G_AMDGPU_BUFFER_STORE_FORMAT,
1358 G_AMDGPU_BUFFER_STORE_FORMAT_D16,
1359 G_AMDGPU_TBUFFER_STORE_FORMAT,
1360 G_AMDGPU_TBUFFER_STORE_FORMAT_D16})
1361 .Any({{B32}, {{}, {VgprB32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1362 .Any({{B64}, {{}, {VgprB64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1363 .Any({{B96}, {{}, {VgprB96, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1364 .Any({{B128}, {{}, {VgprB128, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});
1365
1366 // Buffer atomics: resource descriptor + scalar offset are SGPR, data and
1367 // address components are VGPR.
1368 //
1369 // Operand order (SIInstructions.td BufferAtomicGenericInstruction):
1370 // dst = op vdata, rsrc, vindex, voffset, soffset, offset_imm, cachepolicy,
1371 // idxen_imm
1372 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_FADD})
1373 .Any({{S32, S32, V4S32, S32, S32, S32},
1375 .Any({{S64, S64, V4S32, S32, S32, S32},
1377 .Any({{V2S16, V2S16, V4S32, S32, S32, S32},
1378 {{VgprV2S16},
1380
1381 addRulesForGOpcs({G_PTR_ADD})
1382 .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
1383 .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}})
1384 .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}})
1385 .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}});
1386
1387 addRulesForGOpcs({G_INTTOPTR})
1388 .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
1389 .Any({{DivPtr32}, {{VgprPtr32}, {Vgpr32}}})
1390 .Any({{UniPtr64}, {{SgprPtr64}, {Sgpr64}}})
1391 .Any({{DivPtr64}, {{VgprPtr64}, {Vgpr64}}})
1392 .Any({{UniPtr128}, {{SgprPtr128}, {Sgpr128}}})
1393 .Any({{DivPtr128}, {{VgprPtr128}, {Vgpr128}}});
1394
1395 addRulesForGOpcs({G_PTRTOINT})
1396 .Any({{UniS32}, {{Sgpr32}, {SgprPtr32}}})
1397 .Any({{DivS32}, {{Vgpr32}, {VgprPtr32}}})
1398 .Any({{UniS64}, {{Sgpr64}, {SgprPtr64}}})
1399 .Any({{DivS64}, {{Vgpr64}, {VgprPtr64}}})
1400 .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}})
1401 .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}});
1402
1403 // FIXME: Update llvm/test/CodeGen/AMDGPU/ptrmask.ll to use GlobalISel.
1404 // Currently crashes on P8 (buffer resource) tests due to legalizer issue.
1405 addRulesForGOpcs({G_PTRMASK})
1406 .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
1407 .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
1408 .Any({{UniP3}, {{SgprP3}, {SgprP3, Sgpr32}}})
1409 .Any({{DivP3}, {{VgprP3}, {VgprP3, Vgpr32}}});
1410
1411 addRulesForGOpcs({G_DYN_STACKALLOC})
1412 .Any({{UniP5, UniS32}, {{SgprP5}, {Sgpr32}, DynStackAlloc}})
1413 .Any({{UniP5, DivS32}, {{SgprP5}, {Vgpr32}, DynStackAlloc}});
1414
1415 addRulesForGOpcs({G_ABS}, Standard)
1416 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}})
1417 .Div(S16, {{Vgpr16}, {Vgpr16}, AbsToNegMax})
1418 .Uni(S32, {{Sgpr32}, {Sgpr32}})
1419 .Div(S32, {{Vgpr32}, {Vgpr32}, AbsToNegMax})
1420 .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, AbsToS32})
1421 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}, AbsToNegMax});
1422
1423 addRulesForGOpcs({G_BITREVERSE}, Standard)
1424 .Uni(S32, {{Sgpr32}, {Sgpr32}})
1425 .Div(S32, {{Vgpr32}, {Vgpr32}})
1426 .Uni(S64, {{Sgpr64}, {Sgpr64}})
1427 .Div(S64, {{Vgpr64}, {Vgpr64}});
1428
1429 addRulesForGOpcs({G_AMDGPU_FFBH_U32, G_AMDGPU_FFBL_B32, G_CTLZ_ZERO_POISON,
1430 G_CTTZ_ZERO_POISON})
1431 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
1432 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1433 .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
1435
1436 addRulesForGOpcs({G_CTPOP})
1437 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
1438 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1439 .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
1440 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}, CtPop64To32}});
1441
1442 addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});
1443
1444 addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
1445 .Uni(S64, {{Sgpr64}, {}});
1446
1447 addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}});
1448
1449 addRulesForGOpcs({G_GLOBAL_VALUE})
1450 .Any({{UniP0}, {{SgprP0}, {}}})
1451 .Any({{UniP1}, {{SgprP1}, {}}})
1452 .Any({{UniP3}, {{SgprP3}, {}}})
1453 .Any({{UniP4}, {{SgprP4}, {}}})
1454 .Any({{UniP8}, {{SgprP8}, {}}});
1455
1456 addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}});
1457
1458 addRulesForGOpcs({G_AMDGPU_SPONENTRY}, Standard).Uni(S32, {{Sgpr32}, {}});
1459
1460 addRulesForGOpcs({G_SI_CALL})
1461 .Any({{_, UniP0}, {{None}, {SgprP0}}})
1462 .Any({{_, DivP0}, {{None}, {SgprP0Call_WF}}})
1463 .Any({{_, UniP4}, {{None}, {SgprP4}}})
1464 .Any({{_, DivP4}, {{None}, {SgprP4Call_WF}}});
1465
1466 bool hasSALUFloat = ST->hasSALUFloatInsts();
1467
1468 addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard)
1469 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1470 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1471 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1472 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1473 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
1474 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1475 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1476 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1477 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
1479 hasSALUFloat)
1480 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
1484 .Any({{DivV2S64}, {{VgprV2S64}, {VgprV2S64, VgprV2S64}}});
1485
1486 addRulesForGOpcs({G_FSUB, G_STRICT_FSUB}, Standard)
1487 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1488 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1489 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1490 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1491 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1492 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1493
1494 addRulesForGOpcs({G_FMAD}, Standard)
1495 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
1496 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1497 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1498 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1499
1500 addRulesForGOpcs({G_FLDEXP, G_STRICT_FLDEXP}, Standard)
1501 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1502 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1503 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
1504 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1505 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr32}})
1506 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
1507
1508 addRulesForGOpcs({G_FMA, G_STRICT_FMA}, Standard)
1509 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1510 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
1511 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}})
1512 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}})
1516 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat)
1517 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat)
1518 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat)
1519 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat)
1520 .Uni(V2S16,
1522 hasSALUFloat)
1524 !hasSALUFloat)
1527
1528 addRulesForGOpcs({G_AMDGPU_FMED3}, Standard)
1529 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
1530 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1531 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1532 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1533
1534 // TODO: This opcode is generated from the i64->i16 signed clamped pattern in
1535 // the PreLegalizerCombiner. Move the combine to RegBankCombiner to keep more
1536 // instructions on SALU.
1537 addRulesForGOpcs({G_AMDGPU_SMED3}, Standard)
1538 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1539 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1540
1541 // FNEG and FABS are either folded as source modifiers or can be selected as
1542 // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for
1543 // targets without SALU float we still select them as VGPR since there would
1544 // be no real sgpr use.
1545 addRulesForGOpcs({G_FNEG, G_FABS}, Standard)
1546 .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat)
1547 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
1548 .Div(S16, {{Vgpr16}, {Vgpr16}})
1549 .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat)
1550 .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
1551 .Div(S32, {{Vgpr32}, {Vgpr32}})
1552 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1553 .Div(S64, {{Vgpr64}, {Vgpr64}})
1554 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat)
1555 .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat)
1556 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
1557 .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
1558 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
1559
1560 addRulesForGOpcs({G_FCANONICALIZE}, Standard)
1561 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1562 .Div(S32, {{Vgpr32}, {Vgpr32}})
1563 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1564 .Div(S16, {{Vgpr16}, {Vgpr16}})
1565 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1566 .Div(S64, {{Vgpr64}, {Vgpr64}})
1567 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
1568 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
1569 .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
1570 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}})
1571 .Any({{UniV2S64}, {{UniInVgprV2S64}, {VgprV2S64}}})
1572 .Any({{DivV2S64}, {{VgprV2S64}, {VgprV2S64}}});
1573
1574 bool hasPST = ST->hasPseudoScalarTrans();
1575 addRulesForGOpcs({G_FSQRT}, Standard)
1576 .Div(S16, {{Vgpr16}, {Vgpr16}})
1577 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasPST)
1578 .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasPST);
1579
1580 addRulesForGOpcs({G_FPTOUI, G_FPTOSI, G_FPTOUI_SAT, G_FPTOSI_SAT})
1581 .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
1582 .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
1583 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1584 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat)
1585 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1586 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
1587 .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
1588 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1589 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1590 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}});
1591
1592 addRulesForGOpcs({G_UITOFP, G_SITOFP})
1593 .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
1594 .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
1595 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1596 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
1597 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1598 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
1599 .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
1600 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1601 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1602 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}});
1603
1604 addRulesForGOpcs({G_AMDGPU_S_BUFFER_PREFETCH})
1606
1607 Predicate IsDataPF([](const MachineInstr &MI) -> bool {
1608 // prefetch cache type: 0 == instruction (I$) prefetch, 1 == data prefetch.
1609 return MI.getOperand(3).getImm() != 0;
1610 });
1611
1612 bool HasSMemPF = ST->hasSafeSmemPrefetch();
1613 bool HasVMemPF = ST->hasVmemPrefInsts();
1614 addRulesForGOpcs({G_PREFETCH})
1615 // Safe smem prefetch keeps both data and instruction prefetch.
1616 .Any({{UniPtr64}, {{}, {SgprPtr64}}}, HasSMemPF)
1617 // Vmem prefetch keeps data prefetch only.
1618 .Any({{{UniPtr64}, IsDataPF}, {{}, {SgprPtr64}}}, !HasSMemPF && HasVMemPF)
1619 .Any({{{UniPtr64}, IsDataPF}, {{}, {}, DeletePrefetch}},
1620 !HasSMemPF && !HasVMemPF)
1621 .Any({{{UniPtr64}, !IsDataPF}, {{}, {}, DeletePrefetch}}, !HasSMemPF)
1622
1623 .Any({{{DivPtr64}, IsDataPF}, {{}, {VgprPtr64}}}, HasVMemPF)
1624 .Any({{{DivPtr64}, IsDataPF}, {{}, {}, DeletePrefetch}}, !HasVMemPF)
1625 .Any({{{DivPtr64}, !IsDataPF}, {{}, {}, DeletePrefetch}})
1626
1627 .Any({{P3}, {{}, {}, DeletePrefetch}})
1628 .Any({{P5}, {{}, {}, DeletePrefetch}})
1629 .Any({{UniP6}, {{}, {SgprP6}}}, HasSMemPF)
1630 .Any({{UniP6}, {{}, {}, DeletePrefetch}}, !HasSMemPF);
1631
1632 addRulesForGOpcs({G_FPEXT})
1633 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1634 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1635 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}})
1636 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1637 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat);
1638
1639 addRulesForGOpcs({G_AMDGPU_CVT_PK_I16_I32}, Standard)
1640 .Uni(V2S16, {{UniInVgprV2S16}, {Vgpr32, Vgpr32}})
1641 .Div(V2S16, {{VgprV2S16}, {Vgpr32, Vgpr32}});
1642
1643 addRulesForGOpcs({G_AMDGPU_FMIN_LEGACY, G_AMDGPU_FMAX_LEGACY}, Standard)
1644 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1645 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
1646
1647 bool hasSALUMinimumMaximumInsts = ST->hasSALUMinimumMaximumInsts();
1648
1649 addRulesForGOpcs({G_FMINIMUM, G_FMAXIMUM}, Standard)
1650 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUMinimumMaximumInsts)
1651 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUMinimumMaximumInsts)
1652 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1653 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUMinimumMaximumInsts)
1654 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUMinimumMaximumInsts)
1655 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1656 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1657 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1659 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
1660
1661 addRulesForGOpcs({G_FMINNUM_IEEE, G_FMAXNUM_IEEE, G_FMINNUM, G_FMAXNUM,
1662 G_FMINIMUMNUM, G_FMAXIMUMNUM},
1663 Standard)
1664 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1665 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1666 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1667 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1669 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
1670 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1671 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1672 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1673 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1674
1675 addRulesForGOpcs({G_FPTRUNC})
1676 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1677 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1678 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
1680 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
1681 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1682 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat);
1683
1684 addRulesForGOpcs({G_INTRINSIC_FPTRUNC_ROUND})
1685 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1686 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
1687 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1688 .Any({{UniS16, S64}, {{UniInVgprS16}, {Vgpr64}}})
1689 .Any({{DivS16, S64}, {{Vgpr16}, {Vgpr64}}})
1690 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1691 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}});
1692
1693 addRulesForGOpcs({G_IS_FPCLASS})
1694 .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}})
1695 .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}})
1696 .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}})
1697 .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}})
1698 .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}})
1699 .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}});
1700
1701 addRulesForGOpcs({G_FCMP}, Standard)
1702 .Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr16, Sgpr16}}},
1703 hasSALUFloat)
1704 .Any({{UniS1, _, S16}, {{UniInVcc}, {None, Vgpr16, Vgpr16}}},
1705 !hasSALUFloat)
1706 .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
1707 .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}},
1708 hasSALUFloat)
1709 .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}},
1710 !hasSALUFloat)
1711 .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
1712 .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
1713 .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
1714
1715 addRulesForGOpcs({G_INTRINSIC_ROUNDEVEN, G_FEXP2, G_FLOG2}, Standard)
1716 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1717 .Div(S16, {{Vgpr16}, {Vgpr16}})
1718 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1719 .Div(S32, {{Vgpr32}, {Vgpr32}})
1720 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1721 .Div(S64, {{Vgpr64}, {Vgpr64}});
1722
1723 addRulesForGOpcs({G_INTRINSIC_TRUNC, G_FFLOOR, G_FCEIL}, Standard)
1724 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1725 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
1726 .Div(S16, {{Vgpr16}, {Vgpr16}})
1727 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1728 .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
1729 .Div(S32, {{Vgpr32}, {Vgpr32}})
1730 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1731 .Div(S64, {{Vgpr64}, {Vgpr64}});
1732
1733 addRulesForGOpcs({G_AMDGPU_GLOBAL_LOAD_MONITOR, G_AMDGPU_FLAT_LOAD_MONITOR},
1734 StandardB)
1735 .Uni(B32, {{UniInVgprB32}, {SgprPtr64}})
1736 .Div(B32, {{VgprB32}, {VgprPtr64}})
1737 .Uni(B64, {{UniInVgprB64}, {SgprPtr64}})
1738 .Div(B64, {{VgprB64}, {VgprPtr64}})
1739 .Uni(B128, {{UniInVgprB128}, {SgprPtr64}})
1740 .Div(B128, {{VgprB128}, {VgprPtr64}});
1741
1742 addRulesForGOpcs({G_AMDGPU_WHOLE_WAVE_FUNC_SETUP})
1743 .Any({{DivS1}, {{Vcc}, {}}});
1744
1745 addRulesForGOpcs({G_AMDGPU_WHOLE_WAVE_FUNC_RETURN}).Any({{}, {{}, {Vcc}}});
1746
1747 using namespace Intrinsic;
1748
1749 addRulesForIOpcs({returnaddress}).Any({{UniP0}, {{SgprP0}, {}}});
1750
1751 // Note: amdgcn.icmp with i1 inputs is legalized to ballot in the legalizer,
1752 // so no S1 rules are needed here.
1753 addRulesForIOpcs({amdgcn_icmp})
1754 .Any({{UniS64, _, S16}, {{Sgpr64}, {IntrId, Vgpr16, Vgpr16}}})
1755 .Any({{UniS64, _, S32}, {{Sgpr64}, {IntrId, Vgpr32, Vgpr32}}})
1756 .Any({{UniS64, _, S64}, {{Sgpr64}, {IntrId, Vgpr64, Vgpr64}}})
1757
1758 .Any({{UniS32, _, S16}, {{Sgpr32}, {IntrId, Vgpr16, Vgpr16}}})
1759 .Any({{UniS32, _, S32}, {{Sgpr32}, {IntrId, Vgpr32, Vgpr32}}})
1760 .Any({{UniS32, _, S64}, {{Sgpr32}, {IntrId, Vgpr64, Vgpr64}}});
1761
1762 addRulesForIOpcs({amdgcn_fcmp})
1763 .Any({{UniS64, _, S16}, {{Sgpr64}, {IntrId, Vgpr16, Vgpr16}}})
1764 .Any({{UniS64, _, S32}, {{Sgpr64}, {IntrId, Vgpr32, Vgpr32}}})
1765 .Any({{UniS64, _, S64}, {{Sgpr64}, {IntrId, Vgpr64, Vgpr64}}})
1766
1767 .Any({{UniS32, _, S16}, {{Sgpr32}, {IntrId, Vgpr16, Vgpr16}}})
1768 .Any({{UniS32, _, S32}, {{Sgpr32}, {IntrId, Vgpr32, Vgpr32}}})
1769 .Any({{UniS32, _, S64}, {{Sgpr32}, {IntrId, Vgpr64, Vgpr64}}});
1770
1771 addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {}}});
1772
1773 addRulesForIOpcs({amdgcn_s_getreg}).Any({{}, {{Sgpr32}, {IntrId}}});
1774
1775 addRulesForIOpcs({amdgcn_s_setreg})
1776 .Any({{_, _, S32}, {{}, {IntrId, Imm, SgprB32_ReadFirstLane}}});
1777
1778 addRulesForIOpcs({amdgcn_s_sendmsg, amdgcn_s_sendmsghalt})
1779 .Any({{}, {{}, {IntrId, Imm, SgprB32_M0}}});
1780
1781 addRulesForIOpcs({amdgcn_s_sendmsg_rtn})
1782 .Any({{S32}, {{Sgpr32}, {}}})
1783 .Any({{S64}, {{Sgpr64}, {}}});
1784
1785 addRulesForIOpcs({amdgcn_s_memrealtime, amdgcn_s_memtime}, Standard)
1786 .Uni(S64, {{Sgpr64}, {IntrId}});
1787
1788 addRulesForIOpcs({amdgcn_groupstaticsize, amdgcn_pops_exiting_wave_id,
1789 amdgcn_reloc_constant, amdgcn_s_get_waveid_in_workgroup},
1790 Standard)
1791 .Uni(S32, {{Sgpr32}, {IntrId}});
1792
1793 // Intrinsics with no register operands.
1794 addRulesForIOpcs({amdgcn_asyncmark,
1795 amdgcn_endpgm,
1796 amdgcn_iglp_opt,
1797 amdgcn_init_exec,
1798 amdgcn_s_barrier,
1799 amdgcn_s_barrier_leave,
1800 amdgcn_s_barrier_signal,
1801 amdgcn_s_barrier_wait,
1802 amdgcn_s_monitor_sleep,
1803 amdgcn_s_nop,
1804 amdgcn_s_sethalt,
1805 amdgcn_s_setprio,
1806 amdgcn_s_setprio_inc_wg,
1807 amdgcn_s_sleep,
1808 amdgcn_s_ttracedata_imm,
1809 amdgcn_s_wait_asynccnt,
1810 amdgcn_s_wait_bvhcnt,
1811 amdgcn_s_wait_dscnt,
1812 amdgcn_s_wait_event,
1813 amdgcn_s_wait_event_export_ready,
1814 amdgcn_s_wait_expcnt,
1815 amdgcn_s_wait_kmcnt,
1816 amdgcn_s_wait_loadcnt,
1817 amdgcn_s_wait_samplecnt,
1818 amdgcn_s_wait_storecnt,
1819 amdgcn_s_wait_tensorcnt,
1820 amdgcn_s_waitcnt,
1821 amdgcn_sched_barrier,
1822 amdgcn_sched_group_barrier,
1823 amdgcn_unreachable,
1824 amdgcn_wait_asyncmark,
1825 amdgcn_wave_barrier})
1826 .Any({{}, {{}, {}}});
1827
1828 addRulesForIOpcs({amdgcn_init_exec_from_input})
1829 .Any({{}, {{}, {IntrId, Sgpr32}}});
1830
1831 addRulesForIOpcs({amdgcn_s_ttracedata}).Any({{}, {{}, {IntrId, SgprB32_M0}}});
1832
1833 addRulesForIOpcs({amdgcn_s_sleep_var})
1834 .Any({{}, {{}, {IntrId, SgprB32_ReadFirstLane}}});
1835
1836 addRulesForIOpcs({amdgcn_s_barrier_join, amdgcn_s_wakeup_barrier})
1837 .Any({{}, {{}, {IntrId, SgprB32_M0}}});
1838
1839 addRulesForIOpcs({amdgcn_s_barrier_signal_var, amdgcn_s_barrier_init})
1840 .Any({{}, {{}, {IntrId, SgprB32_M0, SgprB32_M0}}});
1841
1842 addRulesForIOpcs({amdgcn_s_barrier_signal_isfirst})
1843 .Any({{UniS1}, {{Sgpr32Trunc}, {}}});
1844
1845 addRulesForIOpcs(
1846 {amdgcn_s_get_named_barrier_state, amdgcn_s_get_barrier_state}, Standard)
1847 .Uni(S32, {{Sgpr32}, {IntrId, SgprB32_M0}});
1848
1849 addRulesForIOpcs({amdgcn_flat_prefetch}).Any({{}, {{}, {IntrId, VgprP0}}});
1850
1851 addRulesForIOpcs({amdgcn_global_prefetch}).Any({{}, {{}, {IntrId, VgprP1}}});
1852
1853 addRulesForIOpcs({amdgcn_s_prefetch_data, amdgcn_s_prefetch_inst})
1855
1856 addRulesForIOpcs({amdgcn_class})
1857 .Any({{UniS1, _, S16}, {{UniInVcc}, {IntrId, Vgpr16, Vgpr32}}})
1858 .Any({{DivS1, _, S16}, {{Vcc}, {IntrId, Vgpr16, Vgpr32}}})
1859 .Any({{UniS1, _, S32}, {{UniInVcc}, {IntrId, Vgpr32, Vgpr32}}})
1860 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, Vgpr32, Vgpr32}}})
1861 .Any({{UniS1, _, S64}, {{UniInVcc}, {IntrId, Vgpr64, Vgpr32}}})
1862 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, Vgpr64, Vgpr32}}});
1863
1864 // This is the "intrinsic lane mask"; it was set to i32/i64 in LLVM IR.
1865 addRulesForIOpcs({amdgcn_end_cf})
1866 .Any({{_, UniS32}, {{}, {IntrId, Sgpr32}}})
1867 .Any({{_, UniS64}, {{}, {IntrId, Sgpr64}}});
1868
1869 addRulesForIOpcs({amdgcn_if_break}, Standard)
1870 .Uni(S64, {{Sgpr64}, {IntrId, Vcc, Sgpr64}})
1871 .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
1872
1873 addRulesForIOpcs({amdgcn_exp})
1874 .Any({{_, _, _, S32, S32, S32, S32},
1875 {{}, {IntrId, Imm, Imm, Vgpr32, Vgpr32, Vgpr32, Vgpr32}}});
1876
1877 addRulesForIOpcs({amdgcn_exp_compr})
1878 .Any({{_, _, _, V2S16}, {{}, {IntrId, Imm, Imm, VgprV2S16, VgprV2S16}}});
1879
1880 addRulesForIOpcs({amdgcn_exp_row})
1881 .Any({{_, _, _, S32, S32, S32, S32, _, S32},
1882 {{},
1884 SgprB32_M0}}});
1885
1886 addRulesForIOpcs({amdgcn_lds_direct_load}, StandardB)
1887 .Div(B32, {{VgprB32}, {IntrId, SgprB32_M0}});
1888
1889 addRulesForIOpcs({amdgcn_lds_param_load}, Standard)
1890 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, SgprB32_M0}});
1891
1892 addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
1893 .Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});
1894
1895 addRulesForIOpcs({amdgcn_readfirstlane})
1896 .Any({{UniB32, _, DivB32}, {{}, {SgprB32, None, VgprB32}}})
1897 // This should not exist in the first place; it comes from call lowering,
1898 // which readfirstlanes just in case the register is not in an sgpr.
1899 .Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}});
1900
1901 addRulesForIOpcs({amdgcn_readlane}, StandardB)
1903
1904 addRulesForIOpcs({amdgcn_s_quadmask, amdgcn_s_wqm}, StandardB)
1906 .Uni(B64, {{SgprB64}, {IntrId, SgprB64_ReadFirstLane}});
1907
1908 addRulesForIOpcs({amdgcn_writelane}, StandardB)
1909 .Div(B32,
1910 {{VgprB32},
1912
1913 addRulesForIOpcs({amdgcn_add_max_i32, amdgcn_add_max_u32, amdgcn_add_min_i32,
1914 amdgcn_add_min_u32},
1915 Standard)
1916 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1917 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1918
1919 addRulesForIOpcs({amdgcn_pk_add_max_i16, amdgcn_pk_add_max_u16,
1920 amdgcn_pk_add_min_i16, amdgcn_pk_add_min_u16},
1921 Standard)
1924
1925 addRulesForIOpcs({amdgcn_permlane16, amdgcn_permlanex16}, Standard)
1926 .Div(S32, {{Vgpr32},
1929
1930 addRulesForIOpcs({amdgcn_permlane_bcast, amdgcn_permlane_up,
1931 amdgcn_permlane_down, amdgcn_permlane_xor},
1932 StandardB)
1933 .Div(B32,
1934 {{VgprB32},
1936
1937 addRulesForIOpcs({amdgcn_permlane_idx_gen}, Standard)
1939
1940 addRulesForIOpcs({amdgcn_perm}, Standard)
1941 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1942 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1943
1944 addRulesForIOpcs(
1945 {amdgcn_wave_reduce_add, amdgcn_wave_reduce_and, amdgcn_wave_reduce_fadd,
1946 amdgcn_wave_reduce_fmax, amdgcn_wave_reduce_fmin,
1947 amdgcn_wave_reduce_fsub, amdgcn_wave_reduce_max, amdgcn_wave_reduce_min,
1948 amdgcn_wave_reduce_or, amdgcn_wave_reduce_sub, amdgcn_wave_reduce_umax,
1949 amdgcn_wave_reduce_umin, amdgcn_wave_reduce_xor},
1950 Standard)
1951 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
1952 .Div(S32, {{Sgpr32ToVgprDst}, {IntrId, VgprB32}})
1953 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64}})
1954 .Div(S64, {{Sgpr64ToVgprDst}, {IntrId, VgprB64}});
1955
1956 addRulesForIOpcs({amdgcn_wave_shuffle}, Standard)
1957 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1958 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1959
1960 addRulesForIOpcs({amdgcn_bitop3, amdgcn_fmad_ftz}, Standard)
1961 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1962 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1963 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1964 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1965
1966 addRulesForIOpcs({amdgcn_udot4, amdgcn_sdot4, amdgcn_udot8, amdgcn_sdot8,
1967 amdgcn_dot4_f32_bf8_bf8, amdgcn_dot4_f32_bf8_fp8,
1968 amdgcn_dot4_f32_fp8_fp8, amdgcn_dot4_f32_fp8_bf8},
1969 Standard)
1970 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1971 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1972
1973 addRulesForIOpcs({amdgcn_rsq, amdgcn_rsq_clamp}, Standard)
1974 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
1975 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
1976 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1977 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
1978 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST)
1979 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1980 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}})
1981 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}});
1982
1983 addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard)
1984 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1985 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
1986 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr32, Vgpr32}})
1987 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32, Vgpr32}});
1988
1989 addRulesForIOpcs({amdgcn_ds_bpermute, amdgcn_ds_bpermute_fi_b32,
1990 amdgcn_ds_permute, amdgcn_fmul_legacy, amdgcn_mulhi_i24,
1991 amdgcn_mulhi_u24},
1992 Standard)
1993 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1994 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1995
1996 addRulesForIOpcs({amdgcn_cvt_sr_bf8_f32, amdgcn_cvt_sr_fp8_f32,
1997 amdgcn_cvt_sr_fp8_f32_e5m3, amdgcn_cvt_pk_bf8_f32,
1998 amdgcn_cvt_pk_fp8_f32, amdgcn_cvt_pk_fp8_f32_e5m3},
1999 Standard)
2000 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2001 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2002
2003 addRulesForIOpcs({amdgcn_cvt_off_f32_i4, amdgcn_cvt_f32_bf8,
2004 amdgcn_cvt_f32_fp8, amdgcn_cvt_f32_fp8_e5m3},
2005 Standard)
2006 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
2007 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
2008
2009 addRulesForIOpcs({amdgcn_cvt_pk_f32_bf8, amdgcn_cvt_pk_f32_fp8})
2010 .Any({{UniV2S32}, {{UniInVgprV2S32}, {IntrId, Vgpr32}}})
2011 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, Vgpr32}}});
2012
2013 addRulesForIOpcs({amdgcn_cvt_f16_bf8, amdgcn_cvt_f16_fp8}, Standard)
2014 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32}})
2015 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32}});
2016
2017 addRulesForIOpcs({amdgcn_cvt_pk_f16_bf8, amdgcn_cvt_pk_f16_fp8}, Standard)
2018 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr16}})
2019 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr16}});
2020
2021 addRulesForIOpcs({amdgcn_cvt_pk_bf8_f16, amdgcn_cvt_pk_fp8_f16}, Standard)
2022 .Uni(S16, {{UniInVgprS16}, {IntrId, VgprV2S16}})
2023 .Div(S16, {{Vgpr16}, {IntrId, VgprV2S16}});
2024
2025 addRulesForIOpcs({amdgcn_cvt_sr_bf8_f16, amdgcn_cvt_sr_fp8_f16}, Standard)
2026 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr16, Vgpr32, Vgpr32}})
2027 .Div(S32, {{Vgpr32}, {IntrId, Vgpr16, Vgpr32, Vgpr32}});
2028
2029 addRulesForIOpcs({amdgcn_cvt_sr_pk_f16_f32}, Standard)
2031 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2032
2033 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_fp8_f16})
2034 .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32, Vgpr16, Vgpr32, Vgpr32}}});
2035
2036 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_fp8_f32})
2037 .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vgpr32}}});
2038
2039 addRulesForIOpcs({amdgcn_cubesc, amdgcn_cubetc, amdgcn_cubema, amdgcn_cubeid,
2040 amdgcn_fma_legacy},
2041 Standard)
2042 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2043 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2044
2045 addRulesForIOpcs({amdgcn_frexp_mant, amdgcn_fract}, Standard)
2046 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
2047 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2048 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
2049 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2050 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}})
2051 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}});
2052
2053 addRulesForIOpcs({amdgcn_prng_b32})
2054 .Any({{UniS32}, {{UniInVgprS32}, {IntrId, Vgpr32}}})
2055 .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32}}});
2056
2057 addRulesForIOpcs({amdgcn_sffbh}, Standard)
2058 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
2059 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
2060
2061 addRulesForIOpcs({amdgcn_ubfe, amdgcn_sbfe}, Standard)
2062 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2063 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32, Sgpr32, Sgpr32}, S_BFE})
2064 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64, Sgpr32, Sgpr32}, S_BFE})
2065 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32, Vgpr32}, V_BFE});
2066
2067 addRulesForIOpcs({amdgcn_cvt_pk_i16, amdgcn_cvt_pk_u16, amdgcn_cvt_pknorm_i16,
2068 amdgcn_cvt_pknorm_u16},
2069 Standard)
2070 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32}})
2071 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32}});
2072
2073 addRulesForIOpcs({amdgcn_cvt_pkrtz}, Standard)
2074 .Uni(V2S16, {{SgprV2S16}, {IntrId, Sgpr32, Sgpr32}}, hasSALUFloat)
2075 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32}}, !hasSALUFloat)
2076 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32}});
2077
2078 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk32_bf6_f16,
2079 amdgcn_cvt_scalef32_sr_pk32_fp6_f16,
2080 amdgcn_cvt_scalef32_sr_pk32_bf6_bf16,
2081 amdgcn_cvt_scalef32_sr_pk32_fp6_bf16},
2082 Standard)
2084
2085 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk32_bf6_f32,
2086 amdgcn_cvt_scalef32_sr_pk32_fp6_f32},
2087 Standard)
2089
2090 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk_fp4_f16}, Standard)
2092 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, VgprV2S16, Vgpr32, Vgpr32}});
2093
2094 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk_fp4_f32}, Standard)
2096 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, VgprV2S32, Vgpr32, Vgpr32}});
2097
2098 addRulesForIOpcs(
2099 {amdgcn_cvt_scalef32_2xpk16_fp6_f32, amdgcn_cvt_scalef32_2xpk16_bf6_f32})
2100 .Any(
2102 .Any({{UniV6S32},
2104
2105 addRulesForIOpcs({amdgcn_cvt_scalef32_f16_fp8, amdgcn_cvt_scalef32_f16_bf8},
2106 Standard)
2107 .Div(V2S16, {{VgprV2S16}, {IntrId, VgprV2S16, Vgpr32, Vgpr32}})
2109
2110 addRulesForIOpcs({amdgcn_cvt_scalef32_f32_fp8, amdgcn_cvt_scalef32_f32_bf8},
2111 Standard)
2112 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
2113 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}});
2114
2115 addRulesForIOpcs(
2116 {amdgcn_cvt_scalef32_pk16_bf6_f16, amdgcn_cvt_scalef32_pk16_fp6_f16},
2117 Standard)
2120
2121 addRulesForIOpcs(
2122 {amdgcn_cvt_scalef32_pk16_bf6_f32, amdgcn_cvt_scalef32_pk16_fp6_f32},
2123 Standard)
2126
2127 addRulesForIOpcs(
2128 {amdgcn_cvt_scalef32_pk8_bf8_f16, amdgcn_cvt_scalef32_pk8_fp8_f16},
2129 Standard)
2132
2133 addRulesForIOpcs(
2134 {amdgcn_cvt_scalef32_pk8_bf8_f32, amdgcn_cvt_scalef32_pk8_fp8_f32},
2135 Standard)
2138
2139 addRulesForIOpcs({amdgcn_cvt_scalef32_pk8_fp4_f16}, Standard)
2140 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S16, Vgpr32}})
2141 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S16, Vgpr32}});
2142
2143 addRulesForIOpcs({amdgcn_cvt_scalef32_pk8_fp4_f32}, Standard)
2144 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S32, Vgpr32}})
2145 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S32, Vgpr32}});
2146
2147 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk16_bf6_f16,
2148 amdgcn_cvt_scalef32_sr_pk16_fp6_f16},
2149 Standard)
2151 .Any({{UniV3S32},
2153
2154 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk16_bf6_f32,
2155 amdgcn_cvt_scalef32_sr_pk16_fp6_f32},
2156 Standard)
2158 .Any({{UniV3S32},
2160
2161 addRulesForIOpcs(
2162 {amdgcn_cvt_scalef32_sr_pk8_bf8_f16, amdgcn_cvt_scalef32_sr_pk8_fp8_f16},
2163 Standard)
2165 .Any({{UniV2S32},
2167
2168 addRulesForIOpcs(
2169 {amdgcn_cvt_scalef32_sr_pk8_bf8_f32, amdgcn_cvt_scalef32_sr_pk8_fp8_f32},
2170 Standard)
2172 .Any({{UniV2S32},
2174
2175 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk8_fp4_f16}, Standard)
2176 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S16, Vgpr32, Vgpr32}})
2177 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S16, Vgpr32, Vgpr32}});
2178
2179 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk8_fp4_f32}, Standard)
2180 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S32, Vgpr32, Vgpr32}})
2181 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S32, Vgpr32, Vgpr32}});
2182
2183 addRulesForIOpcs(
2184 {amdgcn_cvt_scale_pk16_f16_bf6, amdgcn_cvt_scale_pk16_f16_fp6}, Standard)
2187
2188 addRulesForIOpcs(
2189 {amdgcn_cvt_scale_pk16_f32_bf6, amdgcn_cvt_scale_pk16_f32_fp6}, Standard)
2192
2193 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f16_bf8, amdgcn_cvt_scale_pk8_f16_fp8},
2194 Standard)
2197
2198 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f16_fp4}, Standard)
2199 .Any({{DivV8S16}, {{VgprV8S16}, {IntrId, Vgpr32, Vgpr32}}})
2201
2202 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f32_bf8, amdgcn_cvt_scale_pk8_f32_fp8},
2203 Standard)
2206
2207 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f32_fp4}, Standard)
2208 .Any({{DivV8S32}, {{VgprV8S32}, {IntrId, Vgpr32, Vgpr32}}})
2210
2211 addRulesForIOpcs(
2212 {amdgcn_cvt_scalef32_pk32_bf6_f16, amdgcn_cvt_scalef32_pk32_fp6_f16},
2213 Standard)
2216
2217 addRulesForIOpcs(
2218 {amdgcn_cvt_scalef32_pk_fp8_f32, amdgcn_cvt_scalef32_pk_bf8_f32},
2219 Standard)
2221 .Uni(V2S16,
2223
2224 addRulesForIOpcs(
2225 {amdgcn_cvt_scalef32_pk_f32_fp8, amdgcn_cvt_scalef32_pk_f32_bf8},
2226 Standard)
2227 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, Vgpr32, Vgpr32}}})
2229
2230 addRulesForIOpcs(
2231 {amdgcn_cvt_scalef32_pk_fp8_f16, amdgcn_cvt_scalef32_pk_bf8_f16},
2232 Standard)
2235
2236 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_f32_fp4}, Standard)
2237 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, Vgpr32, Vgpr32}}})
2239
2240 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_fp4_f32}, Standard)
2241 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vgpr32}})
2242 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vgpr32}});
2243
2244 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_f16_fp4,
2245 amdgcn_cvt_scalef32_pk_f16_fp8,
2246 amdgcn_cvt_scalef32_pk_f16_bf8},
2247 Standard)
2248 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32}})
2249 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32}});
2250
2251 addRulesForIOpcs(
2252 {amdgcn_cvt_scalef32_pk32_f32_fp6, amdgcn_cvt_scalef32_pk32_f32_bf6},
2253 Standard)
2256
2257 addRulesForIOpcs(
2258 {amdgcn_cvt_scalef32_pk32_f16_fp6, amdgcn_cvt_scalef32_pk32_f16_bf6},
2259 Standard)
2262
2263 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_fp4_f16}, Standard)
2264 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, VgprV2S16, Vgpr32}})
2265 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, VgprV2S16, Vgpr32}});
2266
2267 addRulesForIOpcs({amdgcn_global_load_tr_b64})
2268 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
2269 .Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
2270 .Any({{DivB32, _, UniP1}, {{VgprB32}, {IntrId, SgprP1}}})
2271 .Any({{DivB32, _, DivP1}, {{VgprB32}, {IntrId, VgprP1}}});
2272
2273 addRulesForIOpcs({amdgcn_global_load_tr_b128})
2274 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
2275 .Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
2276 .Any({{DivB128, _, UniP1}, {{VgprB128}, {IntrId, SgprP1}}})
2277 .Any({{DivB128, _, DivP1}, {{VgprB128}, {IntrId, VgprP1}}});
2278
2279 addRulesForIOpcs({amdgcn_global_load_tr4_b64})
2280 .Any({{DivV2S32, _, UniP1}, {{VgprV2S32}, {IntrId, SgprP1}}})
2281 .Any({{DivV2S32, _, DivP1}, {{VgprV2S32}, {IntrId, VgprP1}}});
2282
2283 addRulesForIOpcs({amdgcn_global_load_tr6_b96})
2284 .Any({{DivV3S32, _, UniP1}, {{VgprV3S32}, {IntrId, SgprP1}}})
2285 .Any({{DivV3S32, _, DivP1}, {{VgprV3S32}, {IntrId, VgprP1}}});
2286
2287 addRulesForIOpcs({amdgcn_ds_load_tr4_b64, amdgcn_ds_load_tr8_b64})
2288 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
2289
2290 addRulesForIOpcs({amdgcn_ds_load_tr6_b96})
2291 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
2292
2293 addRulesForIOpcs({amdgcn_ds_load_tr16_b128})
2294 .Any({{DivB128}, {{VgprB128}, {IntrId, VgprP3}}});
2295
2296 addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64})
2297 .Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}});
2298
2299 addRulesForIOpcs(
2300 {amdgcn_global_atomic_fmin_num, amdgcn_global_atomic_fmax_num}, Standard)
2301 .Div(S32, {{Vgpr32}, {IntrId, VgprP1, Vgpr32}});
2302
2303 addRulesForIOpcs({amdgcn_flat_atomic_fmin_num, amdgcn_flat_atomic_fmax_num},
2304 Standard)
2305 .Div(S32, {{Vgpr32}, {IntrId, VgprP0, Vgpr32}});
2306
2307 addRulesForIOpcs({amdgcn_raw_buffer_load_lds})
2308 .Any({{_}, {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Sgpr32}}});
2309
2310 addRulesForIOpcs({amdgcn_raw_buffer_load_async_lds})
2311 .Any({{_}, {{}, {IntrId, SgprV4S32, SgprB32_M0, Imm, Vgpr32, Sgpr32}}});
2312
2313 addRulesForIOpcs({amdgcn_struct_buffer_load_async_lds})
2314 .Any(
2315 {{_},
2317
2318 addRulesForIOpcs({amdgcn_struct_buffer_load_lds})
2319 .Any({{_},
2320 {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
2321
2322 addRulesForIOpcs({amdgcn_raw_ptr_buffer_load_lds})
2323 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Sgpr32}}});
2324
2325 addRulesForIOpcs({amdgcn_raw_ptr_buffer_load_async_lds})
2326 .Any({{}, {{}, {IntrId, SgprP8, SgprB32_M0, Imm, VgprB32, SgprB32}}});
2327
2328 addRulesForIOpcs({amdgcn_struct_ptr_buffer_load_async_lds})
2329 .Any({{_},
2330 {{}, {IntrId, SgprP8, SgprB32_M0, Imm, Vgpr32, Vgpr32, Sgpr32}}});
2331
2332 addRulesForIOpcs({amdgcn_struct_ptr_buffer_load_lds})
2333 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
2334
2335 addRulesForIOpcs(
2336 {amdgcn_global_load_lds, amdgcn_load_to_lds, amdgcn_load_async_to_lds})
2337 .Any({{}, {{}, {IntrId, VgprP1, SgprB32_M0}}});
2338
2339 addRulesForIOpcs({amdgcn_global_load_async_to_lds_b8,
2340 amdgcn_global_load_async_to_lds_b32,
2341 amdgcn_global_load_async_to_lds_b64,
2342 amdgcn_global_load_async_to_lds_b128,
2343 amdgcn_global_store_async_from_lds_b8,
2344 amdgcn_global_store_async_from_lds_b32,
2345 amdgcn_global_store_async_from_lds_b64,
2346 amdgcn_global_store_async_from_lds_b128})
2347 .Any({{}, {{}, {IntrId, VgprP1, VgprP3}}});
2348
2349 addRulesForIOpcs({amdgcn_global_load_async_lds})
2350 .Any({{}, {{}, {IntrId, VgprP1, SgprB32_M0}}});
2351
2352 addRulesForIOpcs({amdgcn_tensor_load_to_lds, amdgcn_tensor_store_from_lds})
2353 .Any({{},
2354 {{},
2358
2359 addRulesForIOpcs({amdgcn_cluster_load_b32})
2361 .Any({{DivB32, _, UniP1}, {{VgprB32}, {IntrId, SgprP1, Imm, SgprB32_M0}}})
2362 .Any(
2363 {{DivB32, _, DivP1}, {{VgprB32}, {IntrId, VgprP1, Imm, SgprB32_M0}}});
2364
2365 addRulesForIOpcs({amdgcn_cluster_load_b64})
2367 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1, Imm, SgprB32_M0}}})
2368 .Any(
2369 {{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1, Imm, SgprB32_M0}}});
2370
2371 addRulesForIOpcs({amdgcn_cluster_load_b128})
2373 .Any({{DivB128, _, UniP1},
2374 {{VgprB128}, {IntrId, SgprP1, Imm, SgprB32_M0}}})
2375 .Any({{DivB128, _, DivP1},
2376 {{VgprB128}, {IntrId, VgprP1, Imm, SgprB32_M0}}});
2377
2378 addRulesForIOpcs({amdgcn_cluster_load_async_to_lds_b8,
2379 amdgcn_cluster_load_async_to_lds_b32,
2380 amdgcn_cluster_load_async_to_lds_b64,
2381 amdgcn_cluster_load_async_to_lds_b128})
2382 .Any({{}, {{}, {IntrId, VgprP1, VgprP3, Imm, Imm, SgprB32_M0}}});
2383
2384 addRulesForIOpcs({amdgcn_perm_pk16_b4_u4}, StandardB)
2385 .Uni(B64, {{UniInVgprB64}, {IntrId, Vgpr32, Vgpr32, VgprV2S32}})
2386 .Div(B64, {{VgprB64}, {IntrId, Vgpr32, Vgpr32, VgprV2S32}});
2387
2388 addRulesForIOpcs({amdgcn_perm_pk16_b6_u4}, StandardB)
2390 .Div(B96, {{VgprB96}, {IntrId, Vgpr32, VgprB64, VgprV2S32}});
2391
2392 addRulesForIOpcs({amdgcn_perm_pk16_b8_u4}, StandardB)
2394 .Div(B128, {{VgprB128}, {IntrId, VgprB64, VgprB64, VgprV2S32}});
2395
2396 addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm, amdgcn_wqm, amdgcn_softwqm,
2397 amdgcn_strict_wqm},
2398 StandardB)
2399 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
2400 .Uni(B32, {{SgprB32}, {IntrId, SgprB32}})
2401 .Div(B64, {{VgprB64}, {IntrId, VgprB64}})
2402 .Uni(B64, {{SgprB64}, {IntrId, SgprB64}})
2403 .Div(B96, {{VgprB96}, {IntrId, VgprB96}})
2404 .Uni(B96, {{SgprB96}, {IntrId, SgprB96}})
2405 .Div(B128, {{VgprB128}, {IntrId, VgprB128}})
2406 .Uni(B128, {{SgprB128}, {IntrId, SgprB128}})
2407 .Any({{UniB256}, {{SgprB256}, {IntrId, SgprB256}}})
2408 .Any({{DivB256}, {{VgprB256}, {IntrId, VgprB256}}})
2409 .Any({{UniB512}, {{SgprB512}, {IntrId, SgprB512}}})
2410 .Any({{DivB512}, {{VgprB512}, {IntrId, VgprB512}}});
2411
2412 addRulesForIOpcs({amdgcn_init_whole_wave}).Any({{DivS1}, {{Vcc}, {IntrId}}});
2413
2414 addRulesForIOpcs({amdgcn_kill, amdgcn_wqm_demote})
2415 .Any({{}, {{}, {IntrId, Vcc}}});
2416
2417 addRulesForIOpcs({amdgcn_set_inactive}, StandardB)
2418 .Div(B32, {{VgprB32}, {IntrId, VgprB32, VgprB32}});
2419
2420 addRulesForIOpcs({amdgcn_set_inactive_chain_arg}, Standard)
2421 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
2422
2423 addRulesForIOpcs({amdgcn_cvt_sr_bf16_f32, amdgcn_cvt_sr_f16_f32}, Standard)
2424 .Div(V2S16, {{VgprV2S16}, {IntrId, VgprV2S16, Vgpr32, Vgpr32}});
2425
2426 addRulesForIOpcs({amdgcn_ballot}, Standard)
2427 .Uni(S64, {{Sgpr64}, {IntrId, Vcc}})
2428 .Uni(S32, {{Sgpr32}, {IntrId, Vcc}});
2429
2430 addRulesForIOpcs({amdgcn_inverse_ballot})
2431 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, SgprB32_ReadFirstLane}}})
2432 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, SgprB64_ReadFirstLane}}});
2433
2434 addRulesForIOpcs({amdgcn_live_mask, amdgcn_ps_live})
2435 .Any({{DivS1}, {{Vcc}, {}}});
2436
2437 addRulesForIOpcs({amdgcn_mov_dpp, amdgcn_mov_dpp8}, StandardB)
2438 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
2439 .Div(B64, {{VgprB64}, {IntrId, VgprB64}});
2440
2441 addRulesForIOpcs({amdgcn_update_dpp}, StandardB)
2442 .Div(B32, {{VgprB32}, {IntrId, VgprB32, VgprB32}})
2443 .Div(B64, {{VgprB64}, {IntrId, VgprB64, VgprB64}});
2444
2445 addRulesForIOpcs({amdgcn_sin, amdgcn_cos}, Standard)
2446 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2447 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
2448 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2449 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}});
2450
2451 addRulesForIOpcs({amdgcn_trig_preop}, Standard)
2452 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32}})
2453 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr32}});
2454
2455 addRulesForIOpcs({amdgcn_exp2}, Standard)
2456 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2457 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
2458 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
2459 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2460 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
2461 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST);
2462
2463 addRulesForIOpcs({amdgcn_rcp, amdgcn_sqrt}, Standard)
2464 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2465 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
2466 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
2467 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2468 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
2469 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST)
2470 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}})
2471 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}});
2472
2473 addRulesForIOpcs({amdgcn_log}, Standard)
2474 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2475 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
2476 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
2477 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2478 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
2479 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST);
2480
2481 addRulesForIOpcs({amdgcn_ds_atomic_async_barrier_arrive_b64})
2482 .Any({{}, {{}, {IntrId, VgprP3}}});
2483
2484 addRulesForIOpcs({amdgcn_ds_atomic_barrier_arrive_rtn_b64}, Standard)
2485 .Div(S64, {{Vgpr64}, {IntrId, VgprP3, Vgpr64}});
2486
2487 addRulesForIOpcs({amdgcn_ds_add_gs_reg_rtn, amdgcn_ds_sub_gs_reg_rtn},
2488 Standard)
2489 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2490 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32}});
2491
2492 addRulesForIOpcs({amdgcn_ds_append, amdgcn_ds_consume}, Standard)
2493 .Uni(S32, {{UniInVgprS32}, {IntrId, SgprB32_M0}})
2494 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0}});
2495
2496 addRulesForIOpcs(
2497 {amdgcn_ds_bvh_stack_rtn, amdgcn_ds_bvh_stack_push4_pop1_rtn}, Standard)
2498 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV4S32}});
2499
2500 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop1_rtn}, Standard)
2501 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
2502
2503 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop2_rtn}, Standard)
2504 .Div(S64, {{Vgpr64, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
2505
2506 addRulesForIOpcs({amdgcn_ds_gws_sema_p, amdgcn_ds_gws_sema_v,
2507 amdgcn_ds_gws_sema_release_all})
2508 .Any({{}, {{}, {IntrId, SgprB32_M0}}});
2509
2510 addRulesForIOpcs(
2511 {amdgcn_ds_gws_barrier, amdgcn_ds_gws_init, amdgcn_ds_gws_sema_br})
2512 .Any({{}, {{}, {IntrId, Vgpr32, SgprB32_M0}}});
2513
2514 addRulesForIOpcs({amdgcn_ds_ordered_add, amdgcn_ds_ordered_swap}, Standard)
2515 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0, Vgpr32}});
2516
2517 addRulesForIOpcs({amdgcn_ds_swizzle}, Standard)
2518 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
2519 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
2520
2521 addRulesForIOpcs({amdgcn_permlane16_var, amdgcn_permlanex16_var}, Standard)
2522 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2523
2524 addRulesForIOpcs({amdgcn_permlane16_swap, amdgcn_permlane32_swap}, Standard)
2525 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
2526
2527 addRulesForIOpcs({amdgcn_permlane64}, StandardB)
2528 .Div(B32, {{VgprB32}, {IntrId, VgprB32}});
2529
2530 addRulesForIOpcs({amdgcn_ds_read_tr4_b64, amdgcn_ds_read_tr8_b64})
2531 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
2532
2533 addRulesForIOpcs({amdgcn_ds_read_tr6_b96})
2534 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
2535
2536 addRulesForIOpcs({amdgcn_ds_read_tr16_b64})
2537 .Any({{DivV4S16}, {{VgprV4S16}, {IntrId, VgprP3}}});
2538
2539 addRulesForIOpcs({amdgcn_interp_p1}, Standard)
2540 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Imm, Imm, SgprB32_M0}});
2541
2542 addRulesForIOpcs({amdgcn_interp_p1_f16}, Standard)
2543 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Imm, Imm, Imm, SgprB32_M0}});
2544
2545 addRulesForIOpcs({amdgcn_interp_p2}, Standard)
2546 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Imm, Imm, SgprB32_M0}});
2547
2548 addRulesForIOpcs({amdgcn_interp_p2_f16}, Standard)
2549 .Div(S16,
2551
2552 addRulesForIOpcs({amdgcn_interp_mov}, Standard)
2553 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, Imm, SgprB32_M0}});
2554
2555 addRulesForIOpcs({amdgcn_interp_inreg_p10, amdgcn_interp_inreg_p2,
2556 amdgcn_interp_inreg_p10_f16, amdgcn_interp_p10_rtz_f16},
2557 Standard)
2558 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2559 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2560
2561 addRulesForIOpcs({amdgcn_interp_inreg_p2_f16, amdgcn_interp_p2_rtz_f16},
2562 Standard)
2563 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2564 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2565
2566 addRulesForIOpcs({amdgcn_frexp_exp})
2567 .Any({{UniS16}, {{UniInVgprS16}, {IntrId, Vgpr16}}})
2568 .Any({{DivS16}, {{Vgpr16}, {IntrId, Vgpr16}}})
2569 .Any({{UniS32, _, S32}, {{UniInVgprS32}, {IntrId, Vgpr32}}})
2570 .Any({{DivS32, _, S32}, {{Vgpr32}, {IntrId, Vgpr32}}})
2571 .Any({{UniS32, _, S64}, {{UniInVgprS32}, {IntrId, Vgpr64}}})
2572 .Any({{DivS32, _, S64}, {{Vgpr32}, {IntrId, Vgpr64}}});
2573
2574 addRulesForIOpcs({amdgcn_div_fmas}, Standard)
2575 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
2576 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
2577 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}})
2578 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}});
2579
2580 addRulesForIOpcs({amdgcn_div_fixup}, Standard)
2581 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
2582 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
2583 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2584 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2585 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}})
2586 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}});
2587
2588 addRulesForIOpcs({amdgcn_div_scale}, Standard)
2589 .Div(S32, {{Vgpr32, Vcc}, {IntrId, Vgpr32, Vgpr32}})
2590 .Uni(S32, {{UniInVgprS32, UniInVcc}, {IntrId, Vgpr32, Vgpr32}})
2591 .Div(S64, {{Vgpr64, Vcc}, {IntrId, Vgpr64, Vgpr64}})
2592 .Uni(S64, {{UniInVgprS64, UniInVcc}, {IntrId, Vgpr64, Vgpr64}});
2593
2594 addRulesForIOpcs({amdgcn_fdot2, amdgcn_sdot2, amdgcn_udot2}, Standard)
2596 .Div(S32, {{Vgpr32}, {IntrId, VgprV2S16, VgprV2S16, Vgpr32}});
2597
2598 addRulesForIOpcs({amdgcn_fdot2_f16_f16}, Standard)
2600 .Div(S16, {{Vgpr16}, {IntrId, VgprV2S16, VgprV2S16, Vgpr16}});
2601
2602 addRulesForIOpcs({amdgcn_sudot4, amdgcn_sudot8}, Standard)
2603 .Uni(S32, {{UniInVgprS32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}})
2604 .Div(S32, {{Vgpr32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}});
2605
2606 addRulesForIOpcs({amdgcn_s_alloc_vgpr})
2608
2609 addRulesForIOpcs({amdgcn_sat_pk4_i4_i8, amdgcn_sat_pk4_u4_u8}, Standard)
2610 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32}})
2611 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32}});
2612
2613 bool HasGFX90AInsts = ST->hasGFX90AInsts();
2614
2615 // On gfx90a+ both AGPR-form and VGPR-form exist
2616 addRulesForIOpcs({amdgcn_mfma_f32_32x32x1f32, amdgcn_mfma_f32_16x16x1f32,
2617 amdgcn_mfma_f32_4x4x1f32, amdgcn_mfma_f32_32x32x2f32,
2618 amdgcn_mfma_f32_16x16x4f32, amdgcn_mfma_f32_32x32x4f16,
2619 amdgcn_mfma_f32_16x16x4f16, amdgcn_mfma_f32_4x4x4f16,
2620 amdgcn_mfma_f32_32x32x8f16, amdgcn_mfma_f32_16x16x16f16,
2621 amdgcn_mfma_i32_32x32x4i8, amdgcn_mfma_i32_16x16x4i8,
2622 amdgcn_mfma_i32_4x4x4i8, amdgcn_mfma_i32_32x32x8i8,
2623 amdgcn_mfma_i32_16x16x16i8, amdgcn_mfma_f32_32x32x2bf16,
2624 amdgcn_mfma_f32_16x16x2bf16, amdgcn_mfma_f32_4x4x2bf16,
2625 amdgcn_mfma_f32_32x32x4bf16, amdgcn_mfma_f32_16x16x8bf16})
2626 .Any({{DivAnyTy},
2628 !HasGFX90AInsts)
2629 .Any({{DivAnyTy},
2630 {{VgprOrAgprAnyTy},
2632 HasGFX90AInsts);
2633
2634 // gfx90a+ only MFMAs
2635 addRulesForIOpcs(
2636 {
2637 amdgcn_mfma_f32_32x32x4bf16_1k,
2638 amdgcn_mfma_f32_16x16x4bf16_1k,
2639 amdgcn_mfma_f32_4x4x4bf16_1k,
2640 amdgcn_mfma_f32_32x32x8bf16_1k,
2641 amdgcn_mfma_f32_16x16x16bf16_1k,
2642 amdgcn_mfma_f64_16x16x4f64,
2643 amdgcn_mfma_f64_4x4x4f64,
2644 amdgcn_mfma_i32_16x16x32_i8,
2645 amdgcn_mfma_i32_32x32x16_i8,
2646 amdgcn_mfma_f32_16x16x8_xf32,
2647 amdgcn_mfma_f32_32x32x4_xf32,
2648 amdgcn_mfma_f32_16x16x32_bf8_bf8,
2649 amdgcn_mfma_f32_16x16x32_bf8_fp8,
2650 amdgcn_mfma_f32_16x16x32_fp8_bf8,
2651 amdgcn_mfma_f32_16x16x32_fp8_fp8,
2652 amdgcn_mfma_f32_32x32x16_bf8_bf8,
2653 amdgcn_mfma_f32_32x32x16_bf8_fp8,
2654 amdgcn_mfma_f32_32x32x16_fp8_bf8,
2655 amdgcn_mfma_f32_32x32x16_fp8_fp8,
2656 // gfx950
2657 amdgcn_mfma_f32_16x16x32_f16,
2658 amdgcn_mfma_f32_32x32x16_f16,
2659 amdgcn_mfma_i32_16x16x64_i8,
2660 amdgcn_mfma_i32_32x32x32_i8,
2661 // TODO: bf16 variants fail in IRTranslator.
2662 // amdgcn_mfma_f32_16x16x32_bf16, amdgcn_mfma_f32_32x32x16_bf16,
2663 })
2664 .Any({{DivAnyTy},
2665 {{VgprOrAgprAnyTy},
2667
2668 addRulesForIOpcs(
2669 {// gfx942+
2670 amdgcn_smfmac_f32_16x16x32_f16, amdgcn_smfmac_f32_32x32x16_f16,
2671 amdgcn_smfmac_f32_16x16x32_bf16, amdgcn_smfmac_f32_32x32x16_bf16,
2672 amdgcn_smfmac_i32_16x16x64_i8, amdgcn_smfmac_i32_32x32x32_i8,
2673 amdgcn_smfmac_f32_16x16x64_bf8_bf8, amdgcn_smfmac_f32_16x16x64_bf8_fp8,
2674 amdgcn_smfmac_f32_16x16x64_fp8_bf8, amdgcn_smfmac_f32_16x16x64_fp8_fp8,
2675 amdgcn_smfmac_f32_32x32x32_bf8_bf8, amdgcn_smfmac_f32_32x32x32_bf8_fp8,
2676 amdgcn_smfmac_f32_32x32x32_fp8_bf8, amdgcn_smfmac_f32_32x32x32_fp8_fp8,
2677 // gfx950+
2678 amdgcn_smfmac_f32_16x16x64_f16, amdgcn_smfmac_f32_32x32x32_f16,
2679 amdgcn_smfmac_i32_16x16x128_i8, amdgcn_smfmac_i32_32x32x64_i8,
2680 amdgcn_smfmac_f32_16x16x128_bf8_bf8, amdgcn_smfmac_f32_16x16x128_bf8_fp8,
2681 amdgcn_smfmac_f32_16x16x128_fp8_bf8, amdgcn_smfmac_f32_16x16x128_fp8_fp8,
2682 amdgcn_smfmac_f32_32x32x64_bf8_bf8, amdgcn_smfmac_f32_32x32x64_bf8_fp8,
2683 amdgcn_smfmac_f32_32x32x64_fp8_bf8, amdgcn_smfmac_f32_32x32x64_fp8_fp8})
2684 .Any({{DivAnyTy},
2685 {{VgprOrAgprAnyTy},
2687
2688 addRulesForIOpcs({amdgcn_mfma_scale_f32_32x32x64_f8f6f4,
2689 amdgcn_mfma_scale_f32_16x16x128_f8f6f4})
2690 .Any({{DivAnyTy},
2691 {{VgprOrAgprAnyTy},
2693 Vgpr32, Imm, Vgpr32}}});
2694
2695 // WMMA/SWMMAC intrinsics: all register operands map to VGPR.
2696 addRulesForIOpcs(
2697 {// WMMA GFX11+
2698 amdgcn_wmma_f32_16x16x16_f16, amdgcn_wmma_f32_16x16x16_bf16,
2699 amdgcn_wmma_f16_16x16x16_f16, amdgcn_wmma_bf16_16x16x16_bf16,
2700 amdgcn_wmma_f16_16x16x16_f16_tied, amdgcn_wmma_bf16_16x16x16_bf16_tied,
2701 amdgcn_wmma_i32_16x16x16_iu8, amdgcn_wmma_i32_16x16x16_iu4,
2702 // WMMA GFX12
2703 amdgcn_wmma_f32_16x16x16_fp8_fp8, amdgcn_wmma_f32_16x16x16_fp8_bf8,
2704 amdgcn_wmma_f32_16x16x16_bf8_fp8, amdgcn_wmma_f32_16x16x16_bf8_bf8,
2705 amdgcn_wmma_i32_16x16x32_iu4,
2706 // WMMA GFX1250
2707 amdgcn_wmma_f32_16x16x4_f32, amdgcn_wmma_f32_16x16x32_bf16,
2708 amdgcn_wmma_f32_16x16x32_f16, amdgcn_wmma_f16_16x16x32_f16,
2709 amdgcn_wmma_bf16_16x16x32_bf16, amdgcn_wmma_bf16f32_16x16x32_bf16,
2710 amdgcn_wmma_f32_16x16x64_fp8_fp8, amdgcn_wmma_f32_16x16x64_fp8_bf8,
2711 amdgcn_wmma_f32_16x16x64_bf8_fp8, amdgcn_wmma_f32_16x16x64_bf8_bf8,
2712 amdgcn_wmma_f16_16x16x64_fp8_fp8, amdgcn_wmma_f16_16x16x64_fp8_bf8,
2713 amdgcn_wmma_f16_16x16x64_bf8_fp8, amdgcn_wmma_f16_16x16x64_bf8_bf8,
2714 amdgcn_wmma_f16_16x16x128_fp8_fp8, amdgcn_wmma_f16_16x16x128_fp8_bf8,
2715 amdgcn_wmma_f16_16x16x128_bf8_fp8, amdgcn_wmma_f16_16x16x128_bf8_bf8,
2716 amdgcn_wmma_f32_16x16x128_fp8_fp8, amdgcn_wmma_f32_16x16x128_fp8_bf8,
2717 amdgcn_wmma_f32_16x16x128_bf8_fp8, amdgcn_wmma_f32_16x16x128_bf8_bf8,
2718 amdgcn_wmma_i32_16x16x64_iu8, amdgcn_wmma_f32_16x16x128_f8f6f4,
2719 amdgcn_wmma_scale_f32_16x16x128_f8f6f4,
2720 amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, amdgcn_wmma_f32_32x16x128_f4,
2721 amdgcn_wmma_scale_f32_32x16x128_f4, amdgcn_wmma_scale16_f32_32x16x128_f4,
2722 // WMMA GFX1251
2723 amdgcn_wmma_f64_16x16x4_f64,
2724 // SWMMAC GFX12
2725 amdgcn_swmmac_f32_16x16x32_f16, amdgcn_swmmac_f32_16x16x32_bf16,
2726 amdgcn_swmmac_f16_16x16x32_f16, amdgcn_swmmac_bf16_16x16x32_bf16,
2727 amdgcn_swmmac_i32_16x16x32_iu8, amdgcn_swmmac_i32_16x16x32_iu4,
2728 amdgcn_swmmac_i32_16x16x64_iu4, amdgcn_swmmac_f32_16x16x32_fp8_fp8,
2729 amdgcn_swmmac_f32_16x16x32_fp8_bf8, amdgcn_swmmac_f32_16x16x32_bf8_fp8,
2730 amdgcn_swmmac_f32_16x16x32_bf8_bf8,
2731 // SWMMAC GFX1250
2732 amdgcn_swmmac_f32_16x16x64_f16, amdgcn_swmmac_f32_16x16x64_bf16,
2733 amdgcn_swmmac_f16_16x16x64_f16, amdgcn_swmmac_bf16_16x16x64_bf16,
2734 amdgcn_swmmac_bf16f32_16x16x64_bf16, amdgcn_swmmac_f32_16x16x128_fp8_fp8,
2735 amdgcn_swmmac_f32_16x16x128_fp8_bf8, amdgcn_swmmac_f32_16x16x128_bf8_fp8,
2736 amdgcn_swmmac_f32_16x16x128_bf8_bf8, amdgcn_swmmac_f16_16x16x128_fp8_fp8,
2737 amdgcn_swmmac_f16_16x16x128_fp8_bf8, amdgcn_swmmac_f16_16x16x128_bf8_fp8,
2738 amdgcn_swmmac_f16_16x16x128_bf8_bf8, amdgcn_swmmac_i32_16x16x128_iu8})
2739 .Any({{}, {{}, {}, ApplyAllVgpr}});
2740
2741} // end initialize rules
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU address space definition.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
constexpr LLT S16
constexpr LLT S1
constexpr LLT V2S16
constexpr LLT S32
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT S64
constexpr LLT V2S32
constexpr LLT S128
UniformityLLTOpPredicateID LLTToBId(LLT Ty)
bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, const MachineUniformityInfo &MUI, const MachineRegisterInfo &MRI)
UniformityLLTOpPredicateID LLTToId(LLT Ty)
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic operations.
#define _
IRTranslator LLVM IR MI
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
bool operator()(const MachineInstr &MI) const
Predicate operator||(const Predicate &RHS) const
Predicate operator&&(const Predicate &RHS) const
Predicate(std::function< bool(const MachineInstr &)> Pred)
Predicate operator!() const
RegBankLegalizeRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
const SetOfRulesForOpcode * getRulesForOpc(MachineInstr &MI) const
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
void addFastRuleDivergent(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs)
void addFastRuleUniform(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs)
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
bool isSigned() const
Definition InstrTypes.h:993
bool isDivergentAtDef(ConstValueRefT V) const
Whether V is divergent at its definition.
bool isUniformAtDef(ConstValueRefT V) const
Whether V is uniform/non-divergent at its definition.
bool isEquality() const
Return true if this predicate is either EQ or NE.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
TypeSize getValue() const
Representation of each machine instruction.
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const TargetRegisterInfo * getTargetRegisterInfo() const
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
bool isAnyPtr(LLT Ty, unsigned Width)
bool isUniformMMO(const MachineMemOperand *MMO)
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the start of an entry function to this load.
Definition SIInstrInfo.h:44
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
SmallVector< UniformityLLTOpPredicateID, 4 > OpUniformityAndTypes
PredicateMapping(std::initializer_list< UniformityLLTOpPredicateID > OpList, std::function< bool(const MachineInstr &)> TestFunc=nullptr)
bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI, const MachineRegisterInfo &MRI) const
std::function< bool(const MachineInstr &)> TestFunc
RegBankLLTMapping(std::initializer_list< RegBankLLTMappingApplyID > DstOpMappingList, std::initializer_list< RegBankLLTMappingApplyID > SrcOpMappingList, LoweringMethodID LoweringMethod=DoNotLower)
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39