//===-- AMDGPURegBankLegalizeRules.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Definitions of RegBankLegalize Rules for all opcodes.
/// Implementation of container for all the Rules and search.
/// Fast search for most common case when Rule.Predicate checks LLT and
/// uniformity of register in operand 0.
//
//===----------------------------------------------------------------------===//

#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"

using namespace llvm;
using namespace AMDGPU;

bool AMDGPU::isAnyPtr(LLT Ty, unsigned Width) {
  return Ty.isPointer() && Ty.getSizeInBits() == Width;
}
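
// For example, isAnyPtr(LLT::pointer(3, 32), 32) and
// isAnyPtr(LLT::pointer(5, 32), 32) both hold: the address space is ignored,
// only the pointer width has to match.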

RegBankLLTMapping::RegBankLLTMapping(
    std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList,
    std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList,
    LoweringMethodID LoweringMethod)
    : DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList),
      LoweringMethod(LoweringMethod) {}

PredicateMapping::PredicateMapping(
    std::initializer_list<UniformityLLTOpPredicateID> OpList,
    std::function<bool(const MachineInstr &)> TestFunc)
    : OpUniformityAndTypes(OpList), TestFunc(TestFunc) {}

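// Match a single virtual register against a UniformityLLTOpPredicateID:
// plain IDs (S32, P1, B64, ...) only check the LLT, while the Uni*/Div*
// variants additionally require the register to be uniform or divergent
// according to MachineUniformityInfo.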
bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
                           const MachineUniformityInfo &MUI,
                           const MachineRegisterInfo &MRI) {
  switch (UniID) {
  case S1:
    return MRI.getType(Reg) == LLT::scalar(1);
  case S16:
    return MRI.getType(Reg) == LLT::scalar(16);
  case S32:
    return MRI.getType(Reg) == LLT::scalar(32);
  case S64:
    return MRI.getType(Reg) == LLT::scalar(64);
  case S128:
    return MRI.getType(Reg) == LLT::scalar(128);
  case P0:
    return MRI.getType(Reg) == LLT::pointer(0, 64);
  case P1:
    return MRI.getType(Reg) == LLT::pointer(1, 64);
  case P2:
    return MRI.getType(Reg) == LLT::pointer(2, 32);
  case P3:
    return MRI.getType(Reg) == LLT::pointer(3, 32);
  case P4:
    return MRI.getType(Reg) == LLT::pointer(4, 64);
  case P5:
    return MRI.getType(Reg) == LLT::pointer(5, 32);
  case P8:
    return MRI.getType(Reg) == LLT::pointer(8, 128);
  case Ptr32:
    return isAnyPtr(MRI.getType(Reg), 32);
  case Ptr64:
    return isAnyPtr(MRI.getType(Reg), 64);
  case Ptr128:
    return isAnyPtr(MRI.getType(Reg), 128);
  case V2S16:
    return MRI.getType(Reg) == LLT::fixed_vector(2, 16);
  case V2S32:
    return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
  case V3S32:
    return MRI.getType(Reg) == LLT::fixed_vector(3, 32);
  case V4S32:
    return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
  case B32:
    return MRI.getType(Reg).getSizeInBits() == 32;
  case B64:
    return MRI.getType(Reg).getSizeInBits() == 64;
  case B96:
    return MRI.getType(Reg).getSizeInBits() == 96;
  case B128:
    return MRI.getType(Reg).getSizeInBits() == 128;
  case B160:
    return MRI.getType(Reg).getSizeInBits() == 160;
  case B256:
    return MRI.getType(Reg).getSizeInBits() == 256;
  case B512:
    return MRI.getType(Reg).getSizeInBits() == 512;
  case UniS1:
    return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg);
  case UniS16:
    return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg);
  case UniS32:
    return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg);
  case UniS64:
    return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg);
  case UniS128:
    return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniform(Reg);
  case UniP0:
    return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg);
  case UniP1:
    return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg);
  case UniP2:
    return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniform(Reg);
  case UniP3:
    return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniform(Reg);
  case UniP4:
    return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
  case UniP5:
    return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
  case UniP8:
    return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg);
  case UniPtr32:
    return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg);
  case UniPtr64:
    return isAnyPtr(MRI.getType(Reg), 64) && MUI.isUniform(Reg);
  case UniPtr128:
    return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg);
  case UniV2S16:
    return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
  case UniV2S32:
    return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg);
  case UniB32:
    return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
  case UniB64:
    return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniform(Reg);
  case UniB96:
    return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniform(Reg);
  case UniB128:
    return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniform(Reg);
  case UniB160:
    return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniform(Reg);
  case UniB256:
    return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg);
  case UniB512:
    return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg);
  case UniBRC: {
    if (!MUI.isUniform(Reg))
      return false;
    // Check if there is an SGPR register class of the same size as the LLT.
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    // There is no 16-bit SGPR register class. The extra size check is required
    // since getSGPRClassForBitWidth returns SReg_32RegClass for size 16.
    unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
    return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize);
  }
  case DivS1:
    return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg);
  case DivS16:
    return MRI.getType(Reg) == LLT::scalar(16) && MUI.isDivergent(Reg);
  case DivS32:
    return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg);
  case DivS64:
    return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg);
  case DivS128:
    return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergent(Reg);
  case DivP0:
    return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg);
  case DivP1:
    return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg);
  case DivP2:
    return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergent(Reg);
  case DivP3:
    return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergent(Reg);
  case DivP4:
    return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
  case DivP5:
    return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
  case DivPtr32:
    return isAnyPtr(MRI.getType(Reg), 32) && MUI.isDivergent(Reg);
  case DivPtr64:
    return isAnyPtr(MRI.getType(Reg), 64) && MUI.isDivergent(Reg);
  case DivPtr128:
    return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg);
  case DivV2S16:
    return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
  case DivV2S32:
    return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg);
  case DivV3S32:
    return MRI.getType(Reg) == LLT::fixed_vector(3, 32) && MUI.isDivergent(Reg);
  case DivV4S16:
    return MRI.getType(Reg) == LLT::fixed_vector(4, 16) && MUI.isDivergent(Reg);
  case DivV6S32:
    return MRI.getType(Reg) == LLT::fixed_vector(6, 32) && MUI.isDivergent(Reg);
  case DivB32:
    return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
  case DivB64:
    return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergent(Reg);
  case DivB96:
    return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergent(Reg);
  case DivB128:
    return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergent(Reg);
  case DivB160:
    return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergent(Reg);
  case DivB256:
    return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg);
  case DivB512:
    return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg);
  case DivBRC: {
    if (!MUI.isDivergent(Reg))
      return false;
    // Check if there is a VGPR register class of the same size as the LLT.
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    return TRI->getVGPRClassForBitWidth(MRI.getType(Reg).getSizeInBits());
  }
  case BRC: {
    // Check if there are SGPR and VGPR register classes of the same size as
    // the LLT.
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
    return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize) &&
           TRI->getVGPRClassForBitWidth(LLTSize);
  }
  case _:
    return true;
  default:
    llvm_unreachable("missing matchUniformityAndLLT");
  }
}
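
// For example, matchUniformityAndLLT(Reg, UniS32, MUI, MRI) is equivalent to
// MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg), while the B*
// variants accept any type of the given size, including pointers and short
// vectors.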

bool PredicateMapping::match(const MachineInstr &MI,
                             const MachineUniformityInfo &MUI,
                             const MachineRegisterInfo &MRI) const {
  // Check LLT signature.
  for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if (OpUniformityAndTypes[i] == _) {
      assert((!MI.getOperand(i).isReg() ||
              !MI.getOperand(i).getReg().isVirtual()) &&
             "_ is for non-register and physical register operands only");
      continue;
    }

    // Remaining IDs check registers.
    if (!MO.isReg())
      return false;

    if (!matchUniformityAndLLT(MO.getReg(), OpUniformityAndTypes[i], MUI, MRI))
      return false;
  }

  // More complex check.
  if (TestFunc)
    return TestFunc(MI);

  return true;
}
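
// For example, the predicate {DivS32, S64} used for G_FPTOSI below matches an
// instruction whose operand 0 is a divergent s32 def and whose operand 1 is
// any s64 virtual register.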

SetOfRulesForOpcode::SetOfRulesForOpcode() {}

SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes)
    : FastTypes(FastTypes) {}

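// Map an LLT to the UniformityLLTOpPredicateID used to pick a fast-rule slot
// for the Standard and Vector fast-rule types.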
UniformityLLTOpPredicateID LLTToId(LLT Ty) {
  if (Ty == LLT::scalar(16))
    return S16;
  if (Ty == LLT::scalar(32))
    return S32;
  if (Ty == LLT::scalar(64))
    return S64;
  if (Ty == LLT::fixed_vector(2, 16))
    return V2S16;
  if (Ty == LLT::fixed_vector(2, 32))
    return V2S32;
  if (Ty == LLT::fixed_vector(3, 32))
    return V3S32;
  if (Ty == LLT::fixed_vector(4, 32))
    return V4S32;
  return _;
}

UniformityLLTOpPredicateID LLTToBId(LLT Ty) {
  if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
      isAnyPtr(Ty, 32))
    return B32;
  if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
      Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
    return B64;
  if (Ty == LLT::fixed_vector(3, 32))
    return B96;
  if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) ||
      Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128))
    return B128;
  return _;
}
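
// LLTToBId folds all types of the same size into a single B* ID: s32, v2s16
// and 32-bit pointers all map to B32, for example. This is what the StandardB
// fast rules key on.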

const RegBankLLTMapping *
SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI,
                                      const MachineRegisterInfo &MRI,
                                      const MachineUniformityInfo &MUI) const {
  // Search in "Fast Rules".
  // Note: if fast rules are enabled, a RegBankLLTMapping must be added to each
  // slot that could match the fast predicate. If not, an invalid mapping is
  // returned, which results in failure; the "Slow Rules" are not searched.
  if (FastTypes != NoFastRules) {
    Register Reg = MI.getOperand(0).getReg();
    int Slot;
    if (FastTypes == StandardB)
      Slot = getFastPredicateSlot(LLTToBId(MRI.getType(Reg)));
    else
      Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));

    if (Slot != -1)
      return MUI.isUniform(Reg) ? &Uni[Slot] : &Div[Slot];
  }

  // Slow search for more complex rules.
  for (const RegBankLegalizeRule &Rule : Rules) {
    if (Rule.Predicate.match(MI, MUI, MRI))
      return &Rule.OperandMapping;
  }

  return nullptr;
}
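
// For example, for a divergent G_ADD with an s32 result in a set with
// Standard fast rules, LLTToId returns S32, getFastPredicateSlot returns
// slot 0, and Div[0] is used without scanning the slow Rules vector.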

void SetOfRulesForOpcode::addRule(RegBankLegalizeRule Rule) {
  Rules.push_back(Rule);
}

void SetOfRulesForOpcode::addFastRuleDivergent(UniformityLLTOpPredicateID Ty,
                                               RegBankLLTMapping RuleApplyIDs) {
  int Slot = getFastPredicateSlot(Ty);
  assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
  Div[Slot] = std::move(RuleApplyIDs);
}

void SetOfRulesForOpcode::addFastRuleUniform(UniformityLLTOpPredicateID Ty,
                                             RegBankLLTMapping RuleApplyIDs) {
  int Slot = getFastPredicateSlot(Ty);
  assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
  Uni[Slot] = std::move(RuleApplyIDs);
}

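// Map a fast-predicate ID to its slot in the Uni/Div arrays, or return -1
// when the type is not covered by this set's FastRulesTypes.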
int SetOfRulesForOpcode::getFastPredicateSlot(
    UniformityLLTOpPredicateID Ty) const {
  switch (FastTypes) {
  case Standard: {
    switch (Ty) {
    case S32:
      return 0;
    case S16:
      return 1;
    case S64:
      return 2;
    case V2S16:
      return 3;
    default:
      return -1;
    }
  }
  case StandardB: {
    switch (Ty) {
    case B32:
      return 0;
    case B64:
      return 1;
    case B96:
      return 2;
    case B128:
      return 3;
    default:
      return -1;
    }
  }
  case Vector: {
    switch (Ty) {
    case S32:
      return 0;
    case V2S32:
      return 1;
    case V3S32:
      return 2;
    case V4S32:
      return 3;
    default:
      return -1;
    }
  }
  default:
    return -1;
  }
}

RegBankLegalizeRules::RuleSetInitializer
RegBankLegalizeRules::addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
                                       FastRulesTypes FastTypes) {
  return RuleSetInitializer(OpcList, GRulesAlias, GRules, FastTypes);
}

RegBankLegalizeRules::RuleSetInitializer
RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
                                       FastRulesTypes FastTypes) {
  return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes);
}

const SetOfRulesForOpcode *
RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT ||
      Opc == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS ||
      Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
    unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
    auto IRAIt = IRulesAlias.find(IntrID);
    if (IRAIt == IRulesAlias.end())
      return nullptr;
    return &IRules.at(IRAIt->second);
  }

  auto GRAIt = GRulesAlias.find(Opc);
  if (GRAIt == GRulesAlias.end())
    return nullptr;
  return &GRules.at(GRAIt->second);
}

// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'.
class Predicate {
private:
  struct Elt {
    // Save formula composed of Pred, '&&', '||' and '!' as a jump table.
    // Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
    // Sequences of && and || will be represented by jumps, for example:
    // (A && B && ... X) or (A && B && ... X) || Y
    //   A == true jump to B
    //   A == false jump to end or Y, result is A(false) or Y
    // (A || B || ... X) or (A || B || ... X) && Y
    //   A == true jump to end or Y, result is A(true) or Y
    //   A == false jump to B
    // Notice that when negating expression, we simply flip Neg on each Pred
    // and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
    std::function<bool(const MachineInstr &)> Pred;
    bool Neg; // Neg of Pred is calculated before jump
    unsigned TJumpOffset;
    unsigned FJumpOffset;
  };

  SmallVector<Elt, 8> Expression;

  Predicate(SmallVectorImpl<Elt> &&Expr) { Expression.swap(Expr); }

public:
  Predicate(std::function<bool(const MachineInstr &)> Pred) {
    Expression.push_back({Pred, false, 1, 1});
  }

  bool operator()(const MachineInstr &MI) const {
    unsigned Idx = 0;
    unsigned ResultIdx = Expression.size();
    bool Result;
    do {
      Result = Expression[Idx].Pred(MI);
      Result = Expression[Idx].Neg ? !Result : Result;
      if (Result) {
        Idx += Expression[Idx].TJumpOffset;
      } else {
        Idx += Expression[Idx].FJumpOffset;
      }
    } while (Idx != ResultIdx);

    return Result;
  }

  Predicate operator!() const {
    SmallVector<Elt, 8> NegExpression;
    for (const Elt &ExprElt : Expression) {
      NegExpression.push_back({ExprElt.Pred, !ExprElt.Neg, ExprElt.FJumpOffset,
                               ExprElt.TJumpOffset});
    }
    return Predicate(std::move(NegExpression));
  }

  Predicate operator&&(const Predicate &RHS) const {
    SmallVector<Elt, 8> AndExpression = Expression;

    unsigned RHSSize = RHS.Expression.size();
    unsigned ResultIdx = Expression.size();
    for (unsigned i = 0; i < ResultIdx; ++i) {
      // LHS results in false, whole expression results in false.
      if (i + AndExpression[i].FJumpOffset == ResultIdx)
        AndExpression[i].FJumpOffset += RHSSize;
    }

    AndExpression.append(RHS.Expression);

    return Predicate(std::move(AndExpression));
  }

  Predicate operator||(const Predicate &RHS) const {
    SmallVector<Elt, 8> OrExpression = Expression;

    unsigned RHSSize = RHS.Expression.size();
    unsigned ResultIdx = Expression.size();
    for (unsigned i = 0; i < ResultIdx; ++i) {
      // LHS results in true, whole expression results in true.
      if (i + OrExpression[i].TJumpOffset == ResultIdx)
        OrExpression[i].TJumpOffset += RHSSize;
    }

    OrExpression.append(RHS.Expression);

    return Predicate(std::move(OrExpression));
  }
};
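
// Example composition, as used for the scalar-load predicate further below:
//   auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
//               (isConst || isInvMMO || isNoClobberMMO);
// The overloaded operators flatten everything into a single jump table, so
// evaluating the combined predicate is one left-to-right pass with
// short-circuit jumps instead of a tree of nested std::function calls.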

// Initialize rules.
RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
                                           MachineRegisterInfo &_MRI)
    : ST(&_ST), MRI(&_MRI) {

  addRulesForGOpcs({G_ADD, G_SUB}, Standard)
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});

  addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
      .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});

  addRulesForGOpcs({G_UADDE, G_USUBE, G_SADDE, G_SSUBE}, Standard)
      .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}})
      .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});

  addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard)
      .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});

  bool HasVecMulU64 = ST->hasVectorMulU64();
  addRulesForGOpcs({G_MUL}, Standard)
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Uni(S64, {{SgprB64}, {SgprB64, SgprB64}})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
      .Div(S64, {{VgprB64}, {VgprB64, VgprB64}}, HasVecMulU64)
      .Div(S64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32Mul}, !HasVecMulU64);

  bool hasMulHi = ST->hasScalarMulHiInsts();
  addRulesForGOpcs({G_UMULH, G_SMULH}, Standard)
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasMulHi)
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasMulHi);

  addRulesForGOpcs({G_AMDGPU_MAD_U64_U32}, Standard)
      .Div(S64, {{Vgpr64, Vcc}, {Vgpr32, Vgpr32, Vgpr64}})
      .Uni(S64, {{UniInVgprS64, UniInVcc}, {Vgpr32, Vgpr32, Vgpr64}});

  bool HasScalarSMulU64 = ST->hasScalarSMulU64();
  addRulesForGOpcs({G_AMDGPU_S_MUL_U64_U32, G_AMDGPU_S_MUL_I64_I32}, Standard)
      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}, UniMul64}, HasScalarSMulU64)
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, DivSMulToMAD});

  addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
      .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}})
      .Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
      .Any({{UniS16}, {{Sgpr16}, {Sgpr16, Sgpr16}}})
      .Any({{DivS16}, {{Vgpr16}, {Vgpr16, Vgpr16}}})
      .Uni(B32, {{SgprB32}, {SgprB32, SgprB32}})
      .Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
      .Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
      .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});

  addRulesForGOpcs({G_SHL}, Standard)
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});

  addRulesForGOpcs({G_LSHR}, Standard)
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});

  addRulesForGOpcs({G_ASHR}, Standard)
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackBitShift})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});

  addRulesForGOpcs({G_FSHR}, Standard)
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});

  addRulesForGOpcs({G_BSWAP}, Standard)
      .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
      .Div(S16, {{Vgpr16}, {Vgpr16}})
      .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32}})
      .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16}});

  addRulesForGOpcs({G_AMDGPU_CVT_F32_UBYTE0, G_AMDGPU_CVT_F32_UBYTE1,
                    G_AMDGPU_CVT_F32_UBYTE2, G_AMDGPU_CVT_F32_UBYTE3,
                    G_AMDGPU_RCP_IFLAG},
                   Standard)
      .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32}});

  addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});

  addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, S_BFE})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
      .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE});

  addRulesForGOpcs({G_SMIN, G_SMAX}, Standard)
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});

  addRulesForGOpcs({G_UMIN, G_UMAX}, Standard)
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});

  addRulesForGOpcs({G_IMPLICIT_DEF})
      .Any({{UniS1}, {{Sgpr32Trunc}, {}}})
      .Any({{UniS16}, {{Sgpr16}, {}}})
      .Any({{UniBRC}, {{SgprBRC}, {}}});

  addRulesForGOpcs({G_CONSTANT}, Standard)
      .Any({{UniS1, _}, {{Sgpr32Trunc}, {}, UniCstExt}})
      .Uni(S16, {{Sgpr16}, {}})
      .Uni(S32, {{Sgpr32}, {}})
      .Uni(S64, {{Sgpr64}, {}})
      .Any({{UniPtr32, _}, {{SgprPtr32}, {}}})
      .Any({{UniPtr64, _}, {{SgprPtr64}, {}}});

  addRulesForGOpcs({G_FCONSTANT}, Standard)
      .Uni(S16, {{Sgpr16}, {}})
      .Uni(S32, {{Sgpr32}, {}})
      .Uni(S64, {{Sgpr64}, {}});

  addRulesForGOpcs({G_FREEZE})
      .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt}}})
      .Any({{DivS1}, {{Vcc}, {Vcc}}})
      .Any({{UniS16}, {{Sgpr16}, {Sgpr16}}})
      .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
      .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});

  addRulesForGOpcs({G_BITCAST})
      .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
      .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});

  addRulesForGOpcs({G_UNMERGE_VALUES})
      .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}})
      .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
      .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});

  addRulesForGOpcs({G_PHI})
      .Any({{UniS1}, {{}, {}, AextToS32InIncomingBlockGPHI}})
      .Any({{UniS16}, {{}, {}, VerifyAllSgprGPHI}})
      .Any({{UniBRC}, {{}, {}, VerifyAllSgprGPHI}})
      .Any({{DivBRC}, {{}, {}, VerifyAllSgprOrVgprGPHI}});

  addRulesForGOpcs({G_EXTRACT_VECTOR_ELT})
      .Any({{UniB32, UniBRC, UniS32}, {{SgprB32}, {SgprBRC, Sgpr32}}})
      .Any({{DivB32, DivBRC, UniS32}, {{VgprB32}, {VgprBRC, Sgpr32}}})
      .Any({{DivB32, BRC, DivS32},
            {{VgprB32}, {VgprBRC, Vgpr32}}})
      .Any({{UniB64, UniBRC, UniS32}, {{SgprB64}, {SgprBRC, Sgpr32}}})
      .Any({{DivB64, DivBRC, UniS32},
            {{VgprB64}, {VgprBRC, Sgpr32}}})
      .Any({{DivB64, BRC, DivS32},
            {{VgprB64}, {VgprBRC, Vgpr32}}});

  addRulesForGOpcs({G_INSERT_VECTOR_ELT})
      .Any({{UniBRC, UniBRC, B32, UniS32},
            {{SgprBRC}, {SgprBRC, SgprB32, Sgpr32}}})
      .Any(
          {{DivBRC, BRC, B32, UniS32}, {{VgprBRC}, {VgprBRC, VgprB32, Sgpr32}}})
      .Any({{DivBRC, BRC, B32, DivS32},
            {{VgprBRC}, {VgprBRC, VgprB32, Vgpr32}}})
      .Any({{UniBRC, UniBRC, B64, UniS32},
            {{SgprBRC}, {SgprBRC, SgprB64, Sgpr32}}})
      .Any({{DivBRC, BRC, B64, UniS32},
            {{VgprBRC}, {VgprBRC, VgprB64, Sgpr32}}})
      .Any({{DivBRC, BRC, B64, DivS32},
            {{VgprBRC}, {VgprBRC, VgprB64, Vgpr32}}});

  // LOAD {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
  // LOAD {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
  // LOAD_NORET {}, {{}, {Imm, VgprSrc, ..., Sgpr_WF_RsrcIdx}}
  // STORE {}, {{}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
  addRulesForGOpcs({G_AMDGPU_INTRIN_IMAGE_LOAD, G_AMDGPU_INTRIN_IMAGE_LOAD_D16,
                    G_AMDGPU_INTRIN_IMAGE_LOAD_NORET,
                    G_AMDGPU_INTRIN_IMAGE_STORE,
                    G_AMDGPU_INTRIN_IMAGE_STORE_D16})
      .Any({{}, {{}, {}, ApplyINTRIN_IMAGE}});

  Predicate isSignedICmp([](const MachineInstr &MI) -> bool {
    auto Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    return CmpInst::isSigned(Pred);
  });

  Predicate isEqualityICmp([](const MachineInstr &MI) -> bool {
    auto Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
    return ICmpInst::isEquality(Pred);
  });

  bool HasScalarCompareEq64 = ST->hasScalarCompareEq64();
  // clang-format off
  addRulesForGOpcs({G_ICMP})
      .Any({{{UniS1, _, S16}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
      .Any({{{UniS1, _, S16}, !isEqualityICmp && isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32SExt, Sgpr32SExt}}})
      .Any({{{UniS1, _, S16}, !isEqualityICmp && !isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
      .Any({{{DivS1, _, S16}}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
      .Any({{{UniS1, _, S32}}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
      .Any({{{DivS1, _, S32}}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
      .Any({{{UniS1, _, S64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr64, Sgpr64}}}, HasScalarCompareEq64)
      .Any({{{UniS1, _, S64}, isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}, !HasScalarCompareEq64)
      .Any({{{UniS1, _, S64}, !isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
      .Any({{{DivS1, _, S64}}, {{Vcc}, {None, Vgpr64, Vgpr64}}})
      .Any({{{UniS1, _, Ptr32}}, {{Sgpr32Trunc}, {None, SgprPtr32, SgprPtr32}}})
      .Any({{{DivS1, _, Ptr32}}, {{Vcc}, {None, VgprPtr32, VgprPtr32}}})
      .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, SgprPtr64, SgprPtr64}}}, HasScalarCompareEq64)
      .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}, !HasScalarCompareEq64)
      .Any({{{UniS1, _, Ptr64}, !isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}})
      .Any({{{DivS1, _, Ptr64}}, {{Vcc}, {None, VgprPtr64, VgprPtr64}}});
  // clang-format on

  addRulesForGOpcs({G_BRCOND})
      .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}})
      .Any({{DivS1}, {{}, {Vcc}}});

  addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}});

  addRulesForGOpcs({G_SELECT}, StandardB)
      .Any({{DivS16}, {{Vgpr16}, {Vcc, Vgpr16, Vgpr16}}})
      .Any({{UniS16}, {{Sgpr16}, {Sgpr32AExtBoolInReg, Sgpr16, Sgpr16}}})
      .Div(B32, {{VgprB32}, {Vcc, VgprB32, VgprB32}})
      .Uni(B32, {{SgprB32}, {Sgpr32AExtBoolInReg, SgprB32, SgprB32}})
      .Div(B64, {{VgprB64}, {Vcc, VgprB64, VgprB64}, SplitTo32Select})
      .Uni(B64, {{SgprB64}, {Sgpr32AExtBoolInReg, SgprB64, SgprB64}});

  addRulesForGOpcs({G_ANYEXT})
      .Any({{UniS16, S1}, {{None}, {None}}}) // should be combined away
      .Any({{UniS32, S1}, {{None}, {None}}}) // should be combined away
      .Any({{UniS64, S1}, {{None}, {None}}}) // should be combined away
      .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
      .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
      .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
      .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
      .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
      .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
      .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});

  bool Has16bitCmp = ST->has16BitInsts();

  // In GlobalISel, in-reg G_TRUNC is treated as a no-op and is instruction
  // selected into a COPY; it is up to the user to deal with the truncated
  // bits. S1, S16, S32 and S64 results are handled with specific rules.
  // Remaining (result, source) pairs with valid register classes are covered
  // by the generic UniBRC/DivBRC wildcard rules.
  addRulesForGOpcs({G_TRUNC})
      .Any({{UniS1, UniS16}, {{None}, {None}}}) // should be combined away
      .Any({{UniS1, UniS32}, {{None}, {None}}}) // should be combined away
      .Any({{UniS1, UniS64}, {{None}, {None}}}) // should be combined away
      .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}})
      .Any({{UniBRC, UniBRC}, {{SgprBRC}, {SgprBRC}}})
      .Any({{DivBRC, DivBRC}, {{VgprBRC}, {VgprBRC}}})
      .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
      .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
      // This is non-trivial. VgprToVccCopy is done using a compare instruction.
      .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}, Has16bitCmp)
      .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr32AExt}, VgprToVccCopy}},
           !Has16bitCmp)
      .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
      .Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}});

  addRulesForGOpcs({G_ZEXT})
      .Any({{UniS16, S1}, {{Sgpr32Trunc}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{UniS64, S1}, {{Sgpr64}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
      .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
      .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
      .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
      .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
      // Not extending S16 to S32 first is questionable.
      .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32ZExt}, Ext32To64}})
      .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32ZExt}, Ext32To64}})
      .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
      .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});

  addRulesForGOpcs({G_SEXT})
      .Any({{UniS16, S1}, {{Sgpr32Trunc}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{UniS64, S1}, {{Sgpr64}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
      .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
      .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
      .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
      .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
      .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
      // Not extending S16 to S32 first is questionable.
      .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32SExt}, Ext32To64}})
      .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32SExt}, Ext32To64}})
      .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
      .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});

  addRulesForGOpcs({G_SEXT_INREG})
      .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
      .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
      .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
      .Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SplitTo32SExtInReg}});

  addRulesForGOpcs({G_ASSERT_ZEXT, G_ASSERT_SEXT}, Standard)
      .Uni(S32, {{Sgpr32}, {Sgpr32, Imm}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Imm}})
      .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Imm}});

  addRulesForGOpcs({G_ASSERT_ALIGN}, Standard)
      .Uni(S32, {{Sgpr32}, {Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32}})
      .Uni(S64, {{Sgpr64}, {Sgpr64}})
      .Div(S64, {{Vgpr64}, {Vgpr64}})
      .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32}}})
      .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32}}})
      .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64}}})
      .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64}}});

  // Atomic read-modify-write operations: result and value are always VGPR,
  // pointer varies by address space.
  addRulesForGOpcs({G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_XCHG,
                    G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
                    G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN,
                    G_ATOMICRMW_UMAX, G_ATOMICRMW_UINC_WRAP,
                    G_ATOMICRMW_UDEC_WRAP, G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
      .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
      .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
      .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
      .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
      .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
      .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}})

  bool HasAtomicFlatPkAdd16Insts = ST->hasAtomicFlatPkAdd16Insts();
  bool HasAtomicBufferGlobalPkAddF16Insts =
      ST->hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST->hasAtomicBufferGlobalPkAddF16Insts();
  bool HasAtomicDsPkAdd16Insts = ST->hasAtomicDsPkAdd16Insts();
  addRulesForGOpcs({G_ATOMICRMW_FADD})
      .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
      .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
      .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
      .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
      .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
      .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}})
      .Any({{DivV2S16, P0, V2S16}, {{VgprV2S16}, {VgprP0, VgprV2S16}}},
           HasAtomicFlatPkAdd16Insts)
      .Any({{DivV2S16, P1, V2S16}, {{VgprV2S16}, {VgprP1, VgprV2S16}}},
           HasAtomicBufferGlobalPkAddF16Insts)
      .Any({{DivV2S16, P3, V2S16}, {{VgprV2S16}, {VgprP3, VgprV2S16}}},
           HasAtomicDsPkAdd16Insts);

  addRulesForGOpcs({G_ATOMIC_CMPXCHG})
      .Any({{DivS32, P2}, {{Vgpr32}, {VgprP2, Vgpr32, Vgpr32}}})
      .Any({{DivS64, P2}, {{Vgpr64}, {VgprP2, Vgpr64, Vgpr64}}})
      .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32, Vgpr32}}})
      .Any({{DivS64, P3}, {{Vgpr64}, {VgprP3, Vgpr64, Vgpr64}}});

  addRulesForGOpcs({G_AMDGPU_ATOMIC_CMPXCHG})
      .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, VgprV2S32}}})
      .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, VgprV2S32}}})
      .Any({{DivS64, P0}, {{Vgpr64}, {VgprP0, VgprV2S64}}})
      .Any({{DivS64, P1}, {{Vgpr64}, {VgprP1, VgprV2S64}}});

  addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_CMPSWAP}, Standard)
      .Div(S32, {{Vgpr32},
                 {Vgpr32, Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Div(S64, {{Vgpr64},
                 {Vgpr64, Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});

  addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_ADD, G_AMDGPU_BUFFER_ATOMIC_AND,
                    G_AMDGPU_BUFFER_ATOMIC_DEC, G_AMDGPU_BUFFER_ATOMIC_FMAX,
                    G_AMDGPU_BUFFER_ATOMIC_FMIN, G_AMDGPU_BUFFER_ATOMIC_INC,
                    G_AMDGPU_BUFFER_ATOMIC_OR, G_AMDGPU_BUFFER_ATOMIC_SMAX,
                    G_AMDGPU_BUFFER_ATOMIC_SMIN, G_AMDGPU_BUFFER_ATOMIC_SUB,
                    G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX,
                    G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_XOR},
                   Standard)
      .Div(S32, {{Vgpr32}, {Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Div(S64, {{Vgpr64}, {Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});

  bool hasSMRDx3 = ST->hasScalarDwordx3Loads();
  bool hasSMRDSmall = ST->hasScalarSubwordLoads();
  bool usesTrue16 = ST->useRealTrue16Insts();

  Predicate isAlign16([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->getAlign() >= Align(16);
  });

  Predicate isAlign4([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->getAlign() >= Align(4);
  });

  Predicate isAtomicMMO([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->isAtomic();
  });

  Predicate isUniMMO([](const MachineInstr &MI) -> bool {
    return AMDGPU::isUniformMMO(*MI.memoperands_begin());
  });

  Predicate isConst([](const MachineInstr &MI) -> bool {
    // The address space in the MMO can be different than the address space on
    // the pointer.
    const MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned AS = MMO->getAddrSpace();
    return AS == AMDGPUAS::CONSTANT_ADDRESS ||
           AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  });

  Predicate isVolatileMMO([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->isVolatile();
  });

  Predicate isInvMMO([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->isInvariant();
  });

  Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool {
    return (*MI.memoperands_begin())->getFlags() & MONoClobber;
  });

  Predicate isNaturalAligned([](const MachineInstr &MI) -> bool {
    const MachineMemOperand *MMO = *MI.memoperands_begin();
    return MMO->getAlign() >= Align(MMO->getSize().getValue());
  });

  Predicate is8Or16BitMMO([](const MachineInstr &MI) -> bool {
    const MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize().getValue();
    return MemSize == 16 || MemSize == 8;
  });

  Predicate is32BitMMO([](const MachineInstr &MI) -> bool {
    const MachineMemOperand *MMO = *MI.memoperands_begin();
    return 8 * MMO->getSize().getValue() == 32;
  });

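  // A load qualifies as a "uniform load" for scalar (s_load) selection if it
  // is not atomic, its MMO is uniform, it is non-volatile (volatile is
  // tolerated only on the constant address space), and the memory is known
  // not to be clobbered: constant address space, invariant MMO, or the
  // MONoClobber flag.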
  auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
              (isConst || isInvMMO || isNoClobberMMO);

  // clang-format off
  // TODO: S32Dst, 16-bit any-extending load should not appear on True16 targets
  addRulesForGOpcs({G_LOAD})
      // flat, addrspace(0), never uniform - flat_load
      .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
      .Any({{DivB32, P0}, {{VgprB32}, {VgprP0}}}) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{DivB64, P0}, {{VgprB64}, {VgprP0}}})
      .Any({{DivB96, P0}, {{VgprB96}, {VgprP0}}})
      .Any({{DivB128, P0}, {{VgprB128}, {VgprP0}}})

      // global, addrspace(1)
      // divergent - global_load
      .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
      .Any({{DivB32, P1}, {{VgprB32}, {VgprP1}}}) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{DivB64, P1}, {{VgprB64}, {VgprP1}}})
      .Any({{DivB96, P1}, {{VgprB96}, {VgprP1}}})
      .Any({{DivB128, P1}, {{VgprB128}, {VgprP1}}})
      .Any({{DivB256, P1}, {{VgprB256}, {VgprP1}, SplitLoad}})
      .Any({{DivB512, P1}, {{VgprB512}, {VgprP1}, SplitLoad}})

      // uniform - s_load
      .Any({{{UniS16, P1}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
      .Any({{{UniS16, P1}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP1}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
      .Any({{{UniB32, P1}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP1}}}, hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load
      // TODO: SplitLoad when !isNaturalAligned && isUL and target hasSMRDSmall
      .Any({{{UniB32, P1}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall) // 8-bit and 16-bit any-extending load to 32-bit load
      .Any({{{UniB32, P1}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}}}) // 32-bit load
      .Any({{{UniB64, P1}, isAlign4 && isUL}, {{SgprB64}, {SgprP1}}})
      .Any({{{UniB96, P1}, isAlign16 && isUL}, {{SgprB96}, {SgprP1}, WidenLoad}}, !hasSMRDx3)
      .Any({{{UniB96, P1}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP1}, SplitLoad}}, !hasSMRDx3)
      .Any({{{UniB96, P1}, isAlign4 && isUL}, {{SgprB96}, {SgprP1}}}, hasSMRDx3)
      .Any({{{UniB128, P1}, isAlign4 && isUL}, {{SgprB128}, {SgprP1}}})
      .Any({{{UniB256, P1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
      .Any({{{UniB512, P1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})

      // Uniform via global or buffer load, for example volatile or non-aligned
      // uniform load. Not using standard {{UniInVgprTy}, {VgprP1}} since it is
      // selected as global_load; use SgprP1 for the pointer instead to match
      // patterns without flat-for-global, the default for GFX7 and older.
      // -> +flat-for-global + {{UniInVgprTy}, {SgprP1}} - global_load
      // -> -flat-for-global + {{UniInVgprTy}, {SgprP1}} - buffer_load
      .Any({{{UniS16, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
      .Any({{{UniS16, P1}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && !hasSMRDSmall) // s16 load
      .Any({{{UniB32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP1}}}, hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{{UniB32, P1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}, !hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{{UniB64, P1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
      .Any({{{UniB96, P1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
      .Any({{{UniB128, P1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
      .Any({{{UniB256, P1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP1}, SplitLoad}})
      .Any({{{UniB512, P1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP1}, SplitLoad}})

      // local, addrspace(3) - ds_load
      .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
      .Any({{DivB32, P3}, {{VgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{DivB64, P3}, {{VgprB64}, {VgprP3}}})
      .Any({{DivB96, P3}, {{VgprB96}, {VgprP3}}})
      .Any({{DivB128, P3}, {{VgprB128}, {VgprP3}}})

      .Any({{UniS16, P3}, {{UniInVgprS16}, {SgprP3}}}, usesTrue16) // 16-bit load
      .Any({{UniB32, P3}, {{UniInVgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{UniB64, P3}, {{UniInVgprB64}, {VgprP3}}})
      .Any({{UniB96, P3}, {{UniInVgprB96}, {VgprP3}}})
      .Any({{UniB128, P3}, {{UniInVgprB128}, {VgprP3}}})

      // constant, addrspace(4)
      // divergent - global_load
      .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
      .Any({{DivB32, P4}, {{VgprB32}, {VgprP4}}}) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{DivB64, P4}, {{VgprB64}, {VgprP4}}})
      .Any({{DivB96, P4}, {{VgprB96}, {VgprP4}}})
      .Any({{DivB128, P4}, {{VgprB128}, {VgprP4}}})
      .Any({{DivB256, P4}, {{VgprB256}, {VgprP4}, SplitLoad}})
      .Any({{DivB512, P4}, {{VgprB512}, {VgprP4}, SplitLoad}})

      // uniform - s_load
      .Any({{{UniS16, P4}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
      .Any({{{UniS16, P4}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP4}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
      .Any({{{UniB32, P4}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{{UniB32, P4}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall) // 8-bit and 16-bit any-extending load to 32-bit load
      .Any({{{UniB32, P4}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}}}) // 32-bit load
      .Any({{{UniB64, P4}, isAlign4 && isUL}, {{SgprB64}, {SgprP4}}})
      .Any({{{UniB96, P4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasSMRDx3)
      .Any({{{UniB96, P4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasSMRDx3)
      .Any({{{UniB96, P4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasSMRDx3)
      .Any({{{UniB128, P4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
      .Any({{{UniB256, P4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
      .Any({{{UniB512, P4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})

      // uniform in vgpr - global_load or buffer_load
      .Any({{{UniS16, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
      .Any({{{UniS16, P4}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && !hasSMRDSmall) // s16 load
      .Any({{{UniB32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP4}}}, hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{{UniB32, P4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP4}}}, !hasSMRDSmall) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{{UniB64, P4}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP4}}})
      .Any({{{UniB96, P4}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP4}}})
      .Any({{{UniB128, P4}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP4}}})
      .Any({{{UniB256, P4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP4}, SplitLoad}})
      .Any({{{UniB512, P4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP4}, SplitLoad}})

      // private, addrspace(5), never uniform - scratch_load
      .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16)
      .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}}) // 32-bit load, 8-bit and 16-bit any-extending load
      .Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}})
      .Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}})
      .Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}})

      .Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}});

  addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zero- and sign-extending loads
      .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}})

      .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}})
      .Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall)
      .Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall)
      .Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall)
      .Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall)

      .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}})
      .Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}})

      .Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}})
      .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall)
      .Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall)
      .Any({{{UniS32, P4}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall)
      .Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall)

      .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}});

  addRulesForGOpcs({G_STORE})
      // addrspace(0)
      .Any({{S16, P0}, {{}, {Vgpr16, VgprP0}}}, usesTrue16) // 16-bit store
      .Any({{B32, P0}, {{}, {VgprB32, VgprP0}}}) // 32-bit store, 8-bit and 16-bit truncating store
      .Any({{B64, P0}, {{}, {VgprB64, VgprP0}}})
      .Any({{B96, P0}, {{}, {VgprB96, VgprP0}}})
      .Any({{B128, P0}, {{}, {VgprB128, VgprP0}}})

      // addrspace(1), there are no stores to addrspace(4)
      // For targets:
      // - with "+flat-for-global" - global_store
      // - without(-flat-for-global) - buffer_store addr64
      .Any({{S16, DivP1}, {{}, {Vgpr16, VgprP1}}}, usesTrue16) // 16-bit store
      .Any({{B32, DivP1}, {{}, {VgprB32, VgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
      .Any({{B64, DivP1}, {{}, {VgprB64, VgprP1}}})
      .Any({{B96, DivP1}, {{}, {VgprB96, VgprP1}}})
      .Any({{B128, DivP1}, {{}, {VgprB128, VgprP1}}})

      // For UniP1, use sgpr ptr to match flat-for-global patterns. Targets:
      // - with "+flat-for-global" - global_store for both sgpr and vgpr ptr
      // - without(-flat-for-global) - need sgpr ptr to select buffer_store
      .Any({{S16, UniP1}, {{}, {Vgpr16, SgprP1}}}, usesTrue16) // 16-bit store
      .Any({{B32, UniP1}, {{}, {VgprB32, SgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
      .Any({{B64, UniP1}, {{}, {VgprB64, SgprP1}}})
      .Any({{B96, UniP1}, {{}, {VgprB96, SgprP1}}})
      .Any({{B128, UniP1}, {{}, {VgprB128, SgprP1}}})

      // addrspace(3) and addrspace(5)
      .Any({{S16, Ptr32}, {{}, {Vgpr16, VgprPtr32}}}, usesTrue16) // 16-bit store
      .Any({{B32, Ptr32}, {{}, {VgprB32, VgprPtr32}}}) // 32-bit store, 8-bit and 16-bit truncating store
      .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}})
      .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}})
      .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}});

  // clang-format on

  addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT,
                    G_AMDGPU_TBUFFER_LOAD_FORMAT},
                   StandardB)
      .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});

  addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_USHORT, G_AMDGPU_BUFFER_LOAD_UBYTE,
                    G_AMDGPU_BUFFER_LOAD_SSHORT, G_AMDGPU_BUFFER_LOAD_SBYTE},
                   StandardB)
      .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});

  addRulesForGOpcs(
      {G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, G_AMDGPU_BUFFER_LOAD_USHORT_TFE},
      StandardB)
      .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});

  addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_TFE, G_AMDGPU_BUFFER_LOAD_FORMAT_TFE},
                   StandardB)
      .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Div(B96, {{VgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B96, {{UniInVgprB96}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Any({{DivB160}, {{VgprB160}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
      .Any({{UniB160},
            {{UniInVgprB160}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});

  addRulesForGOpcs(
      {G_AMDGPU_BUFFER_LOAD_FORMAT_D16, G_AMDGPU_TBUFFER_LOAD_FORMAT_D16},
      StandardB)
      .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B64, {{UniInVgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Div(B128, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
      .Uni(B128, {{UniInVgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});

  addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE,
                    G_AMDGPU_BUFFER_STORE_SHORT, G_AMDGPU_BUFFER_STORE_FORMAT,
                    G_AMDGPU_BUFFER_STORE_FORMAT_D16,
                    G_AMDGPU_TBUFFER_STORE_FORMAT,
                    G_AMDGPU_TBUFFER_STORE_FORMAT_D16})
      .Any({{B32}, {{}, {VgprB32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
      .Any({{B64}, {{}, {VgprB64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
      .Any({{B96}, {{}, {VgprB96, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
      .Any({{B128}, {{}, {VgprB128, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});

  // Buffer atomics: resource descriptor + scalar offset are SGPR, data and
  // address components are VGPR.
  //
  // Operand order (SIInstructions.td BufferAtomicGenericInstruction):
  // dst = op vdata, rsrc, vindex, voffset, soffset, offset_imm, cachepolicy,
  // idxen_imm
  addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_FADD})
      .Any({{S32, S32, V4S32, S32, S32, S32},
            {{Vgpr32}, {Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
      .Any({{S64, S64, V4S32, S32, S32, S32},
            {{Vgpr64}, {Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
      .Any({{V2S16, V2S16, V4S32, S32, S32, S32},
            {{VgprV2S16},
             {VgprV2S16, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});

  addRulesForGOpcs({G_PTR_ADD})
      .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
      .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}})
      .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}})
      .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}});

  addRulesForGOpcs({G_INTTOPTR})
      .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
      .Any({{DivPtr32}, {{VgprPtr32}, {Vgpr32}}})
      .Any({{UniPtr64}, {{SgprPtr64}, {Sgpr64}}})
      .Any({{DivPtr64}, {{VgprPtr64}, {Vgpr64}}})
      .Any({{UniPtr128}, {{SgprPtr128}, {Sgpr128}}})
      .Any({{DivPtr128}, {{VgprPtr128}, {Vgpr128}}});

  addRulesForGOpcs({G_PTRTOINT})
      .Any({{UniS32}, {{Sgpr32}, {SgprPtr32}}})
      .Any({{DivS32}, {{Vgpr32}, {VgprPtr32}}})
      .Any({{UniS64}, {{Sgpr64}, {SgprPtr64}}})
      .Any({{DivS64}, {{Vgpr64}, {VgprPtr64}}})
      .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}})
      .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}});

  // FIXME: Update llvm/test/CodeGen/AMDGPU/ptrmask.ll to use GlobalISel.
  // Currently crashes on P8 (buffer resource) tests due to legalizer issue.
  addRulesForGOpcs({G_PTRMASK})
      .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
      .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
      .Any({{UniP3}, {{SgprP3}, {SgprP3, Sgpr32}}})
      .Any({{DivP3}, {{VgprP3}, {VgprP3, Vgpr32}}});

  addRulesForGOpcs({G_ABS}, Standard)
      .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}})
      .Div(S16, {{Vgpr16}, {Vgpr16}, AbsToNegMax})
      .Uni(S32, {{Sgpr32}, {Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32}, AbsToNegMax})
      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, AbsToS32})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16}, AbsToNegMax});

  addRulesForGOpcs({G_BITREVERSE}, Standard)
      .Uni(S32, {{Sgpr32}, {Sgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32}})
      .Uni(S64, {{Sgpr64}, {Sgpr64}})
      .Div(S64, {{Vgpr64}, {Vgpr64}});

  addRulesForGOpcs({G_AMDGPU_FFBH_U32, G_AMDGPU_FFBL_B32, G_CTLZ_ZERO_UNDEF,
                    G_CTTZ_ZERO_UNDEF})
      .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
      .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
      .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
      .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}});

  addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});

  addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
      .Uni(S64, {{Sgpr64}, {}});

  addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}});

  addRulesForGOpcs({G_GLOBAL_VALUE})
      .Any({{UniP0}, {{SgprP0}, {}}})
      .Any({{UniP1}, {{SgprP1}, {}}})
      .Any({{UniP3}, {{SgprP3}, {}}})
      .Any({{UniP4}, {{SgprP4}, {}}})
      .Any({{UniP8}, {{SgprP8}, {}}});

  addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}});

  addRulesForGOpcs({G_SI_CALL})
      .Any({{_, UniP0}, {{None}, {SgprP0}}})
      .Any({{_, DivP0}, {{None}, {SgprP0Call_WF}}})
      .Any({{_, UniP4}, {{None}, {SgprP4}}})
      .Any({{_, DivP4}, {{None}, {SgprP4Call_WF}}});

  bool hasSALUFloat = ST->hasSALUFloatInsts();

  addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard)
      .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
      .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
      .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16},
           hasSALUFloat)
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});

  addRulesForGOpcs({G_FSUB, G_STRICT_FSUB}, Standard)
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
      .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);

  addRulesForGOpcs({G_FMAD}, Standard)
      .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});

  addRulesForGOpcs({G_FLDEXP, G_STRICT_FLDEXP}, Standard)
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
      .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
      .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr32}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});

  addRulesForGOpcs({G_FMA, G_STRICT_FMA}, Standard)
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
      .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}})
      .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}})
      .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat)
      .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat)
      .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat)
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat)
      .Uni(V2S16,
           {{SgprV2S16}, {SgprV2S16, SgprV2S16, SgprV2S16}, ScalarizeToS16},
           hasSALUFloat)
      .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}},
           !hasSALUFloat);

  addRulesForGOpcs({G_AMDGPU_FMED3}, Standard)
      .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
      .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});

  // TODO: This opcode is generated from the i64->i16 signed clamped pattern in
  // the PreLegalizerCombiner. Move the combine to RegBankCombiner to keep more
  // instructions on SALU.
  addRulesForGOpcs({G_AMDGPU_SMED3}, Standard)
      .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});

  // FNEG and FABS are either folded as source modifiers or can be selected as
  // bitwise XOR and AND with a mask. XOR and AND are available on SALU, but
  // for targets without SALU float we still select them as VGPR since there
  // would be no real SGPR use.
  addRulesForGOpcs({G_FNEG, G_FABS}, Standard)
      .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat)
      .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
      .Div(S16, {{Vgpr16}, {Vgpr16}})
      .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat)
      .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
      .Div(S32, {{Vgpr32}, {Vgpr32}})
      .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
      .Div(S64, {{Vgpr64}, {Vgpr64}})
      .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat)
      .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat)
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
      .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
      .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});

  addRulesForGOpcs({G_FCANONICALIZE}, Standard)
      .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
      .Div(S32, {{Vgpr32}, {Vgpr32}})
      .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
      .Div(S16, {{Vgpr16}, {Vgpr16}})
      .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
      .Div(S64, {{Vgpr64}, {Vgpr64}})
      .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
      .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
      .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
      .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});

  bool hasPST = ST->hasPseudoScalarTrans();
  addRulesForGOpcs({G_FSQRT}, Standard)
      .Div(S16, {{Vgpr16}, {Vgpr16}})
      .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasPST)
      .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasPST);

  addRulesForGOpcs({G_FPTOUI, G_FPTOSI})
      .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
      .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
      .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
      .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat)
      .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
      .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
      .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
      .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
      .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
      .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}});

  addRulesForGOpcs({G_UITOFP, G_SITOFP})
      .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
      .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
      .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
      .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
      .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
      .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
      .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
      .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
      .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
      .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}});

1428 addRulesForGOpcs({G_AMDGPU_S_BUFFER_PREFETCH})
1430
1431 addRulesForGOpcs({G_FPEXT})
1432 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1433 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1434 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}})
1435 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1436 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat);
1437
1438 addRulesForGOpcs({G_AMDGPU_CVT_PK_I16_I32}, Standard)
1439 .Uni(V2S16, {{UniInVgprV2S16}, {Vgpr32, Vgpr32}})
1440 .Div(V2S16, {{VgprV2S16}, {Vgpr32, Vgpr32}});
1441
1442 addRulesForGOpcs({G_AMDGPU_FMIN_LEGACY, G_AMDGPU_FMAX_LEGACY}, Standard)
1443 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1444 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
1445
1446 bool hasSALUMinimumMaximumInsts = ST->hasSALUMinimumMaximumInsts();
1447
1448 addRulesForGOpcs({G_FMINIMUM, G_FMAXIMUM}, Standard)
1449 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUMinimumMaximumInsts)
1450 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUMinimumMaximumInsts)
1451 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1452 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUMinimumMaximumInsts)
1453 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUMinimumMaximumInsts)
1454 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1455 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1456 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1457 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
1458 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
1459
1460 addRulesForGOpcs({G_FMINNUM_IEEE, G_FMAXNUM_IEEE, G_FMINNUM, G_FMAXNUM},
1461 Standard)
1462 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1463 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1464 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1465 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1466 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
1467 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
1468 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1469 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1470 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1471 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1472
1473 addRulesForGOpcs({G_FPTRUNC})
1474 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1475 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1476 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
1477 .Any({{UniV2S16, V2S32}, {{UniInVgprV2S16}, {VgprV2S32}}})
1478 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
1479 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1480 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat);
1481
1482 addRulesForGOpcs({G_IS_FPCLASS})
1483 .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}})
1484 .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}})
1485 .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}})
1486 .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}})
1487 .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}})
1488 .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}});
1489
1490 addRulesForGOpcs({G_FCMP}, Standard)
1491 .Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr16, Sgpr16}}},
1492 hasSALUFloat)
1493 .Any({{UniS1, _, S16}, {{UniInVcc}, {None, Vgpr16, Vgpr16}}},
1494 !hasSALUFloat)
1495 .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
1496 .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}},
1497 hasSALUFloat)
1498 .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}},
1499 !hasSALUFloat)
1500 .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
1501 .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
1502 .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
1503
1504 addRulesForGOpcs({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUNDEVEN, G_FFLOOR, G_FCEIL,
1505 G_FEXP2, G_FLOG2},
1506 Standard)
1507 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1508 .Div(S16, {{Vgpr16}, {Vgpr16}})
1509 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1510 .Div(S32, {{Vgpr32}, {Vgpr32}})
1511 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1512 .Div(S64, {{Vgpr64}, {Vgpr64}});
1513
1514 using namespace Intrinsic;
1515
1516 addRulesForIOpcs({returnaddress}).Any({{UniP0}, {{SgprP0}, {}}});
1517
1518 addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
1519
1520 addRulesForIOpcs({amdgcn_s_getreg}).Any({{}, {{Sgpr32}, {IntrId, Imm}}});
1521
1522 addRulesForIOpcs({amdgcn_s_setreg})
1523 .Any({{_, _, S32}, {{}, {IntrId, Imm, SgprB32_ReadFirstLane}}});
1524
1525 addRulesForIOpcs({amdgcn_s_sendmsg, amdgcn_s_sendmsghalt})
1526 .Any({{}, {{}, {IntrId, Imm, SgprB32_M0}}});
1527
1528 addRulesForIOpcs({amdgcn_s_sendmsg_rtn})
1529 .Any({{S32}, {{Sgpr32}, {}}})
1530 .Any({{S64}, {{Sgpr64}, {}}});
1531
1532 addRulesForIOpcs({amdgcn_s_memrealtime, amdgcn_s_memtime}, Standard)
1533 .Uni(S64, {{Sgpr64}, {IntrId}});
1534
1535 addRulesForIOpcs({amdgcn_groupstaticsize, amdgcn_pops_exiting_wave_id,
1536 amdgcn_reloc_constant, amdgcn_s_get_waveid_in_workgroup},
1537 Standard)
1538 .Uni(S32, {{Sgpr32}, {IntrId}});
1539
1540 // Intrinsics with no register operands.
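// Both sides of the mapping stay empty, {{}, {{}, {}}}: there is nothing to
// assign a register bank to.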
1541 addRulesForIOpcs({amdgcn_asyncmark,
1542 amdgcn_endpgm,
1543 amdgcn_init_exec,
1544 amdgcn_s_barrier,
1545 amdgcn_s_barrier_leave,
1546 amdgcn_s_barrier_signal,
1547 amdgcn_s_barrier_wait,
1548 amdgcn_s_monitor_sleep,
1549 amdgcn_s_nop,
1550 amdgcn_s_sethalt,
1551 amdgcn_s_setprio,
1552 amdgcn_s_setprio_inc_wg,
1553 amdgcn_s_sleep,
1554 amdgcn_s_ttracedata_imm,
1555 amdgcn_s_wait_asynccnt,
1556 amdgcn_s_wait_bvhcnt,
1557 amdgcn_s_wait_dscnt,
1558 amdgcn_s_wait_event,
1559 amdgcn_s_wait_event_export_ready,
1560 amdgcn_s_wait_expcnt,
1561 amdgcn_s_wait_kmcnt,
1562 amdgcn_s_wait_loadcnt,
1563 amdgcn_s_wait_samplecnt,
1564 amdgcn_s_wait_storecnt,
1565 amdgcn_s_wait_tensorcnt,
1566 amdgcn_s_waitcnt,
1567 amdgcn_unreachable,
1568 amdgcn_wait_asyncmark,
1569 amdgcn_wave_barrier})
1570 .Any({{}, {{}, {}}});
1571
1572 addRulesForIOpcs({amdgcn_init_exec_from_input})
1573 .Any({{}, {{}, {IntrId, Sgpr32, Imm}}});
1574
1575 addRulesForIOpcs({amdgcn_s_ttracedata}).Any({{}, {{}, {IntrId, SgprB32_M0}}});
1576
1577 addRulesForIOpcs({amdgcn_s_sleep_var})
1578 .Any({{}, {{}, {IntrId, SgprB32_ReadFirstLane}}});
1579
1580 addRulesForIOpcs({amdgcn_s_barrier_join, amdgcn_s_wakeup_barrier})
1581 .Any({{}, {{}, {IntrId, SgprB32_M0}}});
1582
1583 addRulesForIOpcs({amdgcn_s_barrier_signal_var, amdgcn_s_barrier_init})
1584 .Any({{}, {{}, {IntrId, SgprB32_M0, SgprB32_M0}}});
1585
1586 addRulesForIOpcs({amdgcn_s_barrier_signal_isfirst})
1587 .Any({{UniS1}, {{Sgpr32Trunc}, {}}});
1588
1589 addRulesForIOpcs(
1590 {amdgcn_s_get_named_barrier_state, amdgcn_s_get_barrier_state}, Standard)
1591 .Uni(S32, {{Sgpr32}, {IntrId, SgprB32_M0}});
1592
1593 addRulesForIOpcs({amdgcn_flat_prefetch}).Any({{}, {{}, {IntrId, VgprP0}}});
1594
1595 addRulesForIOpcs({amdgcn_global_prefetch}).Any({{}, {{}, {IntrId, VgprP1}}});
1596
1597 addRulesForIOpcs({amdgcn_s_prefetch_data})
1599
1600 addRulesForIOpcs({amdgcn_class})
1601 .Any({{UniS1, _, S16}, {{UniInVcc}, {IntrId, Vgpr16, Vgpr32}}})
1602 .Any({{DivS1, _, S16}, {{Vcc}, {IntrId, Vgpr16, Vgpr32}}})
1603 .Any({{UniS1, _, S32}, {{UniInVcc}, {IntrId, Vgpr32, Vgpr32}}})
1604 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, Vgpr32, Vgpr32}}})
1605 .Any({{UniS1, _, S64}, {{UniInVcc}, {IntrId, Vgpr64, Vgpr32}}})
1606 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, Vgpr64, Vgpr32}}});
1607
1608 // This is the "intrinsic lane mask"; it was set to i32/i64 in LLVM IR.
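// In wave32 the mask is one bit per lane in an s32; in wave64 it is an s64.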
1609 addRulesForIOpcs({amdgcn_end_cf})
1610 .Any({{_, UniS32}, {{}, {IntrId, Sgpr32}}})
1611 .Any({{_, UniS64}, {{}, {IntrId, Sgpr64}}});
1612
1613 addRulesForIOpcs({amdgcn_if_break}, Standard)
1614 .Uni(S64, {{Sgpr64}, {IntrId, Vcc, Sgpr64}})
1615 .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
1616
1617 addRulesForIOpcs({amdgcn_exp})
1618 .Any({{_, _, _, S32, S32, S32, S32},
1619 {{}, {IntrId, Imm, Imm, Vgpr32, Vgpr32, Vgpr32, Vgpr32}}});
1620
1621 addRulesForIOpcs({amdgcn_exp_compr})
1622 .Any({{_, _, _, V2S16}, {{}, {IntrId, Imm, Imm, VgprV2S16, VgprV2S16}}});
1623
1624 addRulesForIOpcs({amdgcn_exp_row})
1625 .Any({{_, _, _, S32, S32, S32, S32, _, S32},
1626 {{},
1627 {IntrId, Imm, Imm, Vgpr32, Vgpr32, Vgpr32, Vgpr32, Imm,
1628 SgprB32_M0}}});
1629
1630 addRulesForIOpcs({amdgcn_lds_direct_load}, StandardB)
1631 .Div(B32, {{VgprB32}, {IntrId, SgprB32_M0}});
1632
1633 addRulesForIOpcs({amdgcn_lds_param_load}, Standard)
1634 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, SgprB32_M0}});
1635
1636 addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
1637 .Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});
1638
1639 addRulesForIOpcs({amdgcn_readfirstlane})
1640 .Any({{UniB32, _, DivB32}, {{}, {SgprB32, None, VgprB32}}})
1641 // This should not exist in the first place; it comes from call lowering,
1642 // which readfirstlanes the register just in case it is not in an SGPR.
1643 .Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}});
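// For reference, readfirstlane copies the value of the first active lane
// into an SGPR; when the input is already uniform every lane holds the same
// value, so the copy is semantically a no-op.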
1644
1645 addRulesForIOpcs({amdgcn_readlane}, StandardB)
1646 .Uni(B32, {{SgprB32}, {IntrId, VgprB32, SgprB32_ReadFirstLane}});
1647
1648 addRulesForIOpcs({amdgcn_writelane}, StandardB)
1649 .Div(B32,
1650 {{VgprB32},
1651 {IntrId, SgprB32_ReadFirstLane, SgprB32_ReadFirstLane, VgprB32}});
1652
1653 addRulesForIOpcs({amdgcn_add_max_i32, amdgcn_add_max_u32, amdgcn_add_min_i32,
1654 amdgcn_add_min_u32},
1655 Standard)
1656 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1657 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1658
1659 addRulesForIOpcs({amdgcn_pk_add_max_i16, amdgcn_pk_add_max_u16,
1660 amdgcn_pk_add_min_i16, amdgcn_pk_add_min_u16},
1661 Standard)
1662 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, VgprV2S16, VgprV2S16, VgprV2S16}})
1663 .Div(V2S16, {{VgprV2S16}, {IntrId, VgprV2S16, VgprV2S16, VgprV2S16}});
1664
1665 addRulesForIOpcs({amdgcn_permlane16, amdgcn_permlanex16}, Standard)
1666 .Div(S32, {{Vgpr32},
1667 {IntrId, Vgpr32, Vgpr32, SgprB32_ReadFirstLane,
1668 SgprB32_ReadFirstLane, Imm, Imm}});
1669
1670 addRulesForIOpcs({amdgcn_perm}, Standard)
1671 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1672 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1673
1674 addRulesForIOpcs(
1675 {amdgcn_wave_reduce_add, amdgcn_wave_reduce_and, amdgcn_wave_reduce_fadd,
1676 amdgcn_wave_reduce_fmax, amdgcn_wave_reduce_fmin,
1677 amdgcn_wave_reduce_fsub, amdgcn_wave_reduce_max, amdgcn_wave_reduce_min,
1678 amdgcn_wave_reduce_or, amdgcn_wave_reduce_sub, amdgcn_wave_reduce_umax,
1679 amdgcn_wave_reduce_umin, amdgcn_wave_reduce_xor},
1680 Standard)
1681 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
1682 .Div(S32, {{Sgpr32ToVgprDst}, {IntrId, VgprB32}})
1683 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64}})
1684 .Div(S64, {{Sgpr64ToVgprDst}, {IntrId, VgprB64}});
1685
1686 addRulesForIOpcs({amdgcn_bitop3, amdgcn_fmad_ftz}, Standard)
1687 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1688 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1689 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1690 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1691
1692 addRulesForIOpcs({amdgcn_udot4, amdgcn_sdot4, amdgcn_udot8, amdgcn_sdot8,
1693 amdgcn_dot4_f32_bf8_bf8, amdgcn_dot4_f32_bf8_fp8,
1694 amdgcn_dot4_f32_fp8_fp8, amdgcn_dot4_f32_fp8_bf8},
1695 Standard)
1696 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1697 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1698
1699 addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard)
1700 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1701 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
1702 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr32, Vgpr32}})
1703 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32, Vgpr32}});
1704
1705 addRulesForIOpcs({amdgcn_ds_bpermute, amdgcn_ds_bpermute_fi_b32,
1706 amdgcn_ds_permute, amdgcn_fmul_legacy, amdgcn_mulhi_i24,
1707 amdgcn_mulhi_u24},
1708 Standard)
1709 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1710 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1711
1712 addRulesForIOpcs({amdgcn_cubesc, amdgcn_cubetc, amdgcn_cubema, amdgcn_cubeid,
1713 amdgcn_fma_legacy},
1714 Standard)
1715 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1716 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1717
1718 addRulesForIOpcs({amdgcn_frexp_mant, amdgcn_fract}, Standard)
1719 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
1720 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1721 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
1722 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1723 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}})
1724 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}});
1725
1726 addRulesForIOpcs({amdgcn_prng_b32})
1727 .Any({{UniS32}, {{UniInVgprS32}, {IntrId, Vgpr32}}})
1728 .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32}}});
1729
1730 addRulesForIOpcs({amdgcn_sffbh}, Standard)
1731 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
1732 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
1733
1734 addRulesForIOpcs({amdgcn_ubfe, amdgcn_sbfe}, Standard)
1735 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1736 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32, Sgpr32, Sgpr32}, S_BFE})
1737 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64, Sgpr32, Sgpr32}, S_BFE})
1738 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32, Vgpr32}, V_BFE});
1739
1740 addRulesForIOpcs({amdgcn_cvt_pk_i16, amdgcn_cvt_pk_u16, amdgcn_cvt_pknorm_i16,
1741 amdgcn_cvt_pknorm_u16, amdgcn_cvt_pkrtz},
1742 Standard)
1743 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32}})
1744 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32}});
1745
1746 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk32_bf6_f16,
1747 amdgcn_cvt_scalef32_sr_pk32_fp6_f16,
1748 amdgcn_cvt_scalef32_sr_pk32_bf6_bf16,
1749 amdgcn_cvt_scalef32_sr_pk32_fp6_bf16},
1750 Standard)
1752
1753 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk32_bf6_f32,
1754 amdgcn_cvt_scalef32_sr_pk32_fp6_f32},
1755 Standard)
1757
1758 addRulesForIOpcs({amdgcn_global_load_tr_b64})
1759 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
1760 .Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
1761 .Any({{DivB32, _, UniP1}, {{VgprB32}, {IntrId, SgprP1}}})
1762 .Any({{DivB32, _, DivP1}, {{VgprB32}, {IntrId, VgprP1}}});
1763
1764 addRulesForIOpcs({amdgcn_global_load_tr_b128})
1765 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
1766 .Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
1767 .Any({{DivB128, _, UniP1}, {{VgprB128}, {IntrId, SgprP1}}})
1768 .Any({{DivB128, _, DivP1}, {{VgprB128}, {IntrId, VgprP1}}});
1769
1770 addRulesForIOpcs({amdgcn_global_load_tr4_b64})
1771 .Any({{DivV2S32, _, UniP1}, {{VgprV2S32}, {IntrId, SgprP1}}})
1772 .Any({{DivV2S32, _, DivP1}, {{VgprV2S32}, {IntrId, VgprP1}}});
1773
1774 addRulesForIOpcs({amdgcn_global_load_tr6_b96})
1775 .Any({{DivV3S32, _, UniP1}, {{VgprV3S32}, {IntrId, SgprP1}}})
1776 .Any({{DivV3S32, _, DivP1}, {{VgprV3S32}, {IntrId, VgprP1}}});
1777
1778 addRulesForIOpcs({amdgcn_ds_load_tr4_b64, amdgcn_ds_load_tr8_b64})
1779 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
1780
1781 addRulesForIOpcs({amdgcn_ds_load_tr6_b96})
1782 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
1783
1784 addRulesForIOpcs({amdgcn_ds_load_tr16_b128})
1785 .Any({{DivB128}, {{VgprB128}, {IntrId, VgprP3}}});
1786
1787 addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64})
1788 .Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}});
1789
1790 addRulesForIOpcs(
1791 {amdgcn_global_atomic_fmin_num, amdgcn_global_atomic_fmax_num}, Standard)
1792 .Div(S32, {{Vgpr32}, {IntrId, VgprP1, Vgpr32}});
1793
1794 addRulesForIOpcs({amdgcn_flat_atomic_fmin_num, amdgcn_flat_atomic_fmax_num},
1795 Standard)
1796 .Div(S32, {{Vgpr32}, {IntrId, VgprP0, Vgpr32}});
1797
1798 addRulesForIOpcs({amdgcn_raw_buffer_load_lds})
1799 .Any({{_}, {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Sgpr32}}});
1800
1801 addRulesForIOpcs({amdgcn_struct_buffer_load_lds})
1802 .Any({{_},
1803 {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
1804
1805 addRulesForIOpcs({amdgcn_raw_ptr_buffer_load_lds})
1806 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Sgpr32}}});
1807
1808 addRulesForIOpcs({amdgcn_struct_ptr_buffer_load_lds})
1809 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
1810
1811 addRulesForIOpcs({amdgcn_global_load_lds})
1812 .Any({{}, {{}, {IntrId, VgprP1, SgprB32_M0}}});
1813
1814 addRulesForIOpcs({amdgcn_global_load_async_to_lds_b8,
1815 amdgcn_global_load_async_to_lds_b32,
1816 amdgcn_global_load_async_to_lds_b64,
1817 amdgcn_global_load_async_to_lds_b128,
1818 amdgcn_global_store_async_from_lds_b8,
1819 amdgcn_global_store_async_from_lds_b32,
1820 amdgcn_global_store_async_from_lds_b64,
1821 amdgcn_global_store_async_from_lds_b128})
1822 .Any({{}, {{}, {IntrId, VgprP1, VgprP3}}});
1823
1824 addRulesForIOpcs({amdgcn_perm_pk16_b4_u4}, StandardB)
1825 .Uni(B64, {{UniInVgprB64}, {IntrId, Vgpr32, Vgpr32, VgprV2S32}})
1826 .Div(B64, {{VgprB64}, {IntrId, Vgpr32, Vgpr32, VgprV2S32}});
1827
1828 addRulesForIOpcs({amdgcn_perm_pk16_b6_u4}, StandardB)
1829 .Uni(B96, {{UniInVgprB96}, {IntrId, Vgpr32, VgprB64, VgprV2S32}})
1830 .Div(B96, {{VgprB96}, {IntrId, Vgpr32, VgprB64, VgprV2S32}});
1831
1832 addRulesForIOpcs({amdgcn_perm_pk16_b8_u4}, StandardB)
1833 .Uni(B128, {{UniInVgprB128}, {IntrId, VgprB64, VgprB64, VgprV2S32}})
1834 .Div(B128, {{VgprB128}, {IntrId, VgprB64, VgprB64, VgprV2S32}});
1835
1836 addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm, amdgcn_wqm, amdgcn_softwqm,
1837 amdgcn_strict_wqm},
1838 StandardB)
1839 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
1840 .Uni(B32, {{SgprB32}, {IntrId, SgprB32}})
1841 .Div(B64, {{VgprB64}, {IntrId, VgprB64}})
1842 .Uni(B64, {{SgprB64}, {IntrId, SgprB64}})
1843 .Div(B96, {{VgprB96}, {IntrId, VgprB96}})
1844 .Uni(B96, {{SgprB96}, {IntrId, SgprB96}})
1845 .Div(B128, {{VgprB128}, {IntrId, VgprB128}})
1846 .Uni(B128, {{SgprB128}, {IntrId, SgprB128}})
1847 .Any({{UniB256}, {{SgprB256}, {IntrId, SgprB256}}})
1848 .Any({{DivB256}, {{VgprB256}, {IntrId, VgprB256}}})
1849 .Any({{UniB512}, {{SgprB512}, {IntrId, SgprB512}}})
1850 .Any({{DivB512}, {{VgprB512}, {IntrId, VgprB512}}});
1851
1852 addRulesForIOpcs({amdgcn_wqm_demote}).Any({{}, {{}, {IntrId, Vcc}}});
1853
1854 addRulesForIOpcs({amdgcn_inverse_ballot})
1855 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, SgprB32_ReadFirstLane}}})
1856 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, SgprB64_ReadFirstLane}}});
1857
1858 addRulesForIOpcs({amdgcn_live_mask, amdgcn_ps_live})
1859 .Any({{DivS1}, {{Vcc}, {}}});
1860
1861 addRulesForIOpcs({amdgcn_mov_dpp, amdgcn_mov_dpp8}, StandardB)
1862 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
1863 .Div(B64, {{VgprB64}, {IntrId, VgprB64}});
1864
1865 addRulesForIOpcs({amdgcn_update_dpp}, StandardB)
1866 .Div(B32, {{VgprB32}, {IntrId, VgprB32, VgprB32}})
1867 .Div(B64, {{VgprB64}, {IntrId, VgprB64, VgprB64}});
1868
1869 addRulesForIOpcs({amdgcn_sin, amdgcn_cos}, Standard)
1870 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1871 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
1872 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1873 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}});
1874
1875 addRulesForIOpcs({amdgcn_trig_preop}, Standard)
1876 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32}})
1877 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr32}});
1878
1879 addRulesForIOpcs({amdgcn_exp2}, Standard)
1880 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1881 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
1882 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
1883 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1884 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
1885 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST);
1886
1887 addRulesForIOpcs({amdgcn_ds_add_gs_reg_rtn, amdgcn_ds_sub_gs_reg_rtn},
1888 Standard)
1889 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1890 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32}});
1891
1892 addRulesForIOpcs({amdgcn_ds_append, amdgcn_ds_consume}, Standard)
1893 .Uni(S32, {{UniInVgprS32}, {IntrId, SgprB32_M0}})
1894 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0}});
1895
1896 addRulesForIOpcs(
1897 {amdgcn_ds_bvh_stack_rtn, amdgcn_ds_bvh_stack_push4_pop1_rtn}, Standard)
1898 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV4S32}});
1899
1900 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop1_rtn}, Standard)
1901 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
1902
1903 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop2_rtn}, Standard)
1904 .Div(S64, {{Vgpr64, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
1905
1906 addRulesForIOpcs({amdgcn_ds_gws_sema_p, amdgcn_ds_gws_sema_v,
1907 amdgcn_ds_gws_sema_release_all})
1908 .Any({{}, {{}, {IntrId, SgprB32_M0}}});
1909
1910 addRulesForIOpcs(
1911 {amdgcn_ds_gws_barrier, amdgcn_ds_gws_init, amdgcn_ds_gws_sema_br})
1912 .Any({{}, {{}, {IntrId, Vgpr32, SgprB32_M0}}});
1913
1914 addRulesForIOpcs({amdgcn_ds_ordered_add, amdgcn_ds_ordered_swap}, Standard)
1915 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0, Vgpr32}});
1916
1917 addRulesForIOpcs({amdgcn_ds_swizzle}, Standard)
1918 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
1919 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
1920
1921 addRulesForIOpcs({amdgcn_permlane16_var, amdgcn_permlanex16_var}, Standard)
1922 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1923
1924 addRulesForIOpcs({amdgcn_permlane16_swap, amdgcn_permlane32_swap}, Standard)
1925 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1926
1927 addRulesForIOpcs({amdgcn_permlane64}, StandardB)
1928 .Div(B32, {{VgprB32}, {IntrId, VgprB32}});
1929
1930 addRulesForIOpcs({amdgcn_ds_read_tr4_b64, amdgcn_ds_read_tr8_b64})
1931 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
1932
1933 addRulesForIOpcs({amdgcn_ds_read_tr6_b96})
1934 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
1935
1936 addRulesForIOpcs({amdgcn_ds_read_tr16_b64})
1937 .Any({{DivV4S16}, {{VgprV4S16}, {IntrId, VgprP3}}});
1938
1939 addRulesForIOpcs({amdgcn_interp_p1}, Standard)
1940 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Imm, Imm, SgprB32_M0}});
1941
1942 addRulesForIOpcs({amdgcn_interp_p1_f16}, Standard)
1943 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Imm, Imm, Imm, SgprB32_M0}});
1944
1945 addRulesForIOpcs({amdgcn_interp_p2}, Standard)
1946 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Imm, Imm, SgprB32_M0}});
1947
1948 addRulesForIOpcs({amdgcn_interp_p2_f16}, Standard)
1949 .Div(S16,
1950 {{Vgpr16}, {IntrId, Vgpr32, Vgpr32, Imm, Imm, Imm, SgprB32_M0}});
1951
1952 addRulesForIOpcs({amdgcn_interp_mov}, Standard)
1953 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, Imm, SgprB32_M0}});
1954
1955 addRulesForIOpcs({amdgcn_interp_inreg_p10, amdgcn_interp_inreg_p2,
1956 amdgcn_interp_inreg_p10_f16, amdgcn_interp_p10_rtz_f16},
1957 Standard)
1958 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1959 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1960
1961 addRulesForIOpcs({amdgcn_interp_inreg_p2_f16, amdgcn_interp_p2_rtz_f16},
1962 Standard)
1963 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1964 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1965
1966 addRulesForIOpcs({amdgcn_div_fmas}, Standard)
1967 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
1968 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
1969 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}})
1970 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}});
1971
1972 addRulesForIOpcs({amdgcn_div_fixup}, Standard)
1973 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1974 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1975 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1976 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1977 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}})
1978 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}});
1979
1980 addRulesForIOpcs({amdgcn_div_scale}, Standard)
1981 .Div(S32, {{Vgpr32, Vcc}, {IntrId, Vgpr32, Vgpr32}})
1982 .Uni(S32, {{UniInVgprS32, UniInVcc}, {IntrId, Vgpr32, Vgpr32}})
1983 .Div(S64, {{Vgpr64, Vcc}, {IntrId, Vgpr64, Vgpr64}})
1984 .Uni(S64, {{UniInVgprS64, UniInVcc}, {IntrId, Vgpr64, Vgpr64}});
1985
1986 addRulesForIOpcs({amdgcn_fdot2, amdgcn_sdot2, amdgcn_udot2}, Standard)
1987 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV2S16, VgprV2S16, Vgpr32}})
1988 .Div(S32, {{Vgpr32}, {IntrId, VgprV2S16, VgprV2S16, Vgpr32}});
1989
1990 addRulesForIOpcs({amdgcn_fdot2_f16_f16}, Standard)
1991 .Uni(S16, {{UniInVgprS16}, {IntrId, VgprV2S16, VgprV2S16, Vgpr16}})
1992 .Div(S16, {{Vgpr16}, {IntrId, VgprV2S16, VgprV2S16, Vgpr16}});
1993
1994 addRulesForIOpcs({amdgcn_sudot4, amdgcn_sudot8}, Standard)
1995 .Uni(S32, {{UniInVgprS32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}})
1996 .Div(S32, {{Vgpr32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}});
1997
1998 addRulesForIOpcs({amdgcn_s_alloc_vgpr})
2000
2001 addRulesForIOpcs({amdgcn_sat_pk4_i4_i8, amdgcn_sat_pk4_u4_u8}, Standard)
2002 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32}})
2003 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32}});
2004
2005 // WMMA/SWMMAC intrinsics: all register operands map to VGPR.
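// ApplyAllVgpr keeps every register operand in VGPRs even when it is
// uniform, since the matrix units only read and write vector registers.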
2006 addRulesForIOpcs(
2007 {// WMMA GFX11+
2008 amdgcn_wmma_f32_16x16x16_f16, amdgcn_wmma_f32_16x16x16_bf16,
2009 amdgcn_wmma_f16_16x16x16_f16, amdgcn_wmma_bf16_16x16x16_bf16,
2010 amdgcn_wmma_f16_16x16x16_f16_tied, amdgcn_wmma_bf16_16x16x16_bf16_tied,
2011 amdgcn_wmma_i32_16x16x16_iu8, amdgcn_wmma_i32_16x16x16_iu4,
2012 // WMMA GFX12
2013 amdgcn_wmma_f32_16x16x16_fp8_fp8, amdgcn_wmma_f32_16x16x16_fp8_bf8,
2014 amdgcn_wmma_f32_16x16x16_bf8_fp8, amdgcn_wmma_f32_16x16x16_bf8_bf8,
2015 amdgcn_wmma_i32_16x16x32_iu4,
2016 // WMMA GFX1250
2017 amdgcn_wmma_f32_16x16x4_f32, amdgcn_wmma_f32_16x16x32_bf16,
2018 amdgcn_wmma_f32_16x16x32_f16, amdgcn_wmma_f16_16x16x32_f16,
2019 amdgcn_wmma_bf16_16x16x32_bf16, amdgcn_wmma_bf16f32_16x16x32_bf16,
2020 amdgcn_wmma_f32_16x16x64_fp8_fp8, amdgcn_wmma_f32_16x16x64_fp8_bf8,
2021 amdgcn_wmma_f32_16x16x64_bf8_fp8, amdgcn_wmma_f32_16x16x64_bf8_bf8,
2022 amdgcn_wmma_f16_16x16x64_fp8_fp8, amdgcn_wmma_f16_16x16x64_fp8_bf8,
2023 amdgcn_wmma_f16_16x16x64_bf8_fp8, amdgcn_wmma_f16_16x16x64_bf8_bf8,
2024 amdgcn_wmma_f16_16x16x128_fp8_fp8, amdgcn_wmma_f16_16x16x128_fp8_bf8,
2025 amdgcn_wmma_f16_16x16x128_bf8_fp8, amdgcn_wmma_f16_16x16x128_bf8_bf8,
2026 amdgcn_wmma_f32_16x16x128_fp8_fp8, amdgcn_wmma_f32_16x16x128_fp8_bf8,
2027 amdgcn_wmma_f32_16x16x128_bf8_fp8, amdgcn_wmma_f32_16x16x128_bf8_bf8,
2028 amdgcn_wmma_i32_16x16x64_iu8, amdgcn_wmma_f32_16x16x128_f8f6f4,
2029 amdgcn_wmma_scale_f32_16x16x128_f8f6f4,
2030 amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, amdgcn_wmma_f32_32x16x128_f4,
2031 amdgcn_wmma_scale_f32_32x16x128_f4, amdgcn_wmma_scale16_f32_32x16x128_f4,
2032 // SWMMAC GFX12
2033 amdgcn_swmmac_f32_16x16x32_f16, amdgcn_swmmac_f32_16x16x32_bf16,
2034 amdgcn_swmmac_f16_16x16x32_f16, amdgcn_swmmac_bf16_16x16x32_bf16,
2035 amdgcn_swmmac_i32_16x16x32_iu8, amdgcn_swmmac_i32_16x16x32_iu4,
2036 amdgcn_swmmac_i32_16x16x64_iu4, amdgcn_swmmac_f32_16x16x32_fp8_fp8,
2037 amdgcn_swmmac_f32_16x16x32_fp8_bf8, amdgcn_swmmac_f32_16x16x32_bf8_fp8,
2038 amdgcn_swmmac_f32_16x16x32_bf8_bf8,
2039 // SWMMAC GFX1250
2040 amdgcn_swmmac_f32_16x16x64_f16, amdgcn_swmmac_f32_16x16x64_bf16,
2041 amdgcn_swmmac_f16_16x16x64_f16, amdgcn_swmmac_bf16_16x16x64_bf16,
2042 amdgcn_swmmac_bf16f32_16x16x64_bf16, amdgcn_swmmac_f32_16x16x128_fp8_fp8,
2043 amdgcn_swmmac_f32_16x16x128_fp8_bf8, amdgcn_swmmac_f32_16x16x128_bf8_fp8,
2044 amdgcn_swmmac_f32_16x16x128_bf8_bf8, amdgcn_swmmac_f16_16x16x128_fp8_fp8,
2045 amdgcn_swmmac_f16_16x16x128_fp8_bf8, amdgcn_swmmac_f16_16x16x128_bf8_fp8,
2046 amdgcn_swmmac_f16_16x16x128_bf8_bf8, amdgcn_swmmac_i32_16x16x128_iu8})
2047 .Any({{}, {{}, {}, ApplyAllVgpr}});
2048
2049} // end initialize rules