LLVM  14.0.0git
X86LowerAMXType.cpp
1 //===- Target/X86/X86LowerAMXType.cpp - -------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file Pass to transform <256 x i32> load/store
10 /// <256 x i32> is bitcast to x86_amx on X86, and the AMX instruction set only
11 /// provides simple operations on x86_amx. Basic element-wise operations are
12 /// not supported by AMX. Since x86_amx is bitcast from vector <256 x i32>
13 /// and only AMX intrinsics can operate on the type, we need to transform
14 /// load/store <256 x i32> instructions into AMX load/store. If the bitcast
15 /// cannot be combined with a load/store, we transform the bitcast into an AMX
16 /// load/store and a <256 x i32> store/load.
17 ///
18 /// If the front end does not use O0 but the mid/back end does (e.g. "clang -O2
19 /// -S -emit-llvm t.c" + "llc t.ll"), we should make sure the AMX data is
20 /// volatile, because that is necessary for AMX fast register allocation. (In
21 /// fast register allocation, registers are allocated before spill/reload, so
22 /// there is no additional register for AMX to identify the step in a spill.)
23 /// volatileTileData() handles this case.
24 /// e.g.
25 /// ----------------------------------------------------------
26 /// | def %td = ... |
27 /// | ... |
28 /// | "use %td" |
29 /// ----------------------------------------------------------
30 /// will transfer to -->
31 /// ----------------------------------------------------------
32 /// | def %td = ... |
33 /// | call void @llvm.x86.tilestored64.internal(mem, %td) |
34 /// | ... |
35 /// | %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)|
36 /// | "use %td2" |
37 /// ----------------------------------------------------------
38 //
39 //===----------------------------------------------------------------------===//
40 //
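// Note: this pass is intended to run early in the X86 codegen IR pipeline; it
// is created through createX86LowerAMXTypePass() at the bottom of this file
// and relies on the TargetPassConfig and TargetLibraryInfo analyses (see
// runOnFunction below).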
41 #include "X86.h"
42 #include "llvm/ADT/PostOrderIterator.h"
43 #include "llvm/ADT/SetVector.h"
44 #include "llvm/ADT/SmallSet.h"
45 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
46 #include "llvm/Analysis/TargetLibraryInfo.h"
47 #include "llvm/Analysis/TargetTransformInfo.h"
48 #include "llvm/CodeGen/Passes.h"
49 #include "llvm/CodeGen/TargetPassConfig.h"
50 #include "llvm/CodeGen/ValueTypes.h"
51 #include "llvm/IR/DataLayout.h"
52 #include "llvm/IR/Function.h"
53 #include "llvm/IR/IRBuilder.h"
54 #include "llvm/IR/Instructions.h"
55 #include "llvm/IR/IntrinsicInst.h"
56 #include "llvm/IR/IntrinsicsX86.h"
57 #include "llvm/IR/PatternMatch.h"
58 #include "llvm/InitializePasses.h"
59 #include "llvm/Pass.h"
60 #include "llvm/Target/TargetMachine.h"
61 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
62 #include "llvm/Transforms/Utils/Local.h"
63 
64 using namespace llvm;
65 using namespace PatternMatch;
66 
67 #define DEBUG_TYPE "lower-amx-type"
68 
69 static bool isAMXCast(Instruction *II) {
70  return match(II,
71  m_Intrinsic<Intrinsic::x86_cast_vector_to_tile>(m_Value())) ||
72  match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
73 }
74 
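// Create an alloca of type Ty in the entry block of the function containing
// BB, aligned to the preferred alignment of x86_amx, so that the tile
// loads/stores later emitted through it are suitably aligned.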
75 static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB,
76  Type *Ty) {
77  Function &F = *BB->getParent();
78  Module *M = BB->getModule();
79  const DataLayout &DL = M->getDataLayout();
80 
81  LLVMContext &Ctx = Builder.getContext();
82  auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx));
83  unsigned AllocaAS = DL.getAllocaAddrSpace();
84  AllocaInst *AllocaRes =
85  new AllocaInst(Ty, AllocaAS, "", &F.getEntryBlock().front());
86  AllocaRes->setAlignment(AllocaAlignment);
87  return AllocaRes;
88 }
89 
90 static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) {
91  for (Instruction &I : F.getEntryBlock())
92  if (!isa<AllocaInst>(&I))
93  return &I;
94  llvm_unreachable("No terminator in the entry block!");
95 }
96 
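// Return the {row, col} shape (as i16 values) that operand OpNo of the AMX
// intrinsic II must have. For the tile load/store intrinsics the shape is
// carried directly in the first two arguments; for the tile dot-product
// intrinsics it is derived from the m/n/k arguments, depending on which
// operand is queried.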
97 static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
98  IRBuilder<> Builder(II);
99  Value *Row = nullptr, *Col = nullptr;
100  switch (II->getIntrinsicID()) {
101  default:
102  llvm_unreachable("Expect amx intrinsics");
103  case Intrinsic::x86_tileloadd64_internal:
104  case Intrinsic::x86_tileloaddt164_internal:
105  case Intrinsic::x86_tilestored64_internal: {
106  Row = II->getArgOperand(0);
107  Col = II->getArgOperand(1);
108  break;
109  }
110  // a * b + c
111  // The shape depends on which operand of the intrinsic is queried.
112  case Intrinsic::x86_tdpbssd_internal:
113  case Intrinsic::x86_tdpbsud_internal:
114  case Intrinsic::x86_tdpbusd_internal:
115  case Intrinsic::x86_tdpbuud_internal:
116  case Intrinsic::x86_tdpbf16ps_internal: {
117  switch (OpNo) {
118  case 3:
119  Row = II->getArgOperand(0);
120  Col = II->getArgOperand(1);
121  break;
122  case 4:
123  Row = II->getArgOperand(0);
124  Col = II->getArgOperand(2);
125  break;
126  case 5:
127  if (isa<ConstantInt>(II->getArgOperand(2)))
128  Row = Builder.getInt16(
129  (cast<ConstantInt>(II->getOperand(2))->getSExtValue()) / 4);
130  else if (isa<Instruction>(II->getArgOperand(2))) {
131  // When it is not a constant value and it is not a function argument, we
132  // create Row after the definition of II->getOperand(2) instead of
133  // before II. For example, II is %118 and we try to get the shape for %117:
134  // %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x
135  // i32> %115).
136  // %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16
137  // %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx
138  // %117).
139  // If we create %row = udiv i16 %106, 4 before %118 (aka. II), then its
140  // definition is after its user (the new tileload for %117).
141  // So, the best choice is to create %row right after the definition of
142  // %106.
143  Builder.SetInsertPoint(cast<Instruction>(II->getOperand(2)));
144  Row = Builder.CreateUDiv(II->getOperand(2), Builder.getInt16(4));
145  cast<Instruction>(Row)->moveAfter(cast<Instruction>(II->getOperand(2)));
146  } else {
147  // When it is not a constant value and it is a function argument, we
148  // create Row in the entry basic block.
149  IRBuilder<> NewBuilder(
150  getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
151  Row = NewBuilder.CreateUDiv(II->getOperand(2), NewBuilder.getInt16(4));
152  }
153  Col = II->getArgOperand(1);
154  break;
155  }
156  break;
157  }
158  }
159 
160  return std::make_pair(Row, Col);
161 }
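// Worked example (operand names invented for illustration): for
//   %d = call x86_amx @llvm.x86.tdpbssd.internal(i16 %m, i16 %n, i16 %k,
//                                                x86_amx %c, x86_amx %a,
//                                                x86_amx %b)
// getShape returns {%m, %n} for operand 3 (%c), {%m, %k} for operand 4 (%a),
// and {%k/4, %n} for operand 5 (%b); the division reflects that the B tile
// packs four elements into each 32-bit column.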
162 
163 namespace {
164 class X86LowerAMXType {
165  Function &Func;
166 
167  // In AMX intrinsics we let Shape = {Row, Col}, but the
168  // RealCol = Col / ElementSize. We may use the RealCol
169  // as a new Row for other newly created AMX intrinsics.
170  std::map<Value *, Value *> Col2Row;
171 
172 public:
173  X86LowerAMXType(Function &F) : Func(F) {}
174  bool visit();
175  void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
176  void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
177  bool transformBitcast(BitCastInst *Bitcast);
178 };
179 
180 // %src = load <256 x i32>, <256 x i32>* %addr, align 64
181 // %2 = bitcast <256 x i32> %src to x86_amx
182 // -->
183 // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
184 // i8* %addr, i64 %stride64)
185 void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
186  Value *Row = nullptr, *Col = nullptr;
187  Use &U = *(Bitcast->use_begin());
188  unsigned OpNo = U.getOperandNo();
189  auto *II = cast<IntrinsicInst>(U.getUser());
190  std::tie(Row, Col) = getShape(II, OpNo);
191  IRBuilder<> Builder(Bitcast);
192  // Use the maximum column as stride.
193  Value *Stride = Builder.getInt64(64);
194  Value *I8Ptr =
195  Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy());
196  std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
197 
198  Value *NewInst =
199  Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
200  Bitcast->replaceAllUsesWith(NewInst);
201 }
202 
203 // %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
204 // %stride);
205 // %13 = bitcast x86_amx %src to <256 x i32>
206 // store <256 x i32> %13, <256 x i32>* %addr, align 64
207 // -->
208 // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
209 // %stride64, %13)
210 void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
211 
212  Value *Tile = Bitcast->getOperand(0);
213  auto *II = cast<IntrinsicInst>(Tile);
214  // Tile is the output of an AMX intrinsic. The first operand of the
215  // intrinsic is the row, the second operand is the column.
216  Value *Row = II->getOperand(0);
217  Value *Col = II->getOperand(1);
218  IRBuilder<> Builder(ST);
219  // Use the maximum column as stride. It must be the same as the load
220  // stride.
221  Value *Stride = Builder.getInt64(64);
222  Value *I8Ptr =
223  Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy());
224  std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
225  Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
226  if (Bitcast->hasOneUse())
227  return;
228  // %13 = bitcast x86_amx %src to <256 x i32>
229  // store <256 x i32> %13, <256 x i32>* %addr, align 64
230  // %add = <256 x i32> %13, <256 x i32> %src2
231  // -->
232  // %13 = bitcast x86_amx %src to <256 x i32>
233  // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
234  // %stride64, %13)
235  // %14 = load <256 x i32>, %addr
236  // %add = <256 x i32> %14, <256 x i32> %src2
237  Value *Vec = Builder.CreateLoad(Bitcast->getType(), ST->getOperand(1));
238  Bitcast->replaceAllUsesWith(Vec);
239 }
240 
241 // Transform a bitcast into <store, load> instructions.
242 bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
243  IRBuilder<> Builder(Bitcast);
244  AllocaInst *AllocaAddr;
245  Value *I8Ptr, *Stride;
246  auto *Src = Bitcast->getOperand(0);
247 
248  auto Prepare = [&](Type *MemTy) {
249  AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent(), MemTy);
250  I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
251  Stride = Builder.getInt64(64);
252  };
253 
254  if (Bitcast->getType()->isX86_AMXTy()) {
255  // %2 = bitcast <256 x i32> %src to x86_amx
256  // -->
257  // %addr = alloca <256 x i32>, align 64
258  // store <256 x i32> %src, <256 x i32>* %addr, align 64
259  // %addr2 = bitcast <256 x i32>* to i8*
260  // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
261  // i8* %addr2,
262  // i64 64)
263  Use &U = *(Bitcast->use_begin());
264  unsigned OpNo = U.getOperandNo();
265  auto *II = dyn_cast<IntrinsicInst>(U.getUser());
266  if (!II)
267  return false; // May be bitcast from x86amx to <256 x i32>.
268  Prepare(Bitcast->getOperand(0)->getType());
269  Builder.CreateStore(Src, AllocaAddr);
270  // TODO: we can pick a constant operand for the shape.
271  Value *Row = nullptr, *Col = nullptr;
272  std::tie(Row, Col) = getShape(II, OpNo);
273  std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
274  Value *NewInst = Builder.CreateIntrinsic(
275  Intrinsic::x86_tileloadd64_internal, None, Args);
276  Bitcast->replaceAllUsesWith(NewInst);
277  } else {
278  // %2 = bitcast x86_amx %src to <256 x i32>
279  // -->
280  // %addr = alloca <256 x i32>, align 64
281  // %addr2 = bitcast <256 x i32>* to i8*
282  // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col,
283  // i8* %addr2, i64 %stride)
284  // %2 = load <256 x i32>, <256 x i32>* %addr, align 64
285  auto *II = dyn_cast<IntrinsicInst>(Src);
286  if (!II)
287  return false; // May be bitcast from <256 x i32> to x86amx.
288  Prepare(Bitcast->getType());
289  Value *Row = II->getOperand(0);
290  Value *Col = II->getOperand(1);
291  std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Src};
292  Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
293  Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr);
294  Bitcast->replaceAllUsesWith(NewInst);
295  }
296 
297  return true;
298 }
299 
300 bool X86LowerAMXType::visit() {
301  SmallVector<Instruction *, 8> DeadInsts;
302  Col2Row.clear();
303 
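  // Walk the blocks in post order and each block bottom-up, turning
  // load+bitcast and bitcast+store pairs into AMX tile load/store intrinsics
  // (falling back to transformBitcast otherwise). Instructions that become
  // dead are collected in DeadInsts and only erased after the traversal, so
  // the iterators stay valid.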
304  for (BasicBlock *BB : post_order(&Func)) {
305  for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
306  II != IE;) {
307  Instruction &Inst = *II++;
308  auto *Bitcast = dyn_cast<BitCastInst>(&Inst);
309  if (!Bitcast)
310  continue;
311 
312  Value *Src = Bitcast->getOperand(0);
313  if (Bitcast->getType()->isX86_AMXTy()) {
314  if (Bitcast->user_empty()) {
315  DeadInsts.push_back(Bitcast);
316  continue;
317  }
318  LoadInst *LD = dyn_cast<LoadInst>(Src);
319  if (!LD) {
320  if (transformBitcast(Bitcast))
321  DeadInsts.push_back(Bitcast);
322  continue;
323  }
324  // If the load has multiple users, keep the vector load and add a tile load.
325  // %src = load <256 x i32>, <256 x i32>* %addr, align 64
326  // %2 = bitcast <256 x i32> %src to x86_amx
327  // %add = add <256 x i32> %src, <256 x i32> %src2
328  // -->
329  // %src = load <256 x i32>, <256 x i32>* %addr, align 64
330  // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
331  // i8* %addr, i64 %stride64)
332  // %add = add <256 x i32> %src, <256 x i32> %src2
333 
334  // If the load has a single user, the load will be eliminated in DAG ISel.
335  // %src = load <256 x i32>, <256 x i32>* %addr, align 64
336  // %2 = bitcast <256 x i32> %src to x86_amx
337  // -->
338  // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
339  // i8* %addr, i64 %stride64)
340  combineLoadBitcast(LD, Bitcast);
341  DeadInsts.push_back(Bitcast);
342  if (LD->hasOneUse())
343  DeadInsts.push_back(LD);
344  } else if (Src->getType()->isX86_AMXTy()) {
345  if (Bitcast->user_empty()) {
346  DeadInsts.push_back(Bitcast);
347  continue;
348  }
349  StoreInst *ST = nullptr;
350  for (auto UI = Bitcast->use_begin(), UE = Bitcast->use_end();
351  UI != UE;) {
352  Value *I = (UI++)->getUser();
353  ST = dyn_cast<StoreInst>(I);
354  if (ST)
355  break;
356  }
357  if (!ST) {
358  if (transformBitcast(Bitcast))
359  DeadInsts.push_back(Bitcast);
360  continue;
361  }
362  // If the bitcast (%13) has one use, combine bitcast and store into an AMX store.
363  // %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
364  // %stride);
365  // %13 = bitcast x86_amx %src to <256 x i32>
366  // store <256 x i32> %13, <256 x i32>* %addr, align 64
367  // -->
368  // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
369  // %stride64, %13)
370  //
371  // If the bitcast (%13) has multiple uses, transform it as below.
372  // %13 = bitcast x86_amx %src to <256 x i32>
373  // store <256 x i32> %13, <256 x i32>* %addr, align 64
374  // %add = <256 x i32> %13, <256 x i32> %src2
375  // -->
376  // %13 = bitcast x86_amx %src to <256 x i32>
377  // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
378  // %stride64, %13)
379  // %14 = load <256 x i32>, %addr
380  // %add = <256 x i32> %14, <256 x i32> %src2
381  //
382  combineBitcastStore(Bitcast, ST);
383  // Delete user first.
384  DeadInsts.push_back(ST);
385  DeadInsts.push_back(Bitcast);
386  }
387  }
388  }
389 
390  bool C = !DeadInsts.empty();
391 
392  for (auto *Inst : DeadInsts)
393  Inst->eraseFromParent();
394 
395  return C;
396 }
397 } // anonymous namespace
398 
399 static Value *getAllocaPos(BasicBlock *BB) {
400  Module *M = BB->getModule();
401  Function *F = BB->getParent();
402  IRBuilder<> Builder(&F->getEntryBlock().front());
403  const DataLayout &DL = M->getDataLayout();
404  unsigned AllocaAS = DL.getAllocaAddrSpace();
405  Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
406  AllocaInst *AllocaRes =
407  new AllocaInst(V256I32Ty, AllocaAS, "", &F->getEntryBlock().front());
408  BasicBlock::iterator Iter = AllocaRes->getIterator();
409  ++Iter;
410  Builder.SetInsertPoint(&*Iter);
411  Value *I8Ptr = Builder.CreateBitCast(AllocaRes, Builder.getInt8PtrTy());
412  return I8Ptr;
413 }
414 
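// Store the tile defined by TileDef to Ptr immediately after its definition,
// using the shape carried by the defining intrinsic and the maximum stride of
// 64 bytes.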
415 static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
416  assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
417  auto *II = cast<IntrinsicInst>(TileDef);
418  assert(II && "Not tile intrinsic!");
419  Value *Row = II->getOperand(0);
420  Value *Col = II->getOperand(1);
421 
422  BasicBlock *BB = TileDef->getParent();
423  BasicBlock::iterator Iter = TileDef->getIterator();
424  IRBuilder<> Builder(BB, ++Iter);
425  Value *Stride = Builder.getInt64(64);
426  std::array<Value *, 5> Args = {Row, Col, Ptr, Stride, TileDef};
427 
428  Instruction *TileStore =
429  Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
430  return TileStore;
431 }
432 
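// Replace the x86_amx value referenced by use U with a tile load from Ptr,
// inserted right before the user. The tile shape is taken from the defining
// intrinsic (or, when IsPHI is set, from the PHI's first incoming value).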
433 static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
434  Value *V = U.get();
435  assert(V->getType()->isX86_AMXTy() && "Not define tile!");
436 
437  // Get tile shape.
438  IntrinsicInst *II = nullptr;
439  if (IsPHI) {
440  Value *PhiOp = dyn_cast<PHINode>(V)->getIncomingValue(0);
441  II = cast<IntrinsicInst>(PhiOp);
442  } else {
443  II = cast<IntrinsicInst>(V);
444  }
445  Value *Row = II->getOperand(0);
446  Value *Col = II->getOperand(1);
447 
448  Instruction *UserI = dyn_cast<Instruction>(U.getUser());
449  IRBuilder<> Builder(UserI);
450  Value *Stride = Builder.getInt64(64);
451  std::array<Value *, 4> Args = {Row, Col, Ptr, Stride};
452 
453  Value *TileLoad =
454  Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
455  UserI->replaceUsesOfWith(V, TileLoad);
456 }
457 
458 static bool isIncomingOfPHI(Instruction *I) {
459  for (Use &U : I->uses()) {
460  User *V = U.getUser();
461  if (isa<PHINode>(V))
462  return true;
463  }
464  return false;
465 }
466 
467 // Let all AMX tile data become volatile, shortening the live range
468 // of each tile register before fast register allocation.
469 namespace {
470 class X86VolatileTileData {
471  Function &F;
472 
473 public:
474  X86VolatileTileData(Function &Func) : F(Func) {}
475  Value *updatePhiIncomings(BasicBlock *BB,
476  SmallVector<Instruction *, 2> &Incomings);
477  void replacePhiDefWithLoad(Instruction *PHI, Value *StorePtr);
478  bool volatileTileData();
479  void volatileTilePHI(PHINode *Inst);
480  void volatileTileNonPHI(Instruction *I);
481 };
482 
483 Value *X86VolatileTileData::updatePhiIncomings(
484  BasicBlock *BB, SmallVector<Instruction *, 2> &Incomings) {
485  Value *I8Ptr = getAllocaPos(BB);
486 
487  for (auto *I : Incomings) {
488  User *Store = createTileStore(I, I8Ptr);
489 
490  // All its uses (except phi) should load from stored mem.
491  for (Use &U : I->uses()) {
492  User *V = U.getUser();
493  if (isa<PHINode>(V) || V == Store)
494  continue;
495  replaceWithTileLoad(U, I8Ptr);
496  }
497  }
498  return I8Ptr;
499 }
500 
501 void X86VolatileTileData::replacePhiDefWithLoad(Instruction *PHI,
502  Value *StorePtr) {
503  for (Use &U : PHI->uses())
504  replaceWithTileLoad(U, StorePtr, true);
505  PHI->eraseFromParent();
506 }
507 
508 // Similar to volatileTileNonPHI, this function only handles PHI nodes
509 // and their related AMX intrinsics.
510 // 1) The PHI def should be changed to a tileload.
511 // 2) The PHI incoming values should be tilestored right after their defs.
512 // 3) The memory of these tileloads and tilestores should be the same.
513 // e.g.
514 // ------------------------------------------------------
515 // bb_dom:
516 // ...
517 // br i1 %bool.cond, label %if.else, label %if.then
518 //
519 // if.then:
520 // def %t0 = ...
521 // ...
522 // use %t0
523 // ...
524 // br label %if.end
525 //
526 // if.else:
527 // def %t1 = ...
528 // br label %if.end
529 //
530 // if.end:
531 // %td = phi x86_amx [ %t1, %if.else ], [ %t0, %if.then ]
532 // ...
533 // use %td
534 // ------------------------------------------------------
535 // -->
536 // ------------------------------------------------------
537 // bb_entry:
538 // %mem = alloca <256 x i32>, align 1024 *
539 // ...
540 // bb_dom:
541 // ...
542 // br i1 %bool.cond, label %if.else, label %if.then
543 //
544 // if.then:
545 // def %t0 = ...
546 // call void @llvm.x86.tilestored64.internal(mem, %t0) *
547 // ...
548 // %t0` = call x86_amx @llvm.x86.tileloadd64.internal(mem)*
549 // use %t0` *
550 // ...
551 // br label %if.end
552 //
553 // if.else:
554 // def %t1 = ...
555 // call void @llvm.x86.tilestored64.internal(mem, %t1) *
556 // br label %if.end
557 //
558 // if.end:
559 // ...
560 // %td = call x86_amx @llvm.x86.tileloadd64.internal(mem) *
561 // use %td
562 // ------------------------------------------------------
563 void X86VolatileTileData::volatileTilePHI(PHINode *PHI) {
564  BasicBlock *BB = PHI->getParent();
565  SmallVector<Instruction *, 2> Incomings;
566 
567  for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
568  Value *Op = PHI->getIncomingValue(I);
569  Instruction *Inst = dyn_cast<Instruction>(Op);
570  assert(Inst && "We shouldn't fold AMX instruction!");
571  Incomings.push_back(Inst);
572  }
573 
574  Value *StorePtr = updatePhiIncomings(BB, Incomings);
575  replacePhiDefWithLoad(PHI, StorePtr);
576 }
577 
578 // Store the defined tile and load it before each use.
579 // None of its users are PHI nodes.
580 // e.g.
581 // ------------------------------------------------------
582 // def %td = ...
583 // ...
584 // "use %td"
585 // ------------------------------------------------------
586 // -->
587 // ------------------------------------------------------
588 // def %td = ...
589 // call void @llvm.x86.tilestored64.internal(mem, %td)
590 // ...
591 // %td2 = call x86_amx @llvm.x86.tileloadd64.internal(mem)
592 // "use %td2"
593 // ------------------------------------------------------
594 void X86VolatileTileData::volatileTileNonPHI(Instruction *I) {
595  BasicBlock *BB = I->getParent();
596  Value *I8Ptr = getAllocaPos(BB);
597  User *Store = createTileStore(I, I8Ptr);
598 
599  // All its uses should load from stored mem.
600  for (Use &U : I->uses()) {
601  User *V = U.getUser();
602  assert(!isa<PHINode>(V) && "PHI Nodes should be excluded!");
603  if (V != Store)
604  replaceWithTileLoad(U, I8Ptr);
605  }
606 }
607 
608 // Volatile Tile Model:
609 // 1) All uses of tile data come from a tileload just in time.
610 // 2) All defs of tile data are tilestored to memory immediately.
611 // For example:
612 // --------------------------------------------------------------------------
613 // %t1 = call x86_amx @llvm.x86.tileloadd64.internal(m, k, ...) key
614 // %t2 = call x86_amx @llvm.x86.tileloadd64.internal(k, n, ...)
615 // %t3 = call x86_amx @llvm.x86.tileloadd64.internal(m, n, ...) amx
616 // %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
617 // call void @llvm.x86.tilestored64.internal(... td) area
618 // --------------------------------------------------------------------------
619 // 3) No terminators, calls, or other AMX instructions in the key amx area.
620 bool X86VolatileTileData::volatileTileData() {
621  bool Changed = false;
622  for (BasicBlock &BB : F) {
623  SmallVector<Instruction *, 8> PHIInsts;
624  SmallVector<Instruction *, 8> AMXDefInsts;
625 
626  for (Instruction &I : BB) {
627  if (!I.getType()->isX86_AMXTy())
628  continue;
629  if (isa<PHINode>(&I))
630  PHIInsts.push_back(&I);
631  else
632  AMXDefInsts.push_back(&I);
633  }
634 
635  // First we "volatile" the non-PHI-related AMX intrinsics.
636  for (Instruction *I : AMXDefInsts) {
637  if (isIncomingOfPHI(I))
638  continue;
639  volatileTileNonPHI(I);
640  Changed = true;
641  }
642 
643  for (Instruction *I : PHIInsts) {
644  volatileTilePHI(dyn_cast<PHINode>(I));
645  Changed = true;
646  }
647  }
648  return Changed;
649 }
650 
651 } // anonymous namespace
652 
653 namespace {
654 
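// Lower the llvm.x86.cast.vector.to.tile / llvm.x86.cast.tile.to.vector
// intrinsics: first cancel matching cast pairs (combineAMXcast), including
// across PHI nodes, then lower any remaining casts through a stack slot
// (transformAllAMXCast).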
655 class X86LowerAMXCast {
656  Function &Func;
657 
658 public:
659  X86LowerAMXCast(Function &F) : Func(F) {}
660  bool combineAMXcast(TargetLibraryInfo *TLI);
661  bool transformAMXCast(IntrinsicInst *AMXCast);
662  bool transformAllAMXCast();
663  bool optimizeAMXCastFromPhi(IntrinsicInst *CI, PHINode *PN,
664  SmallSetVector<Instruction *, 16> &DeadInst);
665 };
666 
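// Erase I if it is trivially dead, in the style of the generic DCE helper:
// its operands are nulled out first, and any operand that becomes trivially
// dead as a result is queued on WorkList for a later iteration.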
667 static bool DCEInstruction(Instruction *I,
668  SmallSetVector<Instruction *, 16> &WorkList,
669  const TargetLibraryInfo *TLI) {
670  if (isInstructionTriviallyDead(I, TLI)) {
671  salvageDebugInfo(*I);
672  salvageKnowledge(I);
673 
674  // Null out all of the instruction's operands to see if any operand becomes
675  // dead as we go.
676  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
677  Value *OpV = I->getOperand(i);
678  I->setOperand(i, nullptr);
679 
680  if (!OpV->use_empty() || I == OpV)
681  continue;
682 
683  // If the operand is an instruction that became dead as we nulled out the
684  // operand, and if it is 'trivially' dead, delete it in a future loop
685  // iteration.
686  if (Instruction *OpI = dyn_cast<Instruction>(OpV)) {
687  if (isInstructionTriviallyDead(OpI, TLI)) {
688  WorkList.insert(OpI);
689  }
690  }
691  }
692  I->eraseFromParent();
693  return true;
694  }
695  return false;
696 }
697 
698 /// This function handles the following case:
699 ///
700 /// A -> B amxcast
701 /// PHI
702 /// B -> A amxcast
703 ///
704 /// All the related PHI nodes can be replaced by new PHI nodes with type A.
705 /// The uses of \p CI can be changed to the new PHI node corresponding to \p PN.
706 bool X86LowerAMXCast::optimizeAMXCastFromPhi(
707  IntrinsicInst *CI, PHINode *PN,
708  SmallSetVector<Instruction *, 16> &DeadInst) {
709  IRBuilder<> Builder(CI);
710  Value *Src = CI->getOperand(0);
711  Type *SrcTy = Src->getType(); // Type B
712  Type *DestTy = CI->getType(); // Type A
713 
714  SmallVector<PHINode *, 4> PhiWorklist;
715  SmallSetVector<PHINode *, 4> OldPhiNodes;
716 
717  // Find all of the A->B casts and PHI nodes.
718  // We need to inspect all related PHI nodes, but PHIs can be cyclic, so
719  // OldPhiNodes is used to track all known PHI nodes, before adding a new
720  // PHI to PhiWorklist, it is checked against and added to OldPhiNodes first.
721  PhiWorklist.push_back(PN);
722  OldPhiNodes.insert(PN);
723  while (!PhiWorklist.empty()) {
724  auto *OldPN = PhiWorklist.pop_back_val();
725  for (Value *IncValue : OldPN->incoming_values()) {
726  // TODO: Currently we ignore cases where the incoming value is a constant.
727  // In the future, we might support constants.
728  if (isa<Constant>(IncValue))
729  return false;
730 
731  if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
732  if (OldPhiNodes.insert(PNode))
733  PhiWorklist.push_back(PNode);
734  continue;
735  }
736  Instruction *ACI = dyn_cast<Instruction>(IncValue);
737  if (ACI && isAMXCast(ACI)) {
738  // Verify it's a A->B cast.
739  Type *TyA = ACI->getOperand(0)->getType();
740  Type *TyB = ACI->getType();
741  if (TyA != DestTy || TyB != SrcTy)
742  return false;
743  continue;
744  }
745  return false;
746  }
747  }
748 
749  // Check that each user of each old PHI node is something that we can
750  // rewrite, so that all of the old PHI nodes can be cleaned up afterwards.
751  for (auto *OldPN : OldPhiNodes) {
752  for (User *V : OldPN->users()) {
753  Instruction *ACI = dyn_cast<Instruction>(V);
754  if (ACI && isAMXCast(ACI)) {
755  // Verify it's a B->A cast.
756  Type *TyB = ACI->getOperand(0)->getType();
757  Type *TyA = ACI->getType();
758  if (TyA != DestTy || TyB != SrcTy)
759  return false;
760  } else if (auto *PHI = dyn_cast<PHINode>(V)) {
761  // As long as the user is another old PHI node, then even if we don't
762  // rewrite it, the PHI web we're considering won't have any users
763  // outside itself, so it'll be dead.
764  // example:
765  // bb.0:
766  // %0 = amxcast ...
767  // bb.1:
768  // %1 = amxcast ...
769  // bb.2:
770  // %goodphi = phi %0, %1
771  // %3 = amxcast %goodphi
772  // bb.3:
773  // %goodphi2 = phi %0, %goodphi
774  // %4 = amxcast %goodphi2
775  // When optimizeAMXCastFromPhi processes %3 and %goodphi, %goodphi2 is
776  // outside the phi-web, so the combination stops. When
777  // optimizeAMXCastFromPhi processes %4 and %goodphi2, the optimization
778  // will be done.
779  if (OldPhiNodes.count(PHI) == 0)
780  return false;
781  } else
782  return false;
783  }
784  }
785 
786  // For each old PHI node, create a corresponding new PHI node with type A.
787  SmallDenseMap<PHINode *, PHINode *> NewPNodes;
788  for (auto *OldPN : OldPhiNodes) {
789  Builder.SetInsertPoint(OldPN);
790  PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
791  NewPNodes[OldPN] = NewPN;
792  }
793 
794  // Fill in the operands of new PHI nodes.
795  for (auto *OldPN : OldPhiNodes) {
796  PHINode *NewPN = NewPNodes[OldPN];
797  for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
798  Value *V = OldPN->getOperand(j);
799  Value *NewV = nullptr;
800  Instruction *ACI = dyn_cast<Instruction>(V);
801  // There should not be an AMXcast from a constant.
802  if (ACI && isAMXCast(ACI))
803  NewV = ACI->getOperand(0);
804  else if (auto *PrevPN = dyn_cast<PHINode>(V))
805  NewV = NewPNodes[PrevPN];
806  assert(NewV);
807  NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
808  }
809  }
810 
811  // Traverse all accumulated PHI nodes and process their users,
812  // which are Stores and BitCasts. Without this processing,
813  // new PHI nodes could be replicated and could lead to extra
814  // moves generated after DeSSA.
815  // If there is a store with type B, change it to type A.
816 
817  // Replace users of BitCast B->A with NewPHI. This will help
818  // later to get rid of the closure formed by the old PHI nodes.
819  for (auto *OldPN : OldPhiNodes) {
820  PHINode *NewPN = NewPNodes[OldPN];
821  for (User *V : make_early_inc_range(OldPN->users())) {
822  Instruction *ACI = dyn_cast<Instruction>(V);
823  if (ACI && isAMXCast(ACI)) {
824  Type *TyB = ACI->getOperand(0)->getType();
825  Type *TyA = ACI->getType();
826  assert(TyA == DestTy && TyB == SrcTy);
827  (void)TyA;
828  (void)TyB;
829  ACI->replaceAllUsesWith(NewPN);
830  DeadInst.insert(ACI);
831  } else if (auto *PHI = dyn_cast<PHINode>(V)) {
832  // We don't need to push the PHINode into DeadInst since it is an operand
833  // of rootPN; DCE can safely delete rootPN's operands if rootPN is dead.
834  assert(OldPhiNodes.contains(PHI));
835  (void)PHI;
836  } else
837  llvm_unreachable("all uses should be handled");
838  }
839  }
840  return true;
841 }
842 
843 bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) {
844  bool Change = false;
845  // Collect tile cast instruction.
846  SmallVector<Instruction *, 8> Vec2TileInsts;
847  SmallVector<Instruction *, 8> Tile2VecInsts;
848  SmallVector<Instruction *, 8> PhiCastWorkList;
849  SmallSetVector<Instruction *, 16> DeadInst;
850  for (BasicBlock &BB : Func) {
851  for (Instruction &I : BB) {
852  Value *Vec;
853  if (match(&I,
854  m_Intrinsic<Intrinsic::x86_cast_vector_to_tile>(m_Value(Vec))))
855  Vec2TileInsts.push_back(&I);
856  else if (match(&I, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(
857  m_Value(Vec))))
858  Tile2VecInsts.push_back(&I);
859  }
860  }
861 
862  auto Convert = [&](SmallVectorImpl<Instruction *> &Insts, Intrinsic::ID IID) {
863  for (auto *Inst : Insts) {
864  for (User *U : Inst->users()) {
865  IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
866  if (!II || II->getIntrinsicID() != IID)
867  continue;
868  // T1 = vec2tile V0
869  // V2 = tile2vec T1
870  // V3 = OP V2
871  // -->
872  // T1 = vec2tile V0
873  // V2 = tile2vec T1
874  // V3 = OP V0
875  II->replaceAllUsesWith(Inst->getOperand(0));
876  Change = true;
877  }
878  }
879  };
880 
881  Convert(Vec2TileInsts, Intrinsic::x86_cast_tile_to_vector);
882  Convert(Tile2VecInsts, Intrinsic::x86_cast_vector_to_tile);
883 
884  auto EraseInst = [&](SmallVectorImpl<Instruction *> &Insts) {
885  for (auto *Inst : Insts) {
886  if (Inst->use_empty()) {
887  Inst->eraseFromParent();
888  Change = true;
889  }
890  }
891  };
892 
893  EraseInst(Vec2TileInsts);
894  EraseInst(Tile2VecInsts);
895 
896  // Handle the A->B->A cast where there is an intervening PHI node.
897  for (BasicBlock &BB : Func) {
898  for (Instruction &I : BB) {
899  if (isAMXCast(&I)) {
900  if (isa<PHINode>(I.getOperand(0)))
901  PhiCastWorkList.push_back(&I);
902  }
903  }
904  }
905  for (auto *I : PhiCastWorkList) {
906  // We skip the dead Amxcast.
907  if (DeadInst.contains(I))
908  continue;
909  PHINode *PN = cast<PHINode>(I->getOperand(0));
910  if (optimizeAMXCastFromPhi(cast<IntrinsicInst>(I), PN, DeadInst)) {
911  DeadInst.insert(PN);
912  Change = true;
913  }
914  }
915 
916  // Since we create new PHIs and merge AMXCasts, some old PHIs and AMXCasts
917  // might have no uses. We do some dead code elimination for them.
918  while (!DeadInst.empty()) {
919  Instruction *I = DeadInst.pop_back_val();
920  Change |= DCEInstruction(I, DeadInst, TLI);
921  }
922  return Change;
923 }
924 
925 // There might be remaining AMXcasts after combineAMXcast, and they should be
926 // handled elegantly.
927 bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) {
928  IRBuilder<> Builder(AMXCast);
929  AllocaInst *AllocaAddr;
930  Value *I8Ptr, *Stride;
931  auto *Src = AMXCast->getOperand(0);
932 
933  auto Prepare = [&](Type *MemTy) {
934  AllocaAddr = createAllocaInstAtEntry(Builder, AMXCast->getParent(), MemTy);
935  I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
936  Stride = Builder.getInt64(64);
937  };
938 
939  if (AMXCast->getType()->isX86_AMXTy()) {
940  // %2 = amxcast <225 x i32> %src to x86_amx
941  // call void @llvm.x86.tilestored64.internal(i16 15, i16 60,
942  // i8* %addr3, i64 60, x86_amx %2)
943  // -->
944  // %addr = alloca <225 x i32>, align 64
945  // store <225 x i32> %src, <225 x i32>* %addr, align 64
946  // %addr2 = bitcast <225 x i32>* %addr to i8*
947  // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 15, i16 60,
948  // i8* %addr2,
949  // i64 60)
950  // call void @llvm.x86.tilestored64.internal(i16 15, i16 60,
951  // i8* %addr3, i64 60, x86_amx %2)
952  Use &U = *(AMXCast->use_begin());
953  unsigned OpNo = U.getOperandNo();
954  auto *II = dyn_cast<IntrinsicInst>(U.getUser());
955  if (!II)
956  return false; // May be bitcast from x86amx to <256 x i32>.
957  Prepare(AMXCast->getOperand(0)->getType());
958  Builder.CreateStore(Src, AllocaAddr);
959  // TODO: we can pick a constant operand for the shape.
960  Value *Row = nullptr, *Col = nullptr;
961  std::tie(Row, Col) = getShape(II, OpNo);
962  std::array<Value *, 4> Args = {
963  Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
964  Value *NewInst = Builder.CreateIntrinsic(
965  Intrinsic::x86_tileloadd64_internal, None, Args);
966  AMXCast->replaceAllUsesWith(NewInst);
967  AMXCast->eraseFromParent();
968  } else {
969  // %2 = amxcast x86_amx %src to <225 x i32>
970  // -->
971  // %addr = alloca <225 x i32>, align 64
972  // %addr2 = bitcast <225 x i32>* to i8*
973  // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col,
974  // i8* %addr2, i64 %stride)
975  // %2 = load <225 x i32>, <225 x i32>* %addr, align 64
976  auto *II = dyn_cast<IntrinsicInst>(Src);
977  if (!II)
978  return false; // May be bitcast from <256 x i32> to x86amx.
979  Prepare(AMXCast->getType());
980  Value *Row = II->getOperand(0);
981  Value *Col = II->getOperand(1);
982  std::array<Value *, 5> Args = {
983  Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty()), Src};
984  Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
985  Value *NewInst = Builder.CreateLoad(AMXCast->getType(), AllocaAddr);
986  AMXCast->replaceAllUsesWith(NewInst);
987  AMXCast->eraseFromParent();
988  }
989 
990  return true;
991 }
992 
993 bool X86LowerAMXCast::transformAllAMXCast() {
994  bool Change = false;
995  // Collect tile cast instruction.
996  SmallVector<Instruction *, 8> WorkLists;
997  for (BasicBlock &BB : Func) {
998  for (Instruction &I : BB) {
999  if (isAMXCast(&I))
1000  WorkLists.push_back(&I);
1001  }
1002  }
1003 
1004  for (auto *Inst : WorkLists) {
1005  Change |= transformAMXCast(cast<IntrinsicInst>(Inst));
1006  }
1007 
1008  return Change;
1009 }
1010 
1011 } // anonymous namespace
1012 
1013 namespace {
1014 
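// Legacy pass manager wrapper: runs the AMX cast combining/lowering, then the
// <256 x i32> bitcast lowering, and finally, when compiling at
// CodeGenOpt::None, the volatile-tile transformation that prepares for fast
// register allocation.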
1015 class X86LowerAMXTypeLegacyPass : public FunctionPass {
1016 public:
1017  static char ID;
1018 
1019  X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {
1020  initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry());
1021  }
1022 
1023  bool runOnFunction(Function &F) override {
1024  bool C = false;
1025  TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
1026  TargetLibraryInfo *TLI =
1027  &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
1028  X86LowerAMXCast LAC(F);
1029  C |= LAC.combineAMXcast(TLI);
1030  // There might be remaining AMXcasts after combineAMXcast, and they should
1031  // be handled elegantly.
1032  C |= LAC.transformAllAMXCast();
1033 
1034  X86LowerAMXType LAT(F);
1035  C |= LAT.visit();
1036 
1037  // Prepare for fast register allocation at O0.
1038  // TODO: It may be better to check the volatile model of the AMX code,
1039  // not just Attribute::OptimizeNone and CodeGenOpt::None.
1040  if (TM->getOptLevel() == CodeGenOpt::None) {
1041  // If the front end does not use O0 but the mid/back end does (e.g.
1042  // "clang -O2 -S -emit-llvm t.c" + "llc t.ll"), we should make
1043  // sure the AMX data is volatile, which is necessary for AMX fast
1044  // register allocation.
1045  if (!F.hasFnAttribute(Attribute::OptimizeNone)) {
1046  X86VolatileTileData VTD(F);
1047  C = VTD.volatileTileData() || C;
1048  }
1049  }
1050 
1051  return C;
1052  }
1053 
1054  void getAnalysisUsage(AnalysisUsage &AU) const override {
1055  AU.setPreservesCFG();
1056  AU.addRequired<TargetPassConfig>();
1057  AU.addRequired<TargetLibraryInfoWrapperPass>();
1058  }
1059 };
1060 
1061 } // anonymous namespace
1062 
1063 static const char PassName[] = "Lower AMX type for load/store";
1064 char X86LowerAMXTypeLegacyPass::ID = 0;
1065 INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
1066  false)
1067 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
1068 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
1069 INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
1070  false)
1071 
1072 FunctionPass *llvm::createX86LowerAMXTypePass() {
1073  return new X86LowerAMXTypeLegacyPass();
1074 }