AMDGPUInstCombineIntrinsic.cpp
1//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements the AMDGPU-specific InstCombine hooks exposed through
11// TargetTransformInfo. It uses the target's detailed information to simplify
12// and canonicalize AMDGPU intrinsic calls, while letting the target-independent
13// InstCombine transforms handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
21#include "llvm/IR/Dominators.h"
22#include "llvm/IR/IntrinsicsAMDGPU.h"
24#include <optional>
25
26using namespace llvm;
27using namespace llvm::PatternMatch;
28
29#define DEBUG_TYPE "AMDGPUtti"
30
31namespace {
32
33struct AMDGPUImageDMaskIntrinsic {
34 unsigned Intr;
35};
36
37#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
38#include "AMDGPUGenSearchableTables.inc"
39
40} // end anonymous namespace
41
42// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
43//
44// A single NaN input is folded to minnum, so we rely on that folding for
45// handling NaNs.
46static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
47 const APFloat &Src2) {
48 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
49
50 APFloat::cmpResult Cmp0 = Max3.compare(Src0);
51 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
52 if (Cmp0 == APFloat::cmpEqual)
53 return maxnum(Src1, Src2);
54
55 APFloat::cmpResult Cmp1 = Max3.compare(Src1);
56 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
57 if (Cmp1 == APFloat::cmpEqual)
58 return maxnum(Src0, Src2);
59
60 return maxnum(Src0, Src1);
61}
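// A quick worked example of the constant fold above: with Src0 = 4.0,
// Src1 = 1.0 and Src2 = 2.0, Max3 is 4.0 and compares equal to Src0, so the
// helper returns maxnum(1.0, 2.0) == 2.0, the median of the three inputs.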
62
63// Check if a value can be converted to a 16-bit value without losing
64// precision.
65// The value is expected to be either a float (IsFloat = true) or an unsigned
66// integer (IsFloat = false).
67static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
68 Type *VTy = V.getType();
69 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
70 // The value is already 16-bit, so we don't want to convert to 16-bit again!
71 return false;
72 }
73 if (IsFloat) {
74 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
75 // We need to check that if we cast the index down to a half, we do not
76 // lose precision.
77 APFloat FloatValue(ConstFloat->getValueAPF());
78 bool LosesInfo = true;
79 FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
80 &LosesInfo);
81 return !LosesInfo;
82 }
83 } else {
84 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
85 // We need to check that if we cast the index down to an i16, we do not
86 // lose precision.
87 APInt IntValue(ConstInt->getValue());
88 return IntValue.getActiveBits() <= 16;
89 }
90 }
91
92 Value *CastSrc;
93 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
94 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
95 if (IsExt) {
96 Type *CastSrcTy = CastSrc->getType();
97 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
98 return true;
99 }
100
101 return false;
102}
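// Illustrative cases for the check above (a sketch, not an exhaustive list):
// a ConstantFP such as 2.5f converts to half exactly, so it qualifies; an i32
// constant qualifies when its active bits fit in 16 bits; an fpext/zext from
// half/i16 qualifies because the original 16-bit source can be reused; a value
// that is already 16-bit is rejected since no narrowing is needed.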
103
104// Convert a value to 16-bit.
105static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
106 Type *VTy = V.getType();
107 if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
108 return cast<Instruction>(&V)->getOperand(0);
109 if (VTy->isIntegerTy())
110 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
111 if (VTy->isFloatingPointTy())
112 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
113
114 llvm_unreachable("Should never be called!");
115}
116
117/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
118/// modified arguments (based on OldIntr) and replaces InstToReplace with
119/// this newly created intrinsic call.
120static std::optional<Instruction *> modifyIntrinsicCall(
121 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
122 InstCombiner &IC,
123 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
124 Func) {
125 SmallVector<Type *, 4> ArgTys;
126 if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
127 return std::nullopt;
128
129 SmallVector<Value *, 8> Args(OldIntr.args());
130
131 // Modify arguments and types
132 Func(Args, ArgTys);
133
134 CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, ArgTys, Args);
135 NewCall->takeName(&OldIntr);
136 NewCall->copyMetadata(OldIntr);
137 if (isa<FPMathOperator>(NewCall))
138 NewCall->copyFastMathFlags(&OldIntr);
139
140 // Erase and replace uses
141 if (!InstToReplace.getType()->isVoidTy())
142 IC.replaceInstUsesWith(InstToReplace, NewCall);
143
144 bool RemoveOldIntr = &OldIntr != &InstToReplace;
145
146 auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
147 if (RemoveOldIntr)
148 IC.eraseInstFromFunction(OldIntr);
149
150 return RetValue;
151}
152
153static std::optional<Instruction *>
154simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
155 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
156 IntrinsicInst &II, InstCombiner &IC) {
157 // Optimize _L to _LZ when _L is zero
158 if (const auto *LZMappingInfo =
159 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
160 if (auto *ConstantLod =
161 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
162 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
163 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
164 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
165 ImageDimIntr->Dim);
166 return modifyIntrinsicCall(
167 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
168 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
169 });
170 }
171 }
172 }
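// A minimal IR sketch of the rewrite above (intrinsic signatures abbreviated):
//   %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d(..., float 0.0, ...)
// becomes the LOD-less variant with the lod operand erased:
//   %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d(...)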
173
174 // Optimize _mip away, when 'lod' is zero
175 if (const auto *MIPMappingInfo =
176 AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
177 if (auto *ConstantMip =
178 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
179 if (ConstantMip->isZero()) {
180 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
181 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
182 ImageDimIntr->Dim);
183 return modifyIntrinsicCall(
184 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
185 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
186 });
187 }
188 }
189 }
190
191 // Optimize _bias away when 'bias' is zero
192 if (const auto *BiasMappingInfo =
193 AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
194 if (auto *ConstantBias =
195 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
196 if (ConstantBias->isZero()) {
197 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
198 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
199 ImageDimIntr->Dim);
200 return modifyIntrinsicCall(
201 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
202 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
203 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
204 });
205 }
206 }
207 }
208
209 // Optimize _offset away when 'offset' is zero
210 if (const auto *OffsetMappingInfo =
211 AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
212 if (auto *ConstantOffset =
213 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
214 if (ConstantOffset->isZero()) {
215 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
216 AMDGPU::getImageDimIntrinsicByBaseOpcode(
217 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
218 return modifyIntrinsicCall(
219 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
220 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
221 });
222 }
223 }
224 }
225
226 // Try to use D16
227 if (ST->hasD16Images()) {
228
229 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
230 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
231
232 if (BaseOpcode->HasD16) {
233
234 // If the only use of the image intrinsic is an fptrunc to half, then both
235 // the fptrunc and the image intrinsic are replaced with the D16 variant of
236 // the image intrinsic.
237 if (II.hasOneUse()) {
238 Instruction *User = II.user_back();
239
240 if (User->getOpcode() == Instruction::FPTrunc &&
241 User->getType()->isHalfTy()) {
242
243 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
244 [&](auto &Args, auto &ArgTys) {
245 // Change return type of image intrinsic.
246 // Set it to return type of fptrunc.
247 ArgTys[0] = User->getType();
248 });
249 }
250 }
251
252 // Only perform D16 folding if every user of the image sample is
253 // an ExtractElementInst immediately followed by an FPTrunc to half.
254 SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
255 ExtractTruncPairs;
256 bool AllHalfExtracts = true;
257
258 for (User *U : II.users()) {
259 auto *Ext = dyn_cast<ExtractElementInst>(U);
260 if (!Ext || !Ext->hasOneUse()) {
261 AllHalfExtracts = false;
262 break;
263 }
264
265 auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
266 if (!Tr || !Tr->getType()->isHalfTy()) {
267 AllHalfExtracts = false;
268 break;
269 }
270
271 ExtractTruncPairs.emplace_back(Ext, Tr);
272 }
273
274 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
275 auto *VecTy = cast<VectorType>(II.getType());
276 Type *HalfVecTy =
277 VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
278
279 // Obtain the original image sample intrinsic's signature
280 // and replace its return type with the half-vector for D16 folding
281 SmallVector<Type *, 8> SigTys;
282 Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys);
283 SigTys[0] = HalfVecTy;
284
285 Module *M = II.getModule();
286 Function *HalfDecl =
287 Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys);
288
289 II.mutateType(HalfVecTy);
290 II.setCalledFunction(HalfDecl);
291
292 IRBuilder<> Builder(II.getContext());
293 for (auto &[Ext, Tr] : ExtractTruncPairs) {
294 Value *Idx = Ext->getIndexOperand();
295
296 Builder.SetInsertPoint(Tr);
297
298 Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
299 HalfExtract->takeName(Tr);
300
301 Tr->replaceAllUsesWith(HalfExtract);
302 }
303
304 for (auto &[Ext, Tr] : ExtractTruncPairs) {
305 IC.eraseInstFromFunction(*Tr);
306 IC.eraseInstFromFunction(*Ext);
307 }
308
309 return &II;
310 }
311 }
312 }
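// A minimal IR sketch of the D16 folding above (signatures abbreviated):
//   %v = call float @llvm.amdgcn.image.sample...f32(...)
//   %h = fptrunc float %v to half
// collapses into a single half-returning sample
//   %h = call half @llvm.amdgcn.image.sample...f16(...)
// and the analogous rewrite is applied when every element of a vector result
// is extracted and then truncated to half.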
313
314 // Try to use A16 or G16
315 if (!ST->hasA16() && !ST->hasG16())
316 return std::nullopt;
317
318 // Address is interpreted as float if the instruction has a sampler or as
319 // unsigned int if there is no sampler.
320 bool HasSampler =
321 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
322 bool FloatCoord = false;
323 // true means derivatives can be converted to 16 bit, coordinates not
324 bool OnlyDerivatives = false;
325
326 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
327 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
328 Value *Coord = II.getOperand(OperandIndex);
329 // If the values are not derived from 16-bit values, we cannot optimize.
330 if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
331 if (OperandIndex < ImageDimIntr->CoordStart ||
332 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
333 return std::nullopt;
334 }
335 // All gradients can be converted, so convert only them
336 OnlyDerivatives = true;
337 break;
338 }
339
340 assert(OperandIndex == ImageDimIntr->GradientStart ||
341 FloatCoord == Coord->getType()->isFloatingPointTy());
342 FloatCoord = Coord->getType()->isFloatingPointTy();
343 }
344
345 if (!OnlyDerivatives && !ST->hasA16())
346 OnlyDerivatives = true; // Only supports G16
347
348 // Check if there is a bias parameter and if it can be converted to f16
349 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
350 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
351 assert(HasSampler &&
352 "Only image instructions with a sampler can have a bias");
353 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
354 OnlyDerivatives = true;
355 }
356
357 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
358 ImageDimIntr->CoordStart))
359 return std::nullopt;
360
361 Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
362 : Type::getInt16Ty(II.getContext());
363
364 return modifyIntrinsicCall(
365 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
366 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
367 if (!OnlyDerivatives) {
368 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
369
370 // Change the bias type
371 if (ImageDimIntr->NumBiasArgs != 0)
372 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
373 }
374
375 unsigned EndIndex =
376 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
377 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
378 OperandIndex < EndIndex; OperandIndex++) {
379 Args[OperandIndex] =
380 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
381 }
382
383 // Convert the bias
384 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
385 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
386 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
387 }
388 });
389}
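// A minimal IR sketch of the A16/G16 path above (signatures abbreviated,
// assuming every coordinate is an 'fpext half to float'):
//   call @llvm.amdgcn.image.sample.2d.v4f32.f32(..., float %xf, float %yf, ...)
// is remangled to take half coordinates directly:
//   call @llvm.amdgcn.image.sample.2d.v4f32.f16(..., half %x, half %y, ...)
// When only the gradients qualify and the target has G16, just the derivative
// operands are narrowed.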
390
391bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
392 const Value *Op0, const Value *Op1,
393 InstCombiner &IC) const {
394 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
395 // infinity, gives +0.0. If we can prove we don't have one of the special
396 // cases then we can use a normal multiply instead.
397 // TODO: Create and use isKnownFiniteNonZero instead of just matching
398 // constants here.
399 if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
400 match(Op1, PatternMatch::m_FiniteNonZero())) {
401 // One operand is not zero or infinity or NaN.
402 return true;
403 }
404
405 SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
406 if (isKnownNeverInfOrNaN(Op0, SQ) && isKnownNeverInfOrNaN(Op1, SQ)) {
407 // Neither operand is infinity or NaN.
408 return true;
409 }
410 return false;
411}
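// For example, fmul_legacy(+0.0, NaN) is +0.0 while a plain 'fmul' of the same
// operands is NaN, so the legacy form can only be relaxed to fmul once one
// operand is a finite non-zero constant or both operands are known to be
// neither infinity nor NaN, as checked above.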
412
413/// Match an fpext from half to float, or a constant we can convert.
414static Value *matchFPExtFromF16(Value *Arg) {
415 Value *Src = nullptr;
416 ConstantFP *CFP = nullptr;
417 if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
418 if (Src->getType()->isHalfTy())
419 return Src;
420 } else if (match(Arg, m_ConstantFP(CFP))) {
421 bool LosesInfo;
422 APFloat Val(CFP->getValueAPF());
423 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
424 if (!LosesInfo)
425 return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
426 }
427 return nullptr;
428}
429
430// Trim all zero components from the end of the vector \p UseV and return
431// an appropriate bitset with known elements.
432static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
433 Instruction *I) {
434 auto *VTy = cast<FixedVectorType>(UseV->getType());
435 unsigned VWidth = VTy->getNumElements();
436 APInt DemandedElts = APInt::getAllOnes(VWidth);
437
438 for (int i = VWidth - 1; i > 0; --i) {
439 auto *Elt = findScalarElement(UseV, i);
440 if (!Elt)
441 break;
442
443 if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
444 if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
445 break;
446 } else {
447 break;
448 }
449
450 DemandedElts.clearBit(i);
451 }
452
453 return DemandedElts;
454}
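// For example, a stored value of <4 x float> <float %x, float %y, float 0.0,
// float 0.0> has its two trailing zero components cleared, so the helper
// returns a demanded mask of 0b0011 and only the first two elements need to
// be written.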
455
456// Trim elements of the end of the vector \p V, if they are
457// equal to the first element of the vector.
458static APInt defaultComponentBroadcast(Value *V) {
459 auto *VTy = cast<FixedVectorType>(V->getType());
460 unsigned VWidth = VTy->getNumElements();
461 APInt DemandedElts = APInt::getAllOnes(VWidth);
462 Value *FirstComponent = findScalarElement(V, 0);
463
464 SmallVector<int> ShuffleMask;
465 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
466 SVI->getShuffleMask(ShuffleMask);
467
468 for (int I = VWidth - 1; I > 0; --I) {
469 if (ShuffleMask.empty()) {
470 auto *Elt = findScalarElement(V, I);
471 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
472 break;
473 } else {
474 // Detect identical elements in the shufflevector result, even though
475 // findScalarElement cannot tell us what that element is.
476 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
477 break;
478 }
479 DemandedElts.clearBit(I);
480 }
481
482 return DemandedElts;
483}
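// For example, a value such as <4 x float> <float %x, float %y, float %x,
// float %x> (a sketch; the vector may also come from a shufflevector) has its
// trailing copies of the first element treated as the default broadcast, so
// the returned demanded mask is 0b0011.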
484
485static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
486 IntrinsicInst &II,
487 APInt DemandedElts,
488 int DMaskIdx = -1,
489 bool IsLoad = true);
490
491/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
492static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
493 return (SqrtOp->getType()->isFloatTy() &&
494 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
495 SqrtOp->getType()->isHalfTy();
496}
497
498/// Return true if we can easily prove that use U is uniform.
499static bool isTriviallyUniform(const Use &U) {
500 Value *V = U.get();
501 if (isa<Constant>(V))
502 return true;
503 if (const auto *A = dyn_cast<Argument>(V))
504 return AMDGPU::isArgPassedInSGPR(A);
505 if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
506 if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
507 return false;
508 // If II and U are in different blocks then there is a possibility of
509 // temporal divergence.
510 return II->getParent() == cast<Instruction>(U.getUser())->getParent();
511 }
512 return false;
513}
514
515/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
516///
517/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
518bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
519 IntrinsicInst &II,
520 unsigned LaneArgIdx) const {
521 unsigned MaskBits = ST->getWavefrontSizeLog2();
522 APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
523
524 KnownBits Known(32);
525 if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
526 return true;
527
528 if (!Known.isConstant())
529 return false;
530
531 // Out of bounds indexes may appear in wave64 code compiled for wave32.
532 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
533 // manually fix it up.
534
535 Value *LaneArg = II.getArgOperand(LaneArgIdx);
536 Constant *MaskedConst =
537 ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
538 if (MaskedConst != LaneArg) {
539 II.getOperandUse(LaneArgIdx).set(MaskedConst);
540 return true;
541 }
542
543 return false;
544}
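// For example, on a wave32 subtarget only the low 5 bits of the lane operand
// are read, so a constant lane index of 35 that survives SimplifyDemandedBits
// is rewritten here to 35 & 31 == 3.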
545
546static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
547 Function &NewCallee, ArrayRef<Value *> Ops) {
548 SmallVector<OperandBundleDef, 2> OpBundles;
549 Old.getOperandBundlesAsDefs(OpBundles);
550
551 CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
552 NewCall->takeName(&Old);
553 return NewCall;
554}
555
556Instruction *
557GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
558 IntrinsicInst &II) const {
559 const auto IID = II.getIntrinsicID();
560 assert(IID == Intrinsic::amdgcn_readlane ||
561 IID == Intrinsic::amdgcn_readfirstlane ||
562 IID == Intrinsic::amdgcn_permlane64);
563
564 Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
565
566 // Only do this if both instructions are in the same block
567 // (so the exec mask won't change) and the readlane is the only user of its
568 // operand.
569 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
570 return nullptr;
571
572 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
573
574 // If this is a readlane, check that the second operand is a constant, or is
575 // defined before OpInst so we know it's safe to move this intrinsic higher.
576 Value *LaneID = nullptr;
577 if (IsReadLane) {
578 LaneID = II.getOperand(1);
579
580 // readlane takes an extra operand for the lane ID, so we must check if that
581 // LaneID value can be used at the point where we want to move the
582 // intrinsic.
583 if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
584 if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
585 return nullptr;
586 }
587 }
588
589 // Hoist the intrinsic (II) through OpInst.
590 //
591 // (II (OpInst x)) -> (OpInst (II x))
592 const auto DoIt = [&](unsigned OpIdx,
593 Function *NewIntrinsic) -> Instruction * {
594 SmallVector<Value *, 2> Ops{OpInst->getOperand(OpIdx)};
595 if (IsReadLane)
596 Ops.push_back(LaneID);
597
598 // Rewrite the intrinsic call.
599 CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
600
601 // Rewrite OpInst so it takes the result of the intrinsic now.
602 Instruction &NewOp = *OpInst->clone();
603 NewOp.setOperand(OpIdx, NewII);
604 return &NewOp;
605 };
606
607 // TODO(?): Should we do more with permlane64?
608 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
609 return nullptr;
610
611 if (isa<UnaryOperator>(OpInst))
612 return DoIt(0, II.getCalledFunction());
613
614 if (isa<CastInst>(OpInst)) {
615 Value *Src = OpInst->getOperand(0);
616 Type *SrcTy = Src->getType();
617 if (!isTypeLegal(SrcTy))
618 return nullptr;
619
620 Function *Remangled =
621 Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
622 return DoIt(0, Remangled);
623 }
624
625 // We can also hoist through binary operators if the other operand is uniform.
626 if (isa<BinaryOperator>(OpInst)) {
627 // FIXME: If we had access to UniformityInfo here we could just check
628 // if the operand is uniform.
629 if (isTriviallyUniform(OpInst->getOperandUse(0)))
630 return DoIt(1, II.getCalledFunction());
631 if (isTriviallyUniform(OpInst->getOperandUse(1)))
632 return DoIt(0, II.getCalledFunction());
633 }
634
635 return nullptr;
636}
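// A minimal IR sketch of the hoisting above (assuming i16 is a legal type on
// the subtarget; intrinsic mangling abbreviated):
//   %e = zext i16 %x to i32
//   %r = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %e)
// becomes
//   %r16 = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %x)
//   %r = zext i16 %r16 to i32
// provided %e has a single user and both instructions are in the same block,
// so the exec mask cannot change between them.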
637
638std::optional<Instruction *>
639GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
640 Intrinsic::ID IID = II.getIntrinsicID();
641 switch (IID) {
642 case Intrinsic::amdgcn_rcp: {
643 Value *Src = II.getArgOperand(0);
644 if (isa<PoisonValue>(Src))
645 return IC.replaceInstUsesWith(II, Src);
646
647 // TODO: Move to ConstantFolding/InstSimplify?
648 if (isa<UndefValue>(Src)) {
649 Type *Ty = II.getType();
650 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
651 return IC.replaceInstUsesWith(II, QNaN);
652 }
653
654 if (II.isStrictFP())
655 break;
656
657 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
658 const APFloat &ArgVal = C->getValueAPF();
659 APFloat Val(ArgVal.getSemantics(), 1);
660 Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
661
662 // This is more precise than the instruction may give.
663 //
664 // TODO: The instruction always flushes denormal results (except for f16),
665 // should this also?
666 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
667 }
668
669 FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
670 if (!FMF.allowContract())
671 break;
672 auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
673 if (!SrcCI)
674 break;
675
676 auto IID = SrcCI->getIntrinsicID();
677 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
678 //
679 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
680 // relaxed.
681 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
682 const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
683 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
684 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
685 break;
686
687 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
688 break;
689
690 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
691 SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
692
693 InnerFMF |= FMF;
694 II.setFastMathFlags(InnerFMF);
695
696 II.setCalledFunction(NewDecl);
697 return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
698 }
699
700 break;
701 }
702 case Intrinsic::amdgcn_sqrt:
703 case Intrinsic::amdgcn_rsq:
704 case Intrinsic::amdgcn_tanh: {
705 Value *Src = II.getArgOperand(0);
706 if (isa<PoisonValue>(Src))
707 return IC.replaceInstUsesWith(II, Src);
708
709 // TODO: Move to ConstantFolding/InstSimplify?
710 if (isa<UndefValue>(Src)) {
711 Type *Ty = II.getType();
712 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
713 return IC.replaceInstUsesWith(II, QNaN);
714 }
715
716 // f16 amdgcn.sqrt is identical to regular sqrt.
717 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
718 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
719 II.getModule(), Intrinsic::sqrt, {II.getType()});
720 II.setCalledFunction(NewDecl);
721 return &II;
722 }
723
724 break;
725 }
726 case Intrinsic::amdgcn_log:
727 case Intrinsic::amdgcn_exp2: {
728 const bool IsLog = IID == Intrinsic::amdgcn_log;
729 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
730 Value *Src = II.getArgOperand(0);
731 Type *Ty = II.getType();
732
733 if (isa<PoisonValue>(Src))
734 return IC.replaceInstUsesWith(II, Src);
735
736 if (IC.getSimplifyQuery().isUndefValue(Src))
737 return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
738
739 if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
740 if (C->isInfinity()) {
741 // exp2(+inf) -> +inf
742 // log2(+inf) -> +inf
743 if (!C->isNegative())
744 return IC.replaceInstUsesWith(II, C);
745
746 // exp2(-inf) -> 0
747 if (IsExp && C->isNegative())
748 return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
749 }
750
751 if (II.isStrictFP())
752 break;
753
754 if (C->isNaN()) {
755 Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
756 return IC.replaceInstUsesWith(II, Quieted);
757 }
758
759 // f32 instruction doesn't handle denormals, f16 does.
760 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
761 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
762 : ConstantFP::get(Ty, 1.0);
763 return IC.replaceInstUsesWith(II, FoldedValue);
764 }
765
766 if (IsLog && C->isNegative())
767 return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
768
769 // TODO: Full constant folding matching hardware behavior.
770 }
771
772 break;
773 }
774 case Intrinsic::amdgcn_frexp_mant:
775 case Intrinsic::amdgcn_frexp_exp: {
776 Value *Src = II.getArgOperand(0);
777 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
778 int Exp;
779 APFloat Significand =
780 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
781
782 if (IID == Intrinsic::amdgcn_frexp_mant) {
783 return IC.replaceInstUsesWith(
784 II, ConstantFP::get(II.getContext(), Significand));
785 }
786
787 // Match instruction special case behavior.
788 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
789 Exp = 0;
790
791 return IC.replaceInstUsesWith(II,
792 ConstantInt::getSigned(II.getType(), Exp));
793 }
794
795 if (isa<PoisonValue>(Src))
796 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
797
798 if (isa<UndefValue>(Src)) {
799 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
800 }
801
802 break;
803 }
804 case Intrinsic::amdgcn_class: {
805 Value *Src0 = II.getArgOperand(0);
806 Value *Src1 = II.getArgOperand(1);
807 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
808 if (CMask) {
809 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
810 II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
811
812 // Clamp any excess bits, as they're illegal for the generic intrinsic.
813 II.setArgOperand(1, ConstantInt::get(Src1->getType(),
814 CMask->getZExtValue() & fcAllFlags));
815 return &II;
816 }
817
818 // Propagate poison.
819 if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
820 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
821
822 // llvm.amdgcn.class(_, undef) -> false
823 if (IC.getSimplifyQuery().isUndefValue(Src1))
824 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
825
826 // llvm.amdgcn.class(undef, mask) -> mask != 0
827 if (IC.getSimplifyQuery().isUndefValue(Src0)) {
828 Value *CmpMask = IC.Builder.CreateICmpNE(
829 Src1, ConstantInt::getNullValue(Src1->getType()));
830 return IC.replaceInstUsesWith(II, CmpMask);
831 }
832 break;
833 }
834 case Intrinsic::amdgcn_cvt_pkrtz: {
835 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
836 Type *HalfTy = Type::getHalfTy(Arg->getContext());
837
838 if (isa<PoisonValue>(Arg))
839 return PoisonValue::get(HalfTy);
840 if (isa<UndefValue>(Arg))
841 return UndefValue::get(HalfTy);
842
843 ConstantFP *CFP = nullptr;
844 if (match(Arg, m_ConstantFP(CFP))) {
845 bool LosesInfo;
846 APFloat Val(CFP->getValueAPF());
847 Val.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
848 return ConstantFP::get(HalfTy, Val);
849 }
850
851 Value *Src = nullptr;
852 if (match(Arg, m_FPExt(m_Value(Src)))) {
853 if (Src->getType()->isHalfTy())
854 return Src;
855 }
856
857 return nullptr;
858 };
859
860 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
861 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
862 Value *V = PoisonValue::get(II.getType());
863 V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
864 V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
865 return IC.replaceInstUsesWith(II, V);
866 }
867 }
868
869 break;
870 }
871 case Intrinsic::amdgcn_cvt_pknorm_i16:
872 case Intrinsic::amdgcn_cvt_pknorm_u16:
873 case Intrinsic::amdgcn_cvt_pk_i16:
874 case Intrinsic::amdgcn_cvt_pk_u16: {
875 Value *Src0 = II.getArgOperand(0);
876 Value *Src1 = II.getArgOperand(1);
877
878 // TODO: Replace call with scalar operation if only one element is poison.
879 if (isa<PoisonValue>(Src0) && isa<PoisonValue>(Src1))
880 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
881
882 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
883 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
884 }
885
886 break;
887 }
888 case Intrinsic::amdgcn_cvt_off_f32_i4: {
889 Value* Arg = II.getArgOperand(0);
890 Type *Ty = II.getType();
891
892 if (isa<PoisonValue>(Arg))
893 return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
894
895 if (IC.getSimplifyQuery().isUndefValue(Arg))
896 return IC.replaceInstUsesWith(II, Constant::getNullValue(Ty));
897
898 ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
899 if (!CArg)
900 break;
901
902 // Tabulated 0.0625 * (sext (CArg & 0xf)).
903 constexpr size_t ResValsSize = 16;
904 static constexpr float ResVals[ResValsSize] = {
905 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
906 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
907 Constant *Res =
908 ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
909 return IC.replaceInstUsesWith(II, Res);
910 }
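// A quick worked example of the table above, which encodes 0.0625 * sext(i4):
// for an argument of 9 only the low four bits are used, sext(0b1001) is -7,
// and the call folds to ResVals[9] == -0.4375 == 0.0625 * -7.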
911 case Intrinsic::amdgcn_ubfe:
912 case Intrinsic::amdgcn_sbfe: {
913 // Decompose simple cases into standard shifts.
914 Value *Src = II.getArgOperand(0);
915 if (isa<UndefValue>(Src)) {
916 return IC.replaceInstUsesWith(II, Src);
917 }
918
919 unsigned Width;
920 Type *Ty = II.getType();
921 unsigned IntSize = Ty->getIntegerBitWidth();
922
923 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
924 if (CWidth) {
925 Width = CWidth->getZExtValue();
926 if ((Width & (IntSize - 1)) == 0) {
927 return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
928 }
929
930 // Hardware ignores high bits, so remove those.
931 if (Width >= IntSize) {
932 return IC.replaceOperand(
933 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
934 }
935 }
936
937 unsigned Offset;
938 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
939 if (COffset) {
940 Offset = COffset->getZExtValue();
941 if (Offset >= IntSize) {
942 return IC.replaceOperand(
943 II, 1,
944 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
945 }
946 }
947
948 bool Signed = IID == Intrinsic::amdgcn_sbfe;
949
950 if (!CWidth || !COffset)
951 break;
952
953 // The case of Width == 0 is handled above, which makes this transformation
954 // safe. If Width == 0, then the ashr and lshr instructions become poison
955 // value since the shift amount would be equal to the bit size.
956 assert(Width != 0);
957
958 // TODO: This allows folding to undef when the hardware has specific
959 // behavior?
960 if (Offset + Width < IntSize) {
961 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
962 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
963 : IC.Builder.CreateLShr(Shl, IntSize - Width);
964 RightShift->takeName(&II);
965 return IC.replaceInstUsesWith(II, RightShift);
966 }
967
968 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
969 : IC.Builder.CreateLShr(Src, Offset);
970
971 RightShift->takeName(&II);
972 return IC.replaceInstUsesWith(II, RightShift);
973 }
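// For example, for i32 operands llvm.amdgcn.ubfe(%x, 8, 8) decomposes as above
// into
//   %shl = shl i32 %x, 16    ; IntSize - Offset - Width
//   %bfe = lshr i32 %shl, 24 ; IntSize - Width
// which extracts bits 8..15 zero-extended; the sbfe form uses ashr instead.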
974 case Intrinsic::amdgcn_exp:
975 case Intrinsic::amdgcn_exp_row:
976 case Intrinsic::amdgcn_exp_compr: {
977 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
978 unsigned EnBits = En->getZExtValue();
979 if (EnBits == 0xf)
980 break; // All inputs enabled.
981
982 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
983 bool Changed = false;
984 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
985 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
986 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
987 Value *Src = II.getArgOperand(I + 2);
988 if (!isa<PoisonValue>(Src)) {
989 IC.replaceOperand(II, I + 2, PoisonValue::get(Src->getType()));
990 Changed = true;
991 }
992 }
993 }
994
995 if (Changed) {
996 return &II;
997 }
998
999 break;
1000 }
1001 case Intrinsic::amdgcn_fmed3: {
1002 Value *Src0 = II.getArgOperand(0);
1003 Value *Src1 = II.getArgOperand(1);
1004 Value *Src2 = II.getArgOperand(2);
1005
1006 for (Value *Src : {Src0, Src1, Src2}) {
1007 if (isa<PoisonValue>(Src))
1008 return IC.replaceInstUsesWith(II, Src);
1009 }
1010
1011 if (II.isStrictFP())
1012 break;
1013
1014 // med3 with a nan input acts like
1015 // v_min_f32(v_min_f32(s0, s1), s2)
1016 //
1017 // Signalingness is ignored with ieee=0, so we fold to
1018 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1019 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1020 // returned signaling nan will not be quieted.
1021
1022 // ieee=1
1023 // s0 snan: s2
1024 // s1 snan: s2
1025 // s2 snan: qnan
1026
1027 // s0 qnan: min(s1, s2)
1028 // s1 qnan: min(s0, s2)
1029 // s2 qnan: min(s0, s1)
1030
1031 // ieee=0
1032 // s0 _nan: min(s1, s2)
1033 // s1 _nan: min(s0, s2)
1034 // s2 _nan: min(s0, s1)
1035
1036 // med3 behavior with infinity
1037 // s0 +inf: max(s1, s2)
1038 // s1 +inf: max(s0, s2)
1039 // s2 +inf: max(s0, s1)
1040 // s0 -inf: min(s1, s2)
1041 // s1 -inf: min(s0, s2)
1042 // s2 -inf: min(s0, s1)
1043
1044 // Checking for NaN before canonicalization provides better fidelity when
1045 // mapping other operations onto fmed3 since the order of operands is
1046 // unchanged.
1047 Value *V = nullptr;
1048 const APFloat *ConstSrc0 = nullptr;
1049 const APFloat *ConstSrc1 = nullptr;
1050 const APFloat *ConstSrc2 = nullptr;
1051
1052 if ((match(Src0, m_APFloat(ConstSrc0)) &&
1053 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1054 isa<UndefValue>(Src0)) {
1055 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1056 switch (fpenvIEEEMode(II)) {
1057 case KnownIEEEMode::On:
1058 // TODO: If Src2 is snan, does it need quieting?
1059 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1060 return IC.replaceInstUsesWith(II, Src2);
1061
1062 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src1, Src2)
1063 : IC.Builder.CreateMinNum(Src1, Src2);
1064 break;
1065 case KnownIEEEMode::Off:
1066 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src1, Src2)
1067 : IC.Builder.CreateMinimumNum(Src1, Src2);
1068 break;
1069 case KnownIEEEMode::Unknown:
1070 break;
1071 }
1072 } else if ((match(Src1, m_APFloat(ConstSrc1)) &&
1073 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1074 isa<UndefValue>(Src1)) {
1075 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1076 switch (fpenvIEEEMode(II)) {
1077 case KnownIEEEMode::On:
1078 // TODO: If Src2 is snan, does it need quieting?
1079 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1080 return IC.replaceInstUsesWith(II, Src2);
1081
1082 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src0, Src2)
1083 : IC.Builder.CreateMinNum(Src0, Src2);
1084 break;
1085 case KnownIEEEMode::Off:
1086 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src0, Src2)
1087 : IC.Builder.CreateMinimumNum(Src0, Src2);
1088 break;
1089 case KnownIEEEMode::Unknown:
1090 break;
1091 }
1092 } else if ((match(Src2, m_APFloat(ConstSrc2)) &&
1093 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1094 isa<UndefValue>(Src2)) {
1095 switch (fpenvIEEEMode(II)) {
1096 case KnownIEEEMode::On:
1097 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1098 auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
1099 return IC.replaceInstUsesWith(II, Quieted);
1100 }
1101
1102 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1103 ? IC.Builder.CreateMaxNum(Src0, Src1)
1104 : IC.Builder.CreateMinNum(Src0, Src1);
1105 break;
1106 case KnownIEEEMode::Off:
1107 V = (ConstSrc2 && ConstSrc2->isNegInfinity())
1108 ? IC.Builder.CreateMinimumNum(Src0, Src1)
1109 : IC.Builder.CreateMaximumNum(Src0, Src1);
1110 break;
1111 case KnownIEEEMode::Unknown:
1112 break;
1113 }
1114 }
1115
1116 if (V) {
1117 if (auto *CI = dyn_cast<CallInst>(V)) {
1118 CI->copyFastMathFlags(&II);
1119 CI->takeName(&II);
1120 }
1121 return IC.replaceInstUsesWith(II, V);
1122 }
1123
1124 bool Swap = false;
1125 // Canonicalize constants to RHS operands.
1126 //
1127 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1128 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1129 std::swap(Src0, Src1);
1130 Swap = true;
1131 }
1132
1133 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
1134 std::swap(Src1, Src2);
1135 Swap = true;
1136 }
1137
1138 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1139 std::swap(Src0, Src1);
1140 Swap = true;
1141 }
1142
1143 if (Swap) {
1144 II.setArgOperand(0, Src0);
1145 II.setArgOperand(1, Src1);
1146 II.setArgOperand(2, Src2);
1147 return &II;
1148 }
1149
1150 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
1151 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
1152 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
1153 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
1154 C2->getValueAPF());
1155 return IC.replaceInstUsesWith(II,
1156 ConstantFP::get(II.getType(), Result));
1157 }
1158 }
1159 }
1160
1161 if (!ST->hasMed3_16())
1162 break;
1163
1164 // Repeat floating-point width reduction done for minnum/maxnum.
1165 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1166 if (Value *X = matchFPExtFromF16(Src0)) {
1167 if (Value *Y = matchFPExtFromF16(Src1)) {
1168 if (Value *Z = matchFPExtFromF16(Src2)) {
1169 Value *NewCall = IC.Builder.CreateIntrinsic(
1170 IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
1171 return new FPExtInst(NewCall, II.getType());
1172 }
1173 }
1174 }
1175
1176 break;
1177 }
1178 case Intrinsic::amdgcn_icmp:
1179 case Intrinsic::amdgcn_fcmp: {
1180 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
1181 // Guard against invalid arguments.
1182 int64_t CCVal = CC->getZExtValue();
1183 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1184 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1185 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1186 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1187 CCVal > CmpInst::LAST_FCMP_PREDICATE)))
1188 break;
1189
1190 Value *Src0 = II.getArgOperand(0);
1191 Value *Src1 = II.getArgOperand(1);
1192
1193 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
1194 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
1195 Constant *CCmp = ConstantFoldCompareInstOperands(
1196 (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
1197 if (CCmp && CCmp->isNullValue()) {
1198 return IC.replaceInstUsesWith(
1199 II, IC.Builder.CreateSExt(CCmp, II.getType()));
1200 }
1201
1202 // The result of V_ICMP/V_FCMP assembly instructions (which this
1203 // intrinsic exposes) is one bit per thread, masked with the EXEC
1204 // register (which contains the bitmask of live threads). So a
1205 // comparison that always returns true is the same as a read of the
1206 // EXEC register.
1207 Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
1208 MDNode *MD = MDNode::get(II.getContext(), MDArgs);
1209 Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
1210 CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
1211 II.getType(), Args);
1212 NewCall->addFnAttr(Attribute::Convergent);
1213 NewCall->takeName(&II);
1214 return IC.replaceInstUsesWith(II, NewCall);
1215 }
1216
1217 // Canonicalize constants to RHS.
1218 CmpInst::Predicate SwapPred =
1219 CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
1220 II.setArgOperand(0, Src1);
1221 II.setArgOperand(1, Src0);
1222 II.setArgOperand(
1223 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
1224 return &II;
1225 }
1226
1227 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1228 break;
1229
1230 // Canonicalize compare eq with true value to compare != 0
1231 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1232 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1233 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1234 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1235 Value *ExtSrc;
1236 if (CCVal == CmpInst::ICMP_EQ &&
1237 ((match(Src1, PatternMatch::m_One()) &&
1238 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
1239 (match(Src1, PatternMatch::m_AllOnes()) &&
1240 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
1241 ExtSrc->getType()->isIntegerTy(1)) {
1242 IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
1243 IC.replaceOperand(II, 2,
1244 ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
1245 return &II;
1246 }
1247
1248 CmpPredicate SrcPred;
1249 Value *SrcLHS;
1250 Value *SrcRHS;
1251
1252 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1253 // intrinsic. The typical use is a wave vote function in the library, which
1254 // will be fed from a user code condition compared with 0. Fold in the
1255 // redundant compare.
1256
1257 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1258 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1259 //
1260 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1261 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1262 if (match(Src1, PatternMatch::m_Zero()) &&
1263 match(Src0, m_ZExtOrSExt(
1264 m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
1265 PatternMatch::m_Value(SrcRHS))))) {
1266 if (CCVal == CmpInst::ICMP_EQ)
1267 SrcPred = CmpInst::getInversePredicate(SrcPred);
1268
1269 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
1270 ? Intrinsic::amdgcn_fcmp
1271 : Intrinsic::amdgcn_icmp;
1272
1273 Type *Ty = SrcLHS->getType();
1274 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
1275 // Promote to next legal integer type.
1276 unsigned Width = CmpType->getBitWidth();
1277 unsigned NewWidth = Width;
1278
1279 // Don't do anything for i1 comparisons.
1280 if (Width == 1)
1281 break;
1282
1283 if (Width <= 16)
1284 NewWidth = 16;
1285 else if (Width <= 32)
1286 NewWidth = 32;
1287 else if (Width <= 64)
1288 NewWidth = 64;
1289 else
1290 break; // Can't handle this.
1291
1292 if (Width != NewWidth) {
1293 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
1294 if (CmpInst::isSigned(SrcPred)) {
1295 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
1296 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
1297 } else {
1298 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
1299 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
1300 }
1301 }
1302 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1303 break;
1304
1305 Value *Args[] = {SrcLHS, SrcRHS,
1306 ConstantInt::get(CC->getType(), SrcPred)};
1307 CallInst *NewCall = IC.Builder.CreateIntrinsic(
1308 NewIID, {II.getType(), SrcLHS->getType()}, Args);
1309 NewCall->takeName(&II);
1310 return IC.replaceInstUsesWith(II, NewCall);
1311 }
1312
1313 break;
1314 }
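// A minimal IR sketch of the wave-vote fold above (intrinsic mangling
// abbreviated):
//   %c = icmp eq i32 %a, %b
//   %z = zext i1 %c to i32
//   %v = call i64 @llvm.amdgcn.icmp(i32 %z, i32 0, i32 33) ; ne
// becomes
//   %v = call i64 @llvm.amdgcn.icmp(i32 %a, i32 %b, i32 32) ; eq
// with sub-legal integer operands first promoted to the next legal width.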
1315 case Intrinsic::amdgcn_mbcnt_hi: {
1316 // exec_hi is all 0, so this is just a copy.
1317 if (ST->isWave32())
1318 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1319 break;
1320 }
1321 case Intrinsic::amdgcn_ballot: {
1322 Value *Arg = II.getArgOperand(0);
1323 if (isa<PoisonValue>(Arg))
1324 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1325
1326 if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
1327 if (Src->isZero()) {
1328 // amdgcn.ballot(i1 0) is zero.
1329 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1330 }
1331 }
1332 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1333 // %b64 = call i64 ballot.i64(...)
1334 // =>
1335 // %b32 = call i32 ballot.i32(...)
1336 // %b64 = zext i32 %b32 to i64
1337 Value *Call = IC.Builder.CreateZExt(
1338 IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1339 {IC.Builder.getInt32Ty()},
1340 {II.getArgOperand(0)}),
1341 II.getType());
1342 Call->takeName(&II);
1343 return IC.replaceInstUsesWith(II, Call);
1344 }
1345 break;
1346 }
1347 case Intrinsic::amdgcn_wavefrontsize: {
1348 if (ST->isWaveSizeKnown())
1349 return IC.replaceInstUsesWith(
1350 II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
1351 break;
1352 }
1353 case Intrinsic::amdgcn_wqm_vote: {
1354 // wqm_vote is identity when the argument is constant.
1355 if (!isa<Constant>(II.getArgOperand(0)))
1356 break;
1357
1358 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1359 }
1360 case Intrinsic::amdgcn_kill: {
1361 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1362 if (!C || !C->getZExtValue())
1363 break;
1364
1365 // amdgcn.kill(i1 1) is a no-op
1366 return IC.eraseInstFromFunction(II);
1367 }
1368 case Intrinsic::amdgcn_update_dpp: {
1369 Value *Old = II.getArgOperand(0);
1370
1371 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1372 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1373 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1374 if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1375 BM->getZExtValue() != 0xF || isa<PoisonValue>(Old))
1376 break;
1377
1378 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1379 return IC.replaceOperand(II, 0, PoisonValue::get(Old->getType()));
1380 }
1381 case Intrinsic::amdgcn_permlane16:
1382 case Intrinsic::amdgcn_permlane16_var:
1383 case Intrinsic::amdgcn_permlanex16:
1384 case Intrinsic::amdgcn_permlanex16_var: {
1385 // Discard vdst_in if it's not going to be read.
1386 Value *VDstIn = II.getArgOperand(0);
1387 if (isa<PoisonValue>(VDstIn))
1388 break;
1389
1390 // FetchInvalid operand idx.
1391 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1392 IID == Intrinsic::amdgcn_permlanex16)
1393 ? 4 /* for permlane16 and permlanex16 */
1394 : 3; /* for permlane16_var and permlanex16_var */
1395
1396 // BoundCtrl operand idx.
1397 // For permlane16 and permlanex16 it should be 5
1398 // For Permlane16_var and permlanex16_var it should be 4
1399 unsigned int BcIdx = FiIdx + 1;
1400
1401 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1402 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1403 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1404 break;
1405
1406 return IC.replaceOperand(II, 0, PoisonValue::get(VDstIn->getType()));
1407 }
1408 case Intrinsic::amdgcn_permlane64:
1409 case Intrinsic::amdgcn_readfirstlane:
1410 case Intrinsic::amdgcn_readlane:
1411 case Intrinsic::amdgcn_ds_bpermute: {
1412 // If the data argument is uniform these intrinsics return it unchanged.
1413 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1414 const Use &Src = II.getArgOperandUse(SrcIdx);
1415 if (isTriviallyUniform(Src))
1416 return IC.replaceInstUsesWith(II, Src.get());
1417
1418 if (IID == Intrinsic::amdgcn_readlane &&
1419 simplifyDemandedLaneMaskArg(IC, II, 1))
1420 return &II;
1421
1422 // If the lane argument of bpermute is uniform, change it to readlane. This
1423 // generates better code and can enable further optimizations because
1424 // readlane is AlwaysUniform.
1425 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1426 const Use &Lane = II.getArgOperandUse(0);
1427 if (isTriviallyUniform(Lane)) {
1428 Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1429 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1430 II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1431 II.setCalledFunction(NewDecl);
1432 II.setOperand(0, Src);
1433 II.setOperand(1, NewLane);
1434 return &II;
1435 }
1436 }
1437
1438 if (IID != Intrinsic::amdgcn_ds_bpermute) {
1439 if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
1440 return Res;
1441 }
1442
1443 return std::nullopt;
1444 }
1445 case Intrinsic::amdgcn_writelane: {
1446 // TODO: Fold bitcast like readlane.
1447 if (simplifyDemandedLaneMaskArg(IC, II, 1))
1448 return &II;
1449 return std::nullopt;
1450 }
1451 case Intrinsic::amdgcn_trig_preop: {
1452 // The intrinsic is declared with name mangling, but currently the
1453 // instruction only exists for f64
1454 if (!II.getType()->isDoubleTy())
1455 break;
1456
1457 Value *Src = II.getArgOperand(0);
1458 Value *Segment = II.getArgOperand(1);
1459 if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1460 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1461
1462 if (isa<UndefValue>(Src)) {
1463 auto *QNaN = ConstantFP::get(
1464 II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
1465 return IC.replaceInstUsesWith(II, QNaN);
1466 }
1467
1468 const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
1469 if (!Csrc)
1470 break;
1471
1472 if (II.isStrictFP())
1473 break;
1474
1475 const APFloat &Fsrc = Csrc->getValueAPF();
1476 if (Fsrc.isNaN()) {
1477 auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
1478 return IC.replaceInstUsesWith(II, Quieted);
1479 }
1480
1481 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1482 if (!Cseg)
1483 break;
1484
1485 unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1486 unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1487 unsigned Shift = SegmentVal * 53;
1488 if (Exponent > 1077)
1489 Shift += Exponent - 1077;
1490
1491 // 2.0/PI table.
1492 static const uint32_t TwoByPi[] = {
1493 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1494 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1495 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1496 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1497 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1498 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1499 0x56033046};
1500
1501 // Return 0 for an out-of-bounds segment (hardware behavior).
1502 unsigned Idx = Shift >> 5;
1503 if (Idx + 2 >= std::size(TwoByPi)) {
1504 APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1505 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1506 }
1507
1508 unsigned BShift = Shift & 0x1f;
1509 uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
1510 uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
1511 if (BShift)
1512 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1513 Thi = Thi >> 11;
1514 APFloat Result = APFloat((double)Thi);
1515
1516 int Scale = -53 - Shift;
1517 if (Exponent >= 1968)
1518 Scale += 128;
1519
1520 Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
1521 return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
1522 }
1523 case Intrinsic::amdgcn_fmul_legacy: {
1524 Value *Op0 = II.getArgOperand(0);
1525 Value *Op1 = II.getArgOperand(1);
1526
1527 for (Value *Src : {Op0, Op1}) {
1528 if (isa<PoisonValue>(Src))
1529 return IC.replaceInstUsesWith(II, Src);
1530 }
1531
1532 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1533 // infinity, gives +0.0.
1534 // TODO: Move to InstSimplify?
1535 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1536 match(Op1, PatternMatch::m_AnyZeroFP()))
1537 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1538
1539 // If we can prove we don't have one of the special cases then we can use a
1540 // normal fmul instruction instead.
1541 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1542 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1543 FMul->takeName(&II);
1544 return IC.replaceInstUsesWith(II, FMul);
1545 }
1546 break;
1547 }
1548 case Intrinsic::amdgcn_fma_legacy: {
1549 Value *Op0 = II.getArgOperand(0);
1550 Value *Op1 = II.getArgOperand(1);
1551 Value *Op2 = II.getArgOperand(2);
1552
1553 for (Value *Src : {Op0, Op1, Op2}) {
1554 if (isa<PoisonValue>(Src))
1555 return IC.replaceInstUsesWith(II, Src);
1556 }
1557
1558 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1559 // infinity, gives +0.0.
1560 // TODO: Move to InstSimplify?
1561 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1562 match(Op1, PatternMatch::m_AnyZeroFP())) {
1563 // It's tempting to just return Op2 here, but that would give the wrong
1564 // result if Op2 was -0.0.
1565 auto *Zero = ConstantFP::getZero(II.getType());
1566 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1567 FAdd->takeName(&II);
1568 return IC.replaceInstUsesWith(II, FAdd);
1569 }
1570
1571 // If we can prove we don't have one of the special cases then we can use a
1572 // normal fma instead.
1573 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1574 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1575 II.getModule(), Intrinsic::fma, II.getType()));
1576 return &II;
1577 }
1578 break;
1579 }
1580 case Intrinsic::amdgcn_is_shared:
1581 case Intrinsic::amdgcn_is_private: {
1582 Value *Src = II.getArgOperand(0);
1583 if (isa<PoisonValue>(Src))
1584 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1585 if (isa<UndefValue>(Src))
1586 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1587
1588 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1589 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1590 break;
1591 }
1592 case Intrinsic::amdgcn_make_buffer_rsrc: {
1593 Value *Src = II.getArgOperand(0);
1594 if (isa<PoisonValue>(Src))
1595 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1596 return std::nullopt;
1597 }
1598 case Intrinsic::amdgcn_raw_buffer_store_format:
1599 case Intrinsic::amdgcn_struct_buffer_store_format:
1600 case Intrinsic::amdgcn_raw_tbuffer_store:
1601 case Intrinsic::amdgcn_struct_tbuffer_store:
1602 case Intrinsic::amdgcn_image_store_1d:
1603 case Intrinsic::amdgcn_image_store_1darray:
1604 case Intrinsic::amdgcn_image_store_2d:
1605 case Intrinsic::amdgcn_image_store_2darray:
1606 case Intrinsic::amdgcn_image_store_2darraymsaa:
1607 case Intrinsic::amdgcn_image_store_2dmsaa:
1608 case Intrinsic::amdgcn_image_store_3d:
1609 case Intrinsic::amdgcn_image_store_cube:
1610 case Intrinsic::amdgcn_image_store_mip_1d:
1611 case Intrinsic::amdgcn_image_store_mip_1darray:
1612 case Intrinsic::amdgcn_image_store_mip_2d:
1613 case Intrinsic::amdgcn_image_store_mip_2darray:
1614 case Intrinsic::amdgcn_image_store_mip_3d:
1615 case Intrinsic::amdgcn_image_store_mip_cube: {
1616 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1617 break;
1618
1619 APInt DemandedElts;
1620 if (ST->hasDefaultComponentBroadcast())
1621 DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1622 else if (ST->hasDefaultComponentZero())
1623 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1624 else
1625 break;
1626
1627 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1628 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1629 false)) {
1630 return IC.eraseInstFromFunction(II);
1631 }
1632
1633 break;
1634 }
1635 case Intrinsic::amdgcn_prng_b32: {
1636 auto *Src = II.getArgOperand(0);
1637 if (isa<UndefValue>(Src)) {
1638 return IC.replaceInstUsesWith(II, Src);
1639 }
1640 return std::nullopt;
1641 }
1642 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
1643 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
1644 Value *Src0 = II.getArgOperand(0);
1645 Value *Src1 = II.getArgOperand(1);
1646 uint64_t CBSZ = cast<ConstantInt>(II.getArgOperand(3))->getZExtValue();
1647 uint64_t BLGP = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
1648 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
1649 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
1650
1651 auto getFormatNumRegs = [](unsigned FormatVal) {
1652 switch (FormatVal) {
1653 case AMDGPU::MFMAScaleFormats::FP6_E2M3:
1654 case AMDGPU::MFMAScaleFormats::FP6_E3M2:
1655 return 6u;
1656 case AMDGPU::MFMAScaleFormats::FP4_E2M1:
1657 return 4u;
1658 case AMDGPU::MFMAScaleFormats::FP8_E4M3:
1659 case AMDGPU::MFMAScaleFormats::FP8_E5M2:
1660 return 8u;
1661 default:
1662 llvm_unreachable("invalid format value");
1663 }
1664 };
1665
1666 bool MadeChange = false;
1667 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
1668 unsigned Src1NumElts = getFormatNumRegs(BLGP);
1669
1670 // Depending on the used format, fewer registers are required so shrink the
1671 // vector type.
1672 if (Src0Ty->getNumElements() > Src0NumElts) {
1673 Src0 = IC.Builder.CreateExtractVector(
1674 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
1675 uint64_t(0));
1676 MadeChange = true;
1677 }
1678
1679 if (Src1Ty->getNumElements() > Src1NumElts) {
1680 Src1 = IC.Builder.CreateExtractVector(
1681 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
1682 uint64_t(0));
1683 MadeChange = true;
1684 }
1685
1686 if (!MadeChange)
1687 return std::nullopt;
1688
1689 SmallVector<Value *, 10> Args(II.args());
1690 Args[0] = Src0;
1691 Args[1] = Src1;
1692
1693 CallInst *NewII = IC.Builder.CreateIntrinsic(
1694 IID, {Src0->getType(), Src1->getType()}, Args, &II);
1695 NewII->takeName(&II);
1696 return IC.replaceInstUsesWith(II, NewII);
1697 }
1698 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
1699 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
1700 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
1701 Value *Src0 = II.getArgOperand(1);
1702 Value *Src1 = II.getArgOperand(3);
1703 unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1704 uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
1705 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
1706 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
1707
1708 bool MadeChange = false;
1709 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
1710 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
1711
1712 // Depending on the used format, fewer registers are required so shrink the
1713 // vector type.
1714 if (Src0Ty->getNumElements() > Src0NumElts) {
1715 Src0 = IC.Builder.CreateExtractVector(
1716 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
1717 IC.Builder.getInt64(0));
1718 MadeChange = true;
1719 }
1720
1721 if (Src1Ty->getNumElements() > Src1NumElts) {
1722 Src1 = IC.Builder.CreateExtractVector(
1723 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
1724 IC.Builder.getInt64(0));
1725 MadeChange = true;
1726 }
1727
1728 if (!MadeChange)
1729 return std::nullopt;
1730
1731 SmallVector<Value *, 13> Args(II.args());
1732 Args[1] = Src0;
1733 Args[3] = Src1;
1734
1735 CallInst *NewII = IC.Builder.CreateIntrinsic(
1736 IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
1737 Args, &II);
1738 NewII->takeName(&II);
1739 return IC.replaceInstUsesWith(II, NewII);
1740 }
1741 }
1742 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1743 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1744 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1745 }
1746 return std::nullopt;
1747}
1748
1749/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1750///
1751/// The result of simplifying amdgcn image and buffer store intrinsics is updating
1752/// definitions of the intrinsic's vector argument, not uses of the result as
1753/// with image and buffer loads.
1754/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1755/// struct returns.
1756 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1757 IntrinsicInst &II,
1758 APInt DemandedElts,
1759 int DMaskIdx, bool IsLoad) {
1760
1761 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1762 : II.getOperand(0)->getType());
1763 unsigned VWidth = IIVTy->getNumElements();
1764 if (VWidth == 1)
1765 return nullptr;
1766 Type *EltTy = IIVTy->getElementType();
1767
1768 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1769 IC.Builder.SetInsertPoint(&II);
1770
1771 // Assume the arguments are unchanged and later override them, if needed.
1772 SmallVector<Value *, 16> Args(II.args());
1773
1774 if (DMaskIdx < 0) {
1775 // Buffer case.
1776
1777 const unsigned ActiveBits = DemandedElts.getActiveBits();
1778 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1779
1780 // Start by assuming the whole prefix of elements is demanded, then clear the
1781 // low bits if the leading components are unused (trailing zeros in the
1782 // demanded mask) and fold them into the buffer offset instead.
1783 DemandedElts = (1 << ActiveBits) - 1;
1784
1785 if (UnusedComponentsAtFront > 0) {
1786 static const unsigned InvalidOffsetIdx = 0xf;
1787
1788 unsigned OffsetIdx;
1789 switch (II.getIntrinsicID()) {
1790 case Intrinsic::amdgcn_raw_buffer_load:
1791 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1792 OffsetIdx = 1;
1793 break;
1794 case Intrinsic::amdgcn_s_buffer_load:
1795 // If resulting type is vec3, there is no point in trimming the
1796 // load with updated offset, as the vec3 would most likely be widened to
1797 // vec4 anyway during lowering.
1798 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1799 OffsetIdx = InvalidOffsetIdx;
1800 else
1801 OffsetIdx = 1;
1802 break;
1803 case Intrinsic::amdgcn_struct_buffer_load:
1804 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1805 OffsetIdx = 2;
1806 break;
1807 default:
1808 // TODO: handle tbuffer* intrinsics.
1809 OffsetIdx = InvalidOffsetIdx;
1810 break;
1811 }
1812
1813 if (OffsetIdx != InvalidOffsetIdx) {
1814 // Clear demanded bits and update the offset.
1815 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1816 auto *Offset = Args[OffsetIdx];
1817 unsigned SingleComponentSizeInBits =
1818 IC.getDataLayout().getTypeSizeInBits(EltTy);
1819 unsigned OffsetAdd =
1820 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1821 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1822 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1823 }
1824 }
1825 } else {
1826 // Image case.
1827
1828 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1829 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1830
1831 // dmask 0 has special semantics, do not simplify.
1832 if (DMaskVal == 0)
1833 return nullptr;
1834
1835 // Mask off values that are undefined because the dmask doesn't cover them
1836 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
1837
1838 unsigned NewDMaskVal = 0;
1839 unsigned OrigLdStIdx = 0;
1840 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1841 const unsigned Bit = 1 << SrcIdx;
1842 if (!!(DMaskVal & Bit)) {
1843 if (!!DemandedElts[OrigLdStIdx])
1844 NewDMaskVal |= Bit;
1845 OrigLdStIdx++;
1846 }
1847 }
1848
1849 if (DMaskVal != NewDMaskVal)
1850 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1851 }
1852
1853 unsigned NewNumElts = DemandedElts.popcount();
1854 if (!NewNumElts)
1855 return PoisonValue::get(IIVTy);
1856
1857 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1858 if (DMaskIdx >= 0)
1859 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1860 return nullptr;
1861 }
1862
1863 // Validate function argument and return types, extracting overloaded types
1864 // along the way.
1865 SmallVector<Type *, 6> OverloadTys;
1866 if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1867 return nullptr;
1868
1869 Type *NewTy =
1870 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1871 OverloadTys[0] = NewTy;
1872
1873 if (!IsLoad) {
1874 SmallVector<int, 8> EltMask;
1875 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1876 if (DemandedElts[OrigStoreIdx])
1877 EltMask.push_back(OrigStoreIdx);
1878
1879 if (NewNumElts == 1)
1880 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
1881 else
1882 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
1883 }
1884
1885 CallInst *NewCall =
1886 IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
1887 NewCall->takeName(&II);
1888 NewCall->copyMetadata(II);
1889
1890 if (IsLoad) {
1891 if (NewNumElts == 1) {
1892 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
1893 DemandedElts.countr_zero());
1894 }
1895
1896 SmallVector<int, 8> EltMask;
1897 unsigned NewLoadIdx = 0;
1898 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1899 if (!!DemandedElts[OrigLoadIdx])
1900 EltMask.push_back(NewLoadIdx++);
1901 else
1902 EltMask.push_back(NewNumElts);
1903 }
1904
1905 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1906
1907 return Shuffle;
1908 }
1909
1910 return NewCall;
1911}
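// A minimal sketch of the effect above, with assumed types: if only the third
// and fourth components of
//   %v = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %off, i32 0, i32 0)
// are used, the load is rewritten to fetch just two components at an adjusted
// offset and the original lanes are recreated with a shuffle:
//   %off8 = add i32 %off, 8
//   %v2   = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %off8, i32 0, i32 0)
//   %new  = shufflevector <2 x float> %v2, <2 x float> poison, <4 x i32> <i32 2, i32 2, i32 0, i32 1> ; replaces %v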
1912
1913 Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
1914 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
1915 APInt &UndefElts) const {
1916 auto *VT = dyn_cast<FixedVectorType>(II.getType());
1917 if (!VT)
1918 return nullptr;
1919
1920 const unsigned FirstElt = DemandedElts.countr_zero();
1921 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
1922 const unsigned MaskLen = LastElt - FirstElt + 1;
1923
1924 unsigned OldNumElts = VT->getNumElements();
1925 if (MaskLen == OldNumElts && MaskLen != 1)
1926 return nullptr;
1927
1928 Type *EltTy = VT->getElementType();
1929 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
1930
1931 // Theoretically we should support these intrinsics for any legal type. Avoid
1932 // introducing cases that aren't direct register types like v3i16.
1933 if (!isTypeLegal(NewVT))
1934 return nullptr;
1935
1936 Value *Src = II.getArgOperand(0);
1937
1938 // Make sure convergence tokens are preserved.
1939 // TODO: CreateIntrinsic should allow directly copying bundles
1940 SmallVector<OperandBundleDef, 2> OpBundles;
1941 II.getOperandBundlesAsDefs(OpBundles);
1942
1943 Module *M = IC.Builder.GetInsertBlock()->getModule();
1944 Function *Remangled =
1945 Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
1946
1947 if (MaskLen == 1) {
1948 Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
1949
1950 // TODO: Preserve callsite attributes?
1951 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
1952
1953 return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
1954 NewCall, FirstElt);
1955 }
1956
1957 SmallVector<int> ExtractMask(MaskLen, -1);
1958 for (unsigned I = 0; I != MaskLen; ++I) {
1959 if (DemandedElts[FirstElt + I])
1960 ExtractMask[I] = FirstElt + I;
1961 }
1962
1963 Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
1964
1965 // TODO: Preserve callsite attributes?
1966 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
1967
1968 SmallVector<int> InsertMask(OldNumElts, -1);
1969 for (unsigned I = 0; I != MaskLen; ++I) {
1970 if (DemandedElts[FirstElt + I])
1971 InsertMask[FirstElt + I] = I;
1972 }
1973
1974 // FIXME: If the call has a convergence bundle, we end up leaving the dead
1975 // call behind.
1976 return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
1977}
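// A small sketch of the lane-intrinsic rewrite above, with assumed types: if
// only element 1 of
//   %r = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> %src)
// is demanded, the call is narrowed to a scalar readfirstlane on the extracted
// element and the result is reinserted at the same position:
//   %e  = extractelement <4 x i32> %src, i64 1
//   %s  = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %e)
//   %r1 = insertelement <4 x i32> poison, i32 %s, i64 1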
1978
1979 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1980 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1981 APInt &UndefElts2, APInt &UndefElts3,
1982 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1983 SimplifyAndSetOp) const {
1984 switch (II.getIntrinsicID()) {
1985 case Intrinsic::amdgcn_readfirstlane:
1986 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
1987 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
1988 case Intrinsic::amdgcn_raw_buffer_load:
1989 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1990 case Intrinsic::amdgcn_raw_buffer_load_format:
1991 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1992 case Intrinsic::amdgcn_raw_tbuffer_load:
1993 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1994 case Intrinsic::amdgcn_s_buffer_load:
1995 case Intrinsic::amdgcn_struct_buffer_load:
1996 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1997 case Intrinsic::amdgcn_struct_buffer_load_format:
1998 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1999 case Intrinsic::amdgcn_struct_tbuffer_load:
2000 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2001 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2002 default: {
2003 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
2004 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
2005 }
2006 break;
2007 }
2008 }
2009 return std::nullopt;
2010}