LLVM 23.0.0git
AMDGPUInstCombineIntrinsic.cpp
Go to the documentation of this file.
1//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
20#include "SIDefines.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/Sequence.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include <optional>
32
33using namespace llvm;
34using namespace llvm::PatternMatch;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
38namespace {
39
// Record type for the AMDGPUImageDMaskIntrinsic table pulled in below.
// NOTE(review): Intr is presumably the intrinsic ID used as the lookup key --
// confirm against the generated table.
40struct AMDGPUImageDMaskIntrinsic {
41 unsigned Intr;
42};
43
// Defining the *_IMPL macro before the include requests the implementation
// part of the table from the .inc file.
// NOTE(review): the .inc file is presumably TableGen-generated and expects the
// struct above to be declared first -- confirm.
44#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
45#include "AMDGPUGenSearchableTables.inc"
46
47} // end anonymous namespace
48
49// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
50//
51// A single NaN input is folded to minnum, so we rely on that folding for
52// handling NaNs.
53static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
54 const APFloat &Src2) {
55 assert(!Src0.isNaN() && !Src1.isNaN() && !Src2.isNaN() &&
56 "nans handled separately");
57 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
58
59 if (Max3.bitwiseIsEqual(Src0))
60 return maxnum(Src1, Src2);
61
62 if (Max3.bitwiseIsEqual(Src1))
63 return maxnum(Src0, Src2);
64
65 return maxnum(Src0, Src1);
66}
67
68// Check if a value can be converted to a 16-bit value without losing
69// precision.
70// The value is expected to be either a float (IsFloat = true) or an unsigned
71// integer (IsFloat = false).
72static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
73 Type *VTy = V.getType();
74 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
75 // The value is already 16-bit, so we don't want to convert to 16-bit again!
76 return false;
77 }
78 if (IsFloat) {
79 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
80 // We need to check that if we cast the index down to a half, we do not
81 // lose precision.
82 APFloat FloatValue(ConstFloat->getValueAPF());
83 bool LosesInfo = true;
85 &LosesInfo);
86 return !LosesInfo;
87 }
88 } else {
89 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
90 // We need to check that if we cast the index down to an i16, we do not
91 // lose precision.
92 APInt IntValue(ConstInt->getValue());
93 return IntValue.getActiveBits() <= 16;
94 }
95 }
96
97 Value *CastSrc;
98 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
99 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
100 if (IsExt) {
101 Type *CastSrcTy = CastSrc->getType();
102 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
103 return true;
104 }
105
106 return false;
107}
108
109// Convert a value to 16-bit.
111 Type *VTy = V.getType();
113 return cast<Instruction>(&V)->getOperand(0);
114 if (VTy->isIntegerTy())
115 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
116 if (VTy->isFloatingPointTy())
117 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
118
119 llvm_unreachable("Should never be called!");
120}
121
122/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
123/// modified arguments (based on OldIntr) and replaces InstToReplace with
124/// this newly created intrinsic call.
125static std::optional<Instruction *> modifyIntrinsicCall(
126 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
127 InstCombiner &IC,
128 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
129 Func) {
130 SmallVector<Type *, 4> OverloadTys;
131 if (!Intrinsic::isSignatureValid(OldIntr.getCalledFunction(), OverloadTys))
132 return std::nullopt;
133
134 SmallVector<Value *, 8> Args(OldIntr.args());
135
136 // Modify arguments and types
137 Func(Args, OverloadTys);
138
139 CallInst *NewCall =
140 IC.Builder.CreateIntrinsicWithoutFolding(NewIntr, OverloadTys, Args);
141 NewCall->takeName(&OldIntr);
142 NewCall->copyMetadata(OldIntr);
143 if (isa<FPMathOperator>(NewCall))
144 NewCall->copyFastMathFlags(&OldIntr);
145 // Copy attributes
146 AttributeList OldAttrList = OldIntr.getAttributes();
147 NewCall->setAttributes(OldAttrList);
148
149 // Erase and replace uses
150 if (!InstToReplace.getType()->isVoidTy())
151 IC.replaceInstUsesWith(InstToReplace, NewCall);
152
153 bool RemoveOldIntr = &OldIntr != &InstToReplace;
154
155 auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
156 if (RemoveOldIntr)
157 IC.eraseInstFromFunction(OldIntr);
158
159 return RetValue;
160}
161
162static std::optional<Instruction *>
164 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
166 // Optimize _L to _LZ when _L is zero
167 if (const auto *LZMappingInfo =
169 if (auto *ConstantLod =
170 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
171 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
172 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
174 ImageDimIntr->Dim);
175 return modifyIntrinsicCall(
176 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
177 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
178 });
179 }
180 }
181 }
182
183 // Optimize _mip away, when 'lod' is zero
184 if (const auto *MIPMappingInfo =
186 if (auto *ConstantMip =
187 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
188 if (ConstantMip->isZero()) {
189 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
190 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
191 ImageDimIntr->Dim);
192 return modifyIntrinsicCall(
193 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
194 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
195 });
196 }
197 }
198 }
199
200 // Optimize _bias away when 'bias' is zero
201 if (const auto *BiasMappingInfo =
203 if (auto *ConstantBias =
204 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
205 if (ConstantBias->isZero()) {
206 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
207 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
208 ImageDimIntr->Dim);
209 return modifyIntrinsicCall(
210 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
211 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
212 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
213 });
214 }
215 }
216 }
217
218 // Optimize _offset away when 'offset' is zero
219 if (const auto *OffsetMappingInfo =
221 if (auto *ConstantOffset =
222 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
223 if (ConstantOffset->isZero()) {
224 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
226 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
227 return modifyIntrinsicCall(
228 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
229 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
230 });
231 }
232 }
233 }
234
235 // Try to use D16
236 if (ST->hasD16Images()) {
237
238 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
240
241 if (BaseOpcode->HasD16) {
242
243 // If the only use of image intrinsic is a fptrunc (with conversion to
244 // half) then both fptrunc and image intrinsic will be replaced with image
245 // intrinsic with D16 flag.
246 if (II.hasOneUse()) {
247 Instruction *User = II.user_back();
248
249 if (User->getOpcode() == Instruction::FPTrunc &&
251
252 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
253 [&](auto &Args, auto &ArgTys) {
254 // Change return type of image intrinsic.
255 // Set it to return type of fptrunc.
256 ArgTys[0] = User->getType();
257 });
258 }
259 }
260
261 // Only perform D16 folding if every user of the image sample is
262 // an ExtractElementInst immediately followed by an FPTrunc to half.
264 ExtractTruncPairs;
265 bool AllHalfExtracts = true;
266
267 for (User *U : II.users()) {
268 auto *Ext = dyn_cast<ExtractElementInst>(U);
269 if (!Ext || !Ext->hasOneUse()) {
270 AllHalfExtracts = false;
271 break;
272 }
273
274 auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
275 if (!Tr || !Tr->getType()->isHalfTy()) {
276 AllHalfExtracts = false;
277 break;
278 }
279
280 ExtractTruncPairs.emplace_back(Ext, Tr);
281 }
282
283 if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
284 auto *VecTy = cast<VectorType>(II.getType());
285 Type *HalfVecTy =
286 VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
287
288 // Obtain the original image sample intrinsic's signature
289 // and replace its return type with the half-vector for D16 folding
290 SmallVector<Type *, 8> OverloadTys;
291 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
292 return std::nullopt;
293
294 OverloadTys[0] = HalfVecTy;
295 Module *M = II.getModule();
297 M, ImageDimIntr->Intr, OverloadTys);
298
299 II.mutateType(HalfVecTy);
300 II.setCalledFunction(HalfDecl);
301
302 IRBuilder<> Builder(II.getContext());
303 for (auto &[Ext, Tr] : ExtractTruncPairs) {
304 Value *Idx = Ext->getIndexOperand();
305
306 Builder.SetInsertPoint(Tr);
307
308 Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
309 HalfExtract->takeName(Tr);
310
311 Tr->replaceAllUsesWith(HalfExtract);
312 }
313
314 for (auto &[Ext, Tr] : ExtractTruncPairs) {
315 IC.eraseInstFromFunction(*Tr);
316 IC.eraseInstFromFunction(*Ext);
317 }
318
319 return &II;
320 }
321 }
322 }
323
324 // Try to use A16 or G16
325 if (!ST->hasA16() && !ST->hasG16())
326 return std::nullopt;
327
328 // Address is interpreted as float if the instruction has a sampler or as
329 // unsigned int if there is no sampler.
330 bool HasSampler =
332 bool FloatCoord = false;
333 // true means derivatives can be converted to 16 bit, coordinates not
334 bool OnlyDerivatives = false;
335
336 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
337 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
338 Value *Coord = II.getOperand(OperandIndex);
339 // If the values are not derived from 16-bit values, we cannot optimize.
340 if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
341 if (OperandIndex < ImageDimIntr->CoordStart ||
342 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
343 return std::nullopt;
344 }
345 // All gradients can be converted, so convert only them
346 OnlyDerivatives = true;
347 break;
348 }
349
350 assert(OperandIndex == ImageDimIntr->GradientStart ||
351 FloatCoord == Coord->getType()->isFloatingPointTy());
352 FloatCoord = Coord->getType()->isFloatingPointTy();
353 }
354
355 if (!OnlyDerivatives && !ST->hasA16())
356 OnlyDerivatives = true; // Only supports G16
357
358 // Check if there is a bias parameter and if it can be converted to f16
359 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
360 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
361 assert(HasSampler &&
362 "Only image instructions with a sampler can have a bias");
363 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
364 OnlyDerivatives = true;
365 }
366
367 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
368 ImageDimIntr->CoordStart))
369 return std::nullopt;
370
371 Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
372 : Type::getInt16Ty(II.getContext());
373
374 return modifyIntrinsicCall(
375 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
376 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
377 if (!OnlyDerivatives) {
378 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
379
380 // Change the bias type
381 if (ImageDimIntr->NumBiasArgs != 0)
382 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
383 }
384
385 unsigned EndIndex =
386 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
387 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
388 OperandIndex < EndIndex; OperandIndex++) {
389 Args[OperandIndex] =
390 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
391 }
392
393 // Convert the bias
394 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
395 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
396 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
397 }
398 });
399}
400
402 const Value *Op0, const Value *Op1,
403 InstCombiner &IC) const {
404 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
405 // infinity, gives +0.0. If we can prove we don't have one of the special
406 // cases then we can use a normal multiply instead.
407 // TODO: Create and use isKnownFiniteNonZero instead of just matching
408 // constants here.
411 // One operand is not zero or infinity or NaN.
412 return true;
413 }
414
416 if (isKnownNeverInfOrNaN(Op0, SQ) && isKnownNeverInfOrNaN(Op1, SQ)) {
417 // Neither operand is infinity or NaN.
418 return true;
419 }
420 return false;
421}
422
423/// Match an fpext from half to float, or a constant we can convert.
425 Value *Src = nullptr;
426 ConstantFP *CFP = nullptr;
427 if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
428 if (Src->getType()->isHalfTy())
429 return Src;
430 } else if (match(Arg, m_ConstantFP(CFP))) {
431 bool LosesInfo;
432 APFloat Val(CFP->getValueAPF());
434 if (!LosesInfo)
435 return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
436 }
437 return nullptr;
438}
439
440// Trim all zero components from the end of the vector \p UseV and return
441// an appropriate bitset with known elements.
443 Instruction *I) {
444 auto *VTy = cast<FixedVectorType>(UseV->getType());
445 unsigned VWidth = VTy->getNumElements();
446 APInt DemandedElts = APInt::getAllOnes(VWidth);
447
448 for (int i = VWidth - 1; i > 0; --i) {
449 auto *Elt = findScalarElement(UseV, i);
450 if (!Elt)
451 break;
452
453 if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
454 if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
455 break;
456 } else {
457 break;
458 }
459
460 DemandedElts.clearBit(i);
461 }
462
463 return DemandedElts;
464}
465
466// Trim elements from the end of the vector \p V, if they are
467// equal to the first element of the vector.
469 auto *VTy = cast<FixedVectorType>(V->getType());
470 unsigned VWidth = VTy->getNumElements();
471 APInt DemandedElts = APInt::getAllOnes(VWidth);
472 Value *FirstComponent = findScalarElement(V, 0);
473
474 SmallVector<int> ShuffleMask;
475 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
476 SVI->getShuffleMask(ShuffleMask);
477
478 for (int I = VWidth - 1; I > 0; --I) {
479 if (ShuffleMask.empty()) {
480 auto *Elt = findScalarElement(V, I);
481 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
482 break;
483 } else {
484 // Detect identical elements in the shufflevector result, even though
485 // findScalarElement cannot tell us what that element is.
486 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
487 break;
488 }
489 DemandedElts.clearBit(I);
490 }
491
492 return DemandedElts;
493}
494
497 APInt DemandedElts,
498 int DMaskIdx = -1,
499 bool IsLoad = true);
500
501/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
502static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
503 return (SqrtOp->getType()->isFloatTy() &&
504 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
505 SqrtOp->getType()->isHalfTy();
506}
507
508/// Return true if we can easily prove that use U is uniform.
509static bool isTriviallyUniform(const Use &U) {
510 Value *V = U.get();
511 if (isa<Constant>(V))
512 return true;
513 if (const auto *A = dyn_cast<Argument>(V))
515 if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
516 if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
517 return false;
518 // If II and U are in different blocks then there is a possibility of
519 // temporal divergence.
520 return II->getParent() == cast<Instruction>(U.getUser())->getParent();
521 }
522 return false;
523}
524
525/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
526///
527/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
530 unsigned LaneArgIdx) const {
531 unsigned MaskBits = ST->getWavefrontSizeLog2();
532 APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
533
534 KnownBits Known(32);
535 if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
536 return true;
537
538 if (!Known.isConstant())
539 return false;
540
541 // Out of bounds indexes may appear in wave64 code compiled for wave32.
542 // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
543 // manually fix it up.
544
545 Value *LaneArg = II.getArgOperand(LaneArgIdx);
546 Constant *MaskedConst =
547 ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
548 if (MaskedConst != LaneArg) {
549 II.getOperandUse(LaneArgIdx).set(MaskedConst);
550 return true;
551 }
552
553 return false;
554}
555
557 Function &NewCallee, ArrayRef<Value *> Ops) {
559 Old.getOperandBundlesAsDefs(OpBundles);
560
561 CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
562 NewCall->takeName(&Old);
563 return NewCall;
564}
565
566// Return true for sequences of instructions that effectively assign
567// each lane to its thread ID
568static bool isThreadID(const GCNSubtarget &ST, Value *V) {
569 // Case 1:
570 // wave32: mbcnt_lo(-1, 0)
571 // wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
577 if (ST.isWave32() && match(V, W32Pred))
578 return true;
579 if (ST.isWave64() && match(V, W64Pred))
580 return true;
581
582 return false;
583}
584
587 IntrinsicInst &II) const {
588 const auto IID = II.getIntrinsicID();
589 assert(IID == Intrinsic::amdgcn_readlane ||
590 IID == Intrinsic::amdgcn_readfirstlane ||
591 IID == Intrinsic::amdgcn_permlane64);
592
593 Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
594
595 // Only do this if both instructions are in the same block
596 // (so the exec mask won't change) and the readlane is the only user of its
597 // operand.
598 if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
599 return nullptr;
600
601 const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
602
603 // If this is a readlane, check that the second operand is a constant, or is
604 // defined before OpInst so we know it's safe to move this intrinsic higher.
605 Value *LaneID = nullptr;
606 if (IsReadLane) {
607 LaneID = II.getOperand(1);
608
609 // readlane takes an extra operand for the lane ID, so we must check if that
610 // LaneID value can be used at the point where we want to move the
611 // intrinsic.
612 if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
613 if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
614 return nullptr;
615 }
616 }
617
618 // Hoist the intrinsic (II) through OpInst.
619 //
620 // (II (OpInst x)) -> (OpInst (II x))
621 const auto DoIt = [&](unsigned OpIdx,
622 Function *NewIntrinsic) -> Instruction * {
624 if (IsReadLane)
625 Ops.push_back(LaneID);
626
627 // Rewrite the intrinsic call.
628 CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
629
630 // Rewrite OpInst so it takes the result of the intrinsic now.
631 Instruction &NewOp = *OpInst->clone();
632 NewOp.setOperand(OpIdx, NewII);
633 return &NewOp;
634 };
635
636 // TODO(?): Should we do more with permlane64?
637 if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
638 return nullptr;
639
640 if (isa<UnaryOperator>(OpInst))
641 return DoIt(0, II.getCalledFunction());
642
643 if (isa<CastInst>(OpInst)) {
644 Value *Src = OpInst->getOperand(0);
645 Type *SrcTy = Src->getType();
646 if (!isTypeLegal(SrcTy))
647 return nullptr;
648
649 Function *Remangled =
650 Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
651 return DoIt(0, Remangled);
652 }
653
654 // We can also hoist through binary operators if the other operand is uniform.
655 if (isa<BinaryOperator>(OpInst)) {
656 // FIXME: If we had access to UniformityInfo here we could just check
657 // if the operand is uniform.
658 if (isTriviallyUniform(OpInst->getOperandUse(0)))
659 return DoIt(1, II.getCalledFunction());
660 if (isTriviallyUniform(OpInst->getOperandUse(1)))
661 return DoIt(0, II.getCalledFunction());
662 }
663
664 return nullptr;
665}
666
667/// Evaluate V as a function of the lane ID and return its value on Lane, or
668/// std::nullopt if V is not a closed-form expression of the lane ID.
669static std::optional<unsigned> evalLaneExpr(Value *V, unsigned Lane,
670 const GCNSubtarget &ST,
671 const DataLayout &DL,
672 unsigned Depth = 0) {
674 return std::nullopt;
675
676 // Poison/undef in the index expression: bail and let InstCombine fold the
677 // intrinsic the usual way.
678 if (isa<UndefValue>(V))
679 return std::nullopt;
680
681 if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
682 return CI->getZExtValue();
683
684 if (isThreadID(ST, V))
685 return Lane;
686
688 if (!BO)
689 return std::nullopt;
690
691 std::optional<unsigned> LHS =
692 evalLaneExpr(BO->getOperand(0), Lane, ST, DL, Depth + 1);
693 if (!LHS)
694 return std::nullopt;
695 std::optional<unsigned> RHS =
696 evalLaneExpr(BO->getOperand(1), Lane, ST, DL, Depth + 1);
697 if (!RHS)
698 return std::nullopt;
699
700 Type *Ty = BO->getType();
701 Constant *Ops[] = {ConstantInt::get(Ty, *LHS), ConstantInt::get(Ty, *RHS)};
702 auto *CI =
704 return CI ? std::optional<unsigned>(CI->getZExtValue()) : std::nullopt;
705}
706
707/// Build the per-lane shuffle map by evaluating Index for every lane in the
708/// wave. Returns false if any lane index is non-constant or out of range.
709static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST,
711 const DataLayout &DL) {
712 unsigned WaveSize = ST.getWavefrontSize();
713 Ids.resize(WaveSize);
714 for (unsigned Lane : seq(WaveSize)) {
715 std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
716 if (!Val || *Val >= WaveSize)
717 return false;
718 Ids[Lane] = *Val;
719 }
720 return true;
721}
722
723/// Lanes are partitioned into groups of Period; each group is a translated
724/// copy of the first: Ids[I] = Ids[I % Period] + (I & ~(Period - 1)).
725template <unsigned Period>
727 static_assert(isPowerOf2_32(Period), "Period must be a power of two");
728 for (unsigned I = Period, E = Ids.size(); I < E; ++I)
729 if (Ids[I] != Ids[I % Period] + (I & ~(Period - 1)))
730 return false;
731 return true;
732}
733
734/// Match an N-lane row pattern: each lane in [0, N) reads from a source lane
735/// in the same N-lane row, and the pattern repeats periodically across rows.
736template <unsigned N> static bool isRowPattern(ArrayRef<uint8_t> Ids) {
737 for (unsigned I = 0; I < N; ++I)
738 if (Ids[I] >= N)
739 return false;
740 return hasPeriodicLayout<N>(Ids);
741}
742
// Named instantiations of isRowPattern for the three row widths used by the
// matchers in this file: 4 lanes (quad), 8 lanes (half row) and 16 lanes
// (full row).
743static constexpr auto isQuadPattern = isRowPattern<4>;
744static constexpr auto isHalfRowPattern = isRowPattern<8>;
745static constexpr auto isFullRowPattern = isRowPattern<16>;
746
747/// Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp
748/// QUAD_PERM control word: bits[1:0]=Ids[0], [3:2]=Ids[1], [5:4]=Ids[2],
749/// [7:6]=Ids[3].
750static std::optional<unsigned> matchQuadPermPattern(ArrayRef<uint8_t> Ids) {
751 if (!isQuadPattern(Ids))
752 return std::nullopt;
753 return Ids[3] << 6 | Ids[2] << 4 | Ids[1] << 2 | Ids[0];
754}
755
756/// Match an N-lane reversal (mirror) pattern.
757template <unsigned N> static bool matchMirrorPattern(ArrayRef<uint8_t> Ids) {
758 if (!isRowPattern<N>(Ids))
759 return false;
760 for (unsigned J = 0; J < N; ++J)
761 if (Ids[J] != (N - 1) - J)
762 return false;
763 return true;
764}
765
768
769/// Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
770static std::optional<unsigned> matchRowRotatePattern(ArrayRef<uint8_t> Ids) {
771 if (Ids[0] == 0 || !isFullRowPattern(Ids))
772 return std::nullopt;
773 for (unsigned J = 1; J < 16; ++J)
774 if (Ids[J] != (Ids[0] + J) % 16)
775 return std::nullopt;
776 return 16u - Ids[0];
777}
778
779/// Match a row-share pattern: all 16 lanes of each row read the same source
780/// lane. Returns the shared source lane index in [0, 16).
781static std::optional<unsigned> matchRowSharePattern(ArrayRef<uint8_t> Ids) {
782 if (!isFullRowPattern(Ids))
783 return std::nullopt;
784 if (!all_equal(Ids.take_front(16)))
785 return std::nullopt;
786 return Ids[0];
787}
788
789/// Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J,
790/// with Mask in [1, 15].
791static std::optional<unsigned> matchRowXMaskPattern(ArrayRef<uint8_t> Ids) {
792 unsigned Mask = Ids[0];
793 if (Mask == 0 || !isFullRowPattern(Ids))
794 return std::nullopt;
795 for (unsigned J = 0; J < 16; ++J)
796 if (Ids[J] != (Mask ^ J))
797 return std::nullopt;
798 return Mask;
799}
800
801/// Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8
802/// 24-bit selector (three bits per output lane).
803static std::optional<unsigned> matchHalfRowPermPattern(ArrayRef<uint8_t> Ids) {
804 if (!isHalfRowPattern(Ids))
805 return std::nullopt;
806 unsigned Selector = 0;
807 for (unsigned J = 0; J < 8; ++J)
808 Selector |= Ids[J] << (J * 3);
809 return Selector;
810}
811
812/// Pack a 16-lane permutation into a single 64-bit value: four bits per output
813/// lane, lane J in bits [J*4 + 3 : J*4]. The caller splits it into the low and
814/// high 32-bit selector operands of v_permlane16 / v_permlanex16.
816 uint64_t Sel = 0;
817 for (unsigned J = 0; J < 16; ++J)
818 Sel |= static_cast<uint64_t>(Ids[J] & 0xF) << (J * 4);
819 return Sel;
820}
821
822/// Match a half-wave swap: lane J reads from lane J ^ 32. Only meaningful on
823/// wave64 targets.
825 if (Ids.size() != 64)
826 return false;
827 for (unsigned J = 0; J < 64; ++J)
828 if (Ids[J] != (J ^ 32))
829 return false;
830 return true;
831}
832
833/// Match a cross-row permutation suitable for v_permlanex16: every lane in
834/// the low 16-lane half reads from the high half of its own row, and vice
835/// versa.
837 if (!hasPeriodicLayout<32>(Ids))
838 return false;
839 for (unsigned J = 0; J < 16; ++J) {
840 if (Ids[J] < 16 || Ids[J] >= 32)
841 return false;
842 if (Ids[J + 16] != Ids[J] - 16)
843 return false;
844 }
845 return true;
846}
847
848/// Match a DS_SWIZZLE bitmask-mode permutation:
849/// dst_lane = ((src_lane & AND) | OR) ^ XOR
850/// with each mask being five bits. Returns the encoded swizzle immediate.
851/// The hardware applies the formula independently within each 32-lane group,
852/// so on wave64 the high group must replicate the low one (translated by 32).
853static std::optional<unsigned>
855 if (!hasPeriodicLayout<32>(Ids))
856 return std::nullopt;
857
858 // The formula is per-bit: output bit B depends only on input bit B. Probe
859 // each bit with src=0 and src=(1<<B); if the output bit flipped, AND[B]=1
860 // and XOR[B] carries the constant offset; otherwise it is a constant bit
861 // encoded in OR (with AND[B]=0, XOR[B]=0).
862 unsigned AndMask = 0, OrMask = 0, XorMask = 0;
863 for (unsigned B = 0; B < 5; ++B) {
864 unsigned Bit0 = (Ids[0] >> B) & 1;
865 unsigned Bit1 = (Ids[1u << B] >> B) & 1;
866 if (Bit0 != Bit1) {
867 AndMask |= 1u << B;
868 XorMask |= Bit0 << B;
869 } else {
870 OrMask |= Bit0 << B;
871 }
872 }
873
874 // The per-bit derivation assumes bit independence; verify the masks
875 // actually reproduce every lane in the 32-lane group.
876 for (unsigned I : seq(32u)) {
877 unsigned Expected = ((I & AndMask) | OrMask) ^ XorMask;
878 if (Ids[I] != Expected)
879 return std::nullopt;
880 }
881
886}
887
888/// Match a GFX9+ DS_SWIZZLE rotate-mode permutation: a cyclic left-rotation
889/// of all 32 lanes within each 32-lane group by a constant N in [0, 31],
890/// i.e. dst_lane = (src_lane + N) % 32. On wave64, hasPeriodicLayout<32>
891/// ensures both 32-lane groups rotate by the same amount.
892static std::optional<unsigned>
894 if (!hasPeriodicLayout<32>(Ids))
895 return std::nullopt;
896
897 // Determine the rotation amount from lane 0: every lane must read from
898 // lane (I + N) % 32 where N = Ids[0] and 0 <= N <= 31.
899 unsigned N = Ids[0];
900 if (N >= 32)
901 return std::nullopt;
902
903 for (unsigned I = 0; I < 32; ++I)
904 if (Ids[I] != (I + N) % 32)
905 return std::nullopt;
906
909}
910
911/// Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and
912/// bound_ctrl=1 so out-of-bounds lanes are well-defined and the DPP mov can
913/// be folded into a consuming VALU op by GCNDPPCombine.
914static Value *createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl) {
915 Type *Ty = Val->getType();
916 return B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, {Ty},
917 {PoisonValue::get(Ty), Val, B.getInt32(Ctrl),
918 B.getInt32(0xF), B.getInt32(0xF), B.getTrue()});
919}
920
921/// Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
922static Value *createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector) {
923 return B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp8, {Val->getType()},
924 {Val, B.getInt32(Selector)});
925}
926
927/// Emit v_permlane16 with the precomputed lane-select halves.
929 uint32_t Hi) {
930 Type *Ty = Val->getType();
931 return B.CreateIntrinsic(Intrinsic::amdgcn_permlane16, {Ty},
932 {PoisonValue::get(Ty), Val, B.getInt32(Lo),
933 B.getInt32(Hi), B.getFalse(), B.getFalse()});
934}
935
936/// Emit v_permlanex16 with the precomputed lane-select halves. Each output
937/// lane reads from the other 16-lane half of the same row.
939 uint32_t Hi) {
940 Type *Ty = Val->getType();
941 return B.CreateIntrinsic(Intrinsic::amdgcn_permlanex16, {Ty},
942 {PoisonValue::get(Ty), Val, B.getInt32(Lo),
943 B.getInt32(Hi), B.getFalse(), B.getFalse()});
944}
945
946/// Emit ds_swizzle with the given immediate, bitcasting/converting between
947/// pointer/float types and i32 as required by the intrinsic signature.
949 const DataLayout &DL) {
950 Type *OrigTy = Val->getType();
951 assert(DL.getTypeSizeInBits(OrigTy) == 32 &&
952 "ds_swizzle only supports 32-bit operands");
953 IntegerType *I32Ty = B.getInt32Ty();
954 Value *Src = Val;
955 if (OrigTy->isPointerTy())
956 Src = B.CreatePtrToInt(Src, I32Ty);
957 else if (OrigTy != I32Ty)
958 Src = B.CreateBitCast(Src, I32Ty);
959 Value *Result = B.CreateIntrinsic(Intrinsic::amdgcn_ds_swizzle, {},
960 {Src, B.getInt32(Offset)});
961 if (OrigTy->isPointerTy())
962 return B.CreateIntToPtr(Result, OrigTy);
963 if (OrigTy != I32Ty)
964 return B.CreateBitCast(Result, OrigTy);
965 return Result;
966}
967
968/// Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
970 return B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {Val->getType()},
971 {Val});
972}
973
974/// Given a shuffle map, try to emit the best hardware intrinsic.
///
/// \p Ids holds one entry per lane: entry I is the lane that lane I reads
/// from. \p Src is the value being shuffled and \p B is the builder used to
/// emit the replacement.
///
/// Returns \p Src itself for an identity map, nullptr when the map is uniform
/// or when none of the patterns below applies, and otherwise the value built
/// by the first helper whose pattern matches. Patterns are tried strictly in
/// the order written, each one gated on the subtarget feature it requires, so
/// the ordering also acts as the preference order between encodings.
977 const GCNSubtarget &ST,
978 const DataLayout &DL) {
979 // Identity shuffle (every lane reads itself) folds to the source value.
980 if (all_of(enumerate(Ids),
981 [](const auto &E) { return E.value() == E.index(); }))
982 return Src;
983
984 // Uniform shuffle (all lanes read the same value) is handled by cheaper
985 // broadcast/readlane intrinsics.
986 if (all_equal(Ids))
987 return nullptr;
988
// Quad-perm pattern: the matcher's result is used directly as the DPP control
// value when DPP is available.
989 if (std::optional<unsigned> QP = matchQuadPermPattern(Ids)) {
990 if (ST.hasDPP())
991 return createUpdateDpp(B, Src, *QP);
993 }
994
995 if (ST.hasDPP()) {
// The matched amount is biased by one against ROW_ROR_FIRST (presumably
// because ROW_ROR_FIRST encodes a rotate by 1 -- confirm against the DPP
// control encoding).
1000 if (std::optional<unsigned> Amt = matchRowRotatePattern(Ids))
1001 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_ROR_FIRST + *Amt - 1);
1002 }
1003
1004 // row_share is supported on GFX90A and GFX10+; row_xmask is GFX10+ only.
1005 if (ST.hasDPPRowShare()) {
1006 if (std::optional<unsigned> Lane = matchRowSharePattern(Ids))
1007 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_SHARE_FIRST + *Lane);
1008 }
1009
1010 if (ST.hasDPP() && ST.hasGFX10Insts()) {
1011 if (std::optional<unsigned> Mask = matchRowXMaskPattern(Ids))
1012 return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_XMASK_FIRST + *Mask);
1013 }
1014
1015 if (ST.hasDPP8()) {
1016 if (std::optional<unsigned> Sel = matchHalfRowPermPattern(Ids))
1017 return createMovDpp8(B, Src, *Sel);
1018 }
1019
1020 if (ST.hasPermlane16Insts()) {
// In both branches below Sel is split into its low and high 32-bit halves,
// which are passed as two separate arguments to the permlane helper.
1021 if (isFullRowPattern(Ids)) {
1023 return createPermlane16(B, Src, Lo_32(Sel), Hi_32(Sel));
1024 }
1025 // Cross-row shuffles (e.g. XOR 16..31) — covered by permlanex16.
1026 if (isCrossRowPattern(Ids)) {
1028 return createPermlaneX16(B, Src, Lo_32(Sel), Hi_32(Sel));
1029 }
1030 }
1031
1032 // Generic DS_SWIZZLE bitmask-mode fallback: handles any 32-lane shuffle that
1033 // can be expressed as dst = ((src & AND) | OR) ^ XOR with 5-bit masks. This
1034 // is available on every target that has ds_swizzle.
1035 if (std::optional<unsigned> Imm = matchDsSwizzleBitmaskPattern(Ids))
1036 return createDsSwizzle(B, Src, *Imm, DL);
1037
1038 // DS_SWIZZLE rotate mode (GFX9+): handles cyclic 32-lane rotations that
1039 // bitmask mode cannot express (e.g. +1 mod 32 requires inter-bit carry).
1040 if (ST.hasDsSwizzleRotateMode()) {
1041 if (std::optional<unsigned> Imm = matchDsSwizzleRotatePattern(Ids))
1042 return createDsSwizzle(B, Src, *Imm, DL);
1043 }
1044
// Last resort: a half-wave swap, which is only tried after every cheaper
// encoding above has been rejected.
1045 if (ST.hasPermLane64() && matchHalfWaveSwapPattern(Ids))
1046 return createPermlane64(B, Src);
1047
// No supported pattern: tell the caller to leave the shuffle alone.
1048 return nullptr;
1049}
1050
1051/// Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant
1052/// function of the lane ID into a hardware-specific lane permutation intrinsic.
///
/// Only 32-bit results are handled, and only when the subtarget's wave size is
/// known. Returns std::nullopt when the call is left untouched; otherwise
/// returns the result of replacing all uses of \p II with the new value.
1053static std::optional<Instruction *>
1055 const GCNSubtarget &ST) {
1056 const DataLayout &DL = IC.getDataLayout();
1057 if (DL.getTypeSizeInBits(II.getType()) != 32)
1058 return std::nullopt;
1059
1060 if (!ST.isWaveSizeKnown())
1061 return std::nullopt;
1062
1063 unsigned WaveSize = ST.getWavefrontSize();
// The two intrinsics take their operands in opposite order: ds_bpermute is
// (index, data) while wave_shuffle is (data, index).
1064 bool IsBpermute = II.getIntrinsicID() == Intrinsic::amdgcn_ds_bpermute;
1065 Value *Src = II.getArgOperand(IsBpermute ? 1 : 0);
1066 Value *Index = II.getArgOperand(IsBpermute ? 0 : 1);
1067
1069 if (IsBpermute) {
// Evaluate the index expression separately for every lane. The ds_bpermute
// index is the lane number scaled by 4 (presumably a byte offset), so give
// up unless each value is known, has its low two bits clear and names a
// lane inside the wave.
1070 Ids.resize(WaveSize);
1071 for (unsigned Lane : seq(WaveSize)) {
1072 std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
1073 if (!Val || (*Val & 3) || (*Val >> 2) >= WaveSize)
1074 return std::nullopt;
1075 Ids[Lane] = *Val >> 2;
1076 }
1077 } else {
// wave_shuffle indices are used as-is; the helper fills Ids or reports
// failure.
1078 if (!tryBuildShuffleMap(Index, ST, Ids, DL))
1079 return std::nullopt;
1080 }
1081
// A null result covers both "no matching pattern" and the uniform case that
// is deliberately left to other folds.
1082 Value *Result = matchShuffleToHWIntrinsic(IC.Builder, Src, Ids, ST, DL);
1083 if (!Result)
1084 return std::nullopt;
1085
1086 return IC.replaceInstUsesWith(II, Result);
1087}
1088std::optional<Instruction *>
1090 Intrinsic::ID IID = II.getIntrinsicID();
1091 switch (IID) {
1092 case Intrinsic::amdgcn_implicitarg_ptr: {
1093 if (II.getFunction()->hasFnAttribute("amdgpu-no-implicitarg-ptr"))
1094 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1095 uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(*II.getFunction());
1096
1097 uint64_t CurrentOrNullBytes =
1098 II.getAttributes().getRetDereferenceableOrNullBytes();
1099 if (CurrentOrNullBytes != 0) {
1100 // Refine "dereferenceable (A) meets dereferenceable_or_null(B)"
1101 // into dereferenceable(max(A, B))
1102 uint64_t NewBytes = std::max(CurrentOrNullBytes, ImplicitArgBytes);
1103 II.addRetAttr(
1104 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
1105 II.removeRetAttr(Attribute::DereferenceableOrNull);
1106 return &II;
1107 }
1108
1109 uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
1110 uint64_t NewBytes = std::max(CurrentBytes, ImplicitArgBytes);
1111 if (NewBytes != CurrentBytes) {
1112 II.addRetAttr(
1113 Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
1114 return &II;
1115 }
1116
1117 return std::nullopt;
1118 }
1119 case Intrinsic::amdgcn_rcp: {
1120 Value *Src = II.getArgOperand(0);
1121 if (isa<PoisonValue>(Src))
1122 return IC.replaceInstUsesWith(II, Src);
1123
1124 // TODO: Move to ConstantFolding/InstSimplify?
1125 if (isa<UndefValue>(Src)) {
1126 Type *Ty = II.getType();
1127 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1128 return IC.replaceInstUsesWith(II, QNaN);
1129 }
1130
1131 if (II.isStrictFP())
1132 break;
1133
1134 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1135 const APFloat &ArgVal = C->getValueAPF();
1136 APFloat Val(ArgVal.getSemantics(), 1);
1138
1139 // This is more precise than the instruction may give.
1140 //
1141 // TODO: The instruction always flushes denormal results (except for f16),
1142 // should this also?
1143 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
1144 }
1145
1146 FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
1147 if (!FMF.allowContract())
1148 break;
1149 auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
1150 if (!SrcCI)
1151 break;
1152
1153 auto IID = SrcCI->getIntrinsicID();
1154 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
1155 //
1156 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
1157 // relaxed.
1158 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
1159 const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
1160 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
1161 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
1162 break;
1163
1164 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
1165 break;
1166
1168 SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
1169
1170 InnerFMF |= FMF;
1171 II.setFastMathFlags(InnerFMF);
1172
1173 II.setCalledFunction(NewDecl);
1174 return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
1175 }
1176
1177 break;
1178 }
1179 case Intrinsic::amdgcn_sqrt:
1180 case Intrinsic::amdgcn_rsq:
1181 case Intrinsic::amdgcn_tanh: {
1182 Value *Src = II.getArgOperand(0);
1183 if (isa<PoisonValue>(Src))
1184 return IC.replaceInstUsesWith(II, Src);
1185
1186 // TODO: Move to ConstantFolding/InstSimplify?
1187 if (isa<UndefValue>(Src)) {
1188 Type *Ty = II.getType();
1189 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
1190 return IC.replaceInstUsesWith(II, QNaN);
1191 }
1192
1193 // f16 amdgcn.sqrt is identical to regular sqrt.
1194 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
1196 II.getModule(), Intrinsic::sqrt, {II.getType()});
1197 II.setCalledFunction(NewDecl);
1198 return &II;
1199 }
1200
1201 break;
1202 }
1203 case Intrinsic::amdgcn_log:
1204 case Intrinsic::amdgcn_exp2: {
1205 const bool IsLog = IID == Intrinsic::amdgcn_log;
1206 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
1207 Value *Src = II.getArgOperand(0);
1208 Type *Ty = II.getType();
1209
1210 if (isa<PoisonValue>(Src))
1211 return IC.replaceInstUsesWith(II, Src);
1212
1213 if (IC.getSimplifyQuery().isUndefValue(Src))
1215
1216 if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1217 if (C->isInfinity()) {
1218 // exp2(+inf) -> +inf
1219 // log2(+inf) -> +inf
1220 if (!C->isNegative())
1221 return IC.replaceInstUsesWith(II, C);
1222
1223 // exp2(-inf) -> 0
1224 if (IsExp && C->isNegative())
1226 }
1227
1228 if (II.isStrictFP())
1229 break;
1230
1231 if (C->isNaN()) {
1232 Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
1233 return IC.replaceInstUsesWith(II, Quieted);
1234 }
1235
1236 // f32 instruction doesn't handle denormals, f16 does.
1237 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
1238 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
1239 : ConstantFP::get(Ty, 1.0);
1240 return IC.replaceInstUsesWith(II, FoldedValue);
1241 }
1242
1243 if (IsLog && C->isNegative())
1245
1246 // TODO: Full constant folding matching hardware behavior.
1247 }
1248
1249 break;
1250 }
1251 case Intrinsic::amdgcn_frexp_mant:
1252 case Intrinsic::amdgcn_frexp_exp: {
1253 Value *Src = II.getArgOperand(0);
1254 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
1255 int Exp;
1256 APFloat Significand =
1257 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
1258
1259 if (IID == Intrinsic::amdgcn_frexp_mant) {
1260 return IC.replaceInstUsesWith(
1261 II, ConstantFP::get(II.getContext(), Significand));
1262 }
1263
1264 // Match instruction special case behavior.
1265 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
1266 Exp = 0;
1267
1268 return IC.replaceInstUsesWith(II,
1269 ConstantInt::getSigned(II.getType(), Exp));
1270 }
1271
1272 if (isa<PoisonValue>(Src))
1273 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1274
1275 if (isa<UndefValue>(Src)) {
1276 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1277 }
1278
1279 break;
1280 }
1281 case Intrinsic::amdgcn_class: {
1282 Value *Src0 = II.getArgOperand(0);
1283 Value *Src1 = II.getArgOperand(1);
1284 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
1285 if (CMask) {
1286 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
1287 II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
1288
1289 // Clamp any excess bits, as they're illegal for the generic intrinsic.
1290 II.setArgOperand(1, ConstantInt::get(Src1->getType(),
1291 CMask->getZExtValue() & fcAllFlags));
1292 return &II;
1293 }
1294
1295 // Propagate poison.
1296 if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
1297 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1298
1299 // llvm.amdgcn.class(_, undef) -> false
1300 if (IC.getSimplifyQuery().isUndefValue(Src1))
1301 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
1302
1303 // llvm.amdgcn.class(undef, mask) -> mask != 0
1304 if (IC.getSimplifyQuery().isUndefValue(Src0)) {
1305 Value *CmpMask = IC.Builder.CreateICmpNE(
1306 Src1, ConstantInt::getNullValue(Src1->getType()));
1307 return IC.replaceInstUsesWith(II, CmpMask);
1308 }
1309 break;
1310 }
1311 case Intrinsic::amdgcn_cvt_pkrtz: {
1312 auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
1313 Type *HalfTy = Type::getHalfTy(Arg->getContext());
1314
1315 if (isa<PoisonValue>(Arg))
1316 return PoisonValue::get(HalfTy);
1317 if (isa<UndefValue>(Arg))
1318 return UndefValue::get(HalfTy);
1319
1320 ConstantFP *CFP = nullptr;
1321 if (match(Arg, m_ConstantFP(CFP))) {
1322 bool LosesInfo;
1323 APFloat Val(CFP->getValueAPF());
1325 return ConstantFP::get(HalfTy, Val);
1326 }
1327
1328 Value *Src = nullptr;
1329 if (match(Arg, m_FPExt(m_Value(Src)))) {
1330 if (Src->getType()->isHalfTy())
1331 return Src;
1332 }
1333
1334 return nullptr;
1335 };
1336
1337 if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
1338 if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
1339 Value *V = PoisonValue::get(II.getType());
1340 V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
1341 V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
1342 return IC.replaceInstUsesWith(II, V);
1343 }
1344 }
1345
1346 break;
1347 }
1348 case Intrinsic::amdgcn_cvt_pknorm_i16:
1349 case Intrinsic::amdgcn_cvt_pknorm_u16:
1350 case Intrinsic::amdgcn_cvt_pk_i16:
1351 case Intrinsic::amdgcn_cvt_pk_u16: {
1352 Value *Src0 = II.getArgOperand(0);
1353 Value *Src1 = II.getArgOperand(1);
1354
1355 // TODO: Replace call with scalar operation if only one element is poison.
1356 if (isa<PoisonValue>(Src0) && isa<PoisonValue>(Src1))
1357 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1358
1359 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
1360 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1361 }
1362
1363 break;
1364 }
1365 case Intrinsic::amdgcn_cvt_off_f32_i4: {
1366 Value* Arg = II.getArgOperand(0);
1367 Type *Ty = II.getType();
1368
1369 if (isa<PoisonValue>(Arg))
1370 return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));
1371
1372 if(IC.getSimplifyQuery().isUndefValue(Arg))
1374
1375 ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
1376 if (!CArg)
1377 break;
1378
1379 // Tabulated 0.0625 * (sext (CArg & 0xf)).
1380 constexpr size_t ResValsSize = 16;
1381 static constexpr float ResVals[ResValsSize] = {
1382 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375,
1383 -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
1384 Constant *Res =
1385 ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
1386 return IC.replaceInstUsesWith(II, Res);
1387 }
1388 case Intrinsic::amdgcn_ubfe:
1389 case Intrinsic::amdgcn_sbfe: {
1390 // Decompose simple cases into standard shifts.
1391 Value *Src = II.getArgOperand(0);
1392 if (isa<UndefValue>(Src)) {
1393 return IC.replaceInstUsesWith(II, Src);
1394 }
1395
1396 unsigned Width;
1397 Type *Ty = II.getType();
1398 unsigned IntSize = Ty->getIntegerBitWidth();
1399
1400 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
1401 if (CWidth) {
1402 Width = CWidth->getZExtValue();
1403 if ((Width & (IntSize - 1)) == 0) {
1405 }
1406
1407 // Hardware ignores high bits, so remove those.
1408 if (Width >= IntSize) {
1409 return IC.replaceOperand(
1410 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
1411 }
1412 }
1413
1414 unsigned Offset;
1415 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
1416 if (COffset) {
1417 Offset = COffset->getZExtValue();
1418 if (Offset >= IntSize) {
1419 return IC.replaceOperand(
1420 II, 1,
1421 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
1422 }
1423 }
1424
1425 bool Signed = IID == Intrinsic::amdgcn_sbfe;
1426
1427 if (!CWidth || !COffset)
1428 break;
1429
1430 // The case of Width == 0 is handled above, which makes this transformation
1431 // safe. If Width == 0, then the ashr and lshr instructions become poison
1432 // value since the shift amount would be equal to the bit size.
1433 assert(Width != 0);
1434
1435 // TODO: This allows folding to undef when the hardware has specific
1436 // behavior?
1437 if (Offset + Width < IntSize) {
1438 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
1439 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
1440 : IC.Builder.CreateLShr(Shl, IntSize - Width);
1441 RightShift->takeName(&II);
1442 return IC.replaceInstUsesWith(II, RightShift);
1443 }
1444
1445 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
1446 : IC.Builder.CreateLShr(Src, Offset);
1447
1448 RightShift->takeName(&II);
1449 return IC.replaceInstUsesWith(II, RightShift);
1450 }
1451 case Intrinsic::amdgcn_exp:
1452 case Intrinsic::amdgcn_exp_row:
1453 case Intrinsic::amdgcn_exp_compr: {
1454 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
1455 unsigned EnBits = En->getZExtValue();
1456 if (EnBits == 0xf)
1457 break; // All inputs enabled.
1458
1459 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
1460 bool Changed = false;
1461 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
1462 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
1463 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
1464 Value *Src = II.getArgOperand(I + 2);
1465 if (!isa<PoisonValue>(Src)) {
1466 IC.replaceOperand(II, I + 2, PoisonValue::get(Src->getType()));
1467 Changed = true;
1468 }
1469 }
1470 }
1471
1472 if (Changed) {
1473 return &II;
1474 }
1475
1476 break;
1477 }
1478 case Intrinsic::amdgcn_fmed3: {
1479 Value *Src0 = II.getArgOperand(0);
1480 Value *Src1 = II.getArgOperand(1);
1481 Value *Src2 = II.getArgOperand(2);
1482
1483 for (Value *Src : {Src0, Src1, Src2}) {
1484 if (isa<PoisonValue>(Src))
1485 return IC.replaceInstUsesWith(II, Src);
1486 }
1487
1488 if (II.isStrictFP())
1489 break;
1490
1491 // med3 with a nan input acts like
1492 // v_min_f32(v_min_f32(s0, s1), s2)
1493 //
1494 // Signalingness is ignored with ieee=0, so we fold to
1495 // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
1496 // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
1497 // returned signaling nan will not be quieted.
1498
1499 // ieee=1
1500 // s0 snan: s2
1501 // s1 snan: s2
1502 // s2 snan: qnan
1503
1504 // s0 qnan: min(s1, s2)
1505 // s1 qnan: min(s0, s2)
1506 // s2 qnan: min(s0, s1)
1507
1508 // ieee=0
1509 // s0 _nan: min(s1, s2)
1510 // s1 _nan: min(s0, s2)
1511 // s2 _nan: min(s0, s1)
1512
1513 // med3 behavior with infinity
1514 // s0 +inf: max(s1, s2)
1515 // s1 +inf: max(s0, s2)
1516 // s2 +inf: max(s0, s1)
1517 // s0 -inf: min(s1, s2)
1518 // s1 -inf: min(s0, s2)
1519 // s2 -inf: min(s0, s1)
1520
1521 // Checking for NaN before canonicalization provides better fidelity when
1522 // mapping other operations onto fmed3 since the order of operands is
1523 // unchanged.
1524 Value *V = nullptr;
1525 const APFloat *ConstSrc0 = nullptr;
1526 const APFloat *ConstSrc1 = nullptr;
1527 const APFloat *ConstSrc2 = nullptr;
1528
1529 if ((match(Src0, m_APFloat(ConstSrc0)) &&
1530 (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
1531 isa<UndefValue>(Src0)) {
1532 const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
1533 switch (fpenvIEEEMode(II)) {
1534 case KnownIEEEMode::On:
1535 // TODO: If Src2 is snan, does it need quieting?
1536 if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
1537 return IC.replaceInstUsesWith(II, Src2);
1538
1539 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src1, Src2)
1540 : IC.Builder.CreateMinNum(Src1, Src2);
1541 break;
1542 case KnownIEEEMode::Off:
1543 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src1, Src2)
1544 : IC.Builder.CreateMinimumNum(Src1, Src2);
1545 break;
1547 break;
1548 }
1549 } else if ((match(Src1, m_APFloat(ConstSrc1)) &&
1550 (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
1551 isa<UndefValue>(Src1)) {
1552 const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
1553 switch (fpenvIEEEMode(II)) {
1554 case KnownIEEEMode::On:
1555 // TODO: If Src2 is snan, does it need quieting?
1556 if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
1557 return IC.replaceInstUsesWith(II, Src2);
1558
1559 V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src0, Src2)
1560 : IC.Builder.CreateMinNum(Src0, Src2);
1561 break;
1562 case KnownIEEEMode::Off:
1563 V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src0, Src2)
1564 : IC.Builder.CreateMinimumNum(Src0, Src2);
1565 break;
1567 break;
1568 }
1569 } else if ((match(Src2, m_APFloat(ConstSrc2)) &&
1570 (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
1571 isa<UndefValue>(Src2)) {
1572 switch (fpenvIEEEMode(II)) {
1573 case KnownIEEEMode::On:
1574 if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
1575 auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
1576 return IC.replaceInstUsesWith(II, Quieted);
1577 }
1578
1579 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1580 ? IC.Builder.CreateMaxNum(Src0, Src1)
1581 : IC.Builder.CreateMinNum(Src0, Src1);
1582 break;
1583 case KnownIEEEMode::Off:
// With ieee=0 a NaN (or undef) s2 must fold to min(s0, s1), exactly like -inf;
// only +inf selects max(s0, s1). Keying the select on isPosInfinity() keeps
// this arm in line with the table above and with the Src0/Src1 branches.
// Testing isNegInfinity() instead would send a NaN s2 to maximumnum.
1584 V = (ConstSrc2 && ConstSrc2->isPosInfinity())
1585 ? IC.Builder.CreateMaximumNum(Src0, Src1)
1586 : IC.Builder.CreateMinimumNum(Src0, Src1);
1587 break;
1589 break;
1590 }
1591 }
1592
1593 if (V) {
1594 if (auto *CI = dyn_cast<CallInst>(V)) {
1595 CI->copyFastMathFlags(&II);
1596 CI->takeName(&II);
1597 }
1598 return IC.replaceInstUsesWith(II, V);
1599 }
1600
1601 bool Swap = false;
1602 // Canonicalize constants to RHS operands.
1603 //
1604 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
1605 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1606 std::swap(Src0, Src1);
1607 Swap = true;
1608 }
1609
1610 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
1611 std::swap(Src1, Src2);
1612 Swap = true;
1613 }
1614
1615 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
1616 std::swap(Src0, Src1);
1617 Swap = true;
1618 }
1619
1620 if (Swap) {
1621 II.setArgOperand(0, Src0);
1622 II.setArgOperand(1, Src1);
1623 II.setArgOperand(2, Src2);
1624 return &II;
1625 }
1626
1627 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
1628 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
1629 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
1630 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
1631 C2->getValueAPF());
1632 return IC.replaceInstUsesWith(II,
1633 ConstantFP::get(II.getType(), Result));
1634 }
1635 }
1636 }
1637
1638 if (!ST->hasMed3_16())
1639 break;
1640
1641 // Repeat floating-point width reduction done for minnum/maxnum.
1642 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
1643 if (Value *X = matchFPExtFromF16(Src0)) {
1644 if (Value *Y = matchFPExtFromF16(Src1)) {
1645 if (Value *Z = matchFPExtFromF16(Src2)) {
1646 Value *NewCall = IC.Builder.CreateIntrinsic(
1647 IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
1648 return new FPExtInst(NewCall, II.getType());
1649 }
1650 }
1651 }
1652
1653 break;
1654 }
1655 case Intrinsic::amdgcn_icmp:
1656 case Intrinsic::amdgcn_fcmp: {
1657 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
1658 // Guard against invalid arguments.
1659 int64_t CCVal = CC->getZExtValue();
1660 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
1661 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
1662 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
1663 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
1665 break;
1666
1667 Value *Src0 = II.getArgOperand(0);
1668 Value *Src1 = II.getArgOperand(1);
1669
1670 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
1671 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
1673 (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
1674 if (CCmp && CCmp->isNullValue()) {
1675 return IC.replaceInstUsesWith(
1676 II, IC.Builder.CreateSExt(CCmp, II.getType()));
1677 }
1678
1679 // The result of V_ICMP/V_FCMP assembly instructions (which this
1680 // intrinsic exposes) is one bit per thread, masked with the EXEC
1681 // register (which contains the bitmask of live threads). So a
1682 // comparison that always returns true is the same as a read of the
1683 // EXEC register. ballot(true) reads EXEC at the wave-size width, so
1684 // zext/trunc the result to the intrinsic's return type.
1685 Type *WaveTy = IC.Builder.getIntNTy(ST->getWavefrontSize());
1686 Value *Ballot = IC.Builder.CreateIntrinsic(
1687 Intrinsic::amdgcn_ballot, WaveTy, IC.Builder.getTrue());
1688 Value *Result = IC.Builder.CreateZExtOrTrunc(Ballot, II.getType());
1689 return IC.replaceInstUsesWith(II, Result);
1690 }
1691
1692 // Canonicalize constants to RHS.
1693 CmpInst::Predicate SwapPred =
1695 II.setArgOperand(0, Src1);
1696 II.setArgOperand(1, Src0);
1697 II.setArgOperand(
1698 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
1699 return &II;
1700 }
1701
1702 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
1703 break;
1704
1705 // Canonicalize compare eq with true value to compare != 0
1706 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
1707 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
1708 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
1709 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
1710 Value *ExtSrc;
1711 if (CCVal == CmpInst::ICMP_EQ &&
1712 ((match(Src1, PatternMatch::m_One()) &&
1713 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
1714 (match(Src1, PatternMatch::m_AllOnes()) &&
1715 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
1716 ExtSrc->getType()->isIntegerTy(1)) {
1718 IC.replaceOperand(II, 2,
1719 ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
1720 return &II;
1721 }
1722
1723 CmpPredicate SrcPred;
1724 Value *SrcLHS;
1725 Value *SrcRHS;
1726
1727 // Fold compare eq/ne with 0 from a compare result as the predicate to the
1728 // intrinsic. The typical use is a wave vote function in the library, which
1729 // will be fed from a user code condition compared with 0. Fold in the
1730 // redundant compare.
1731
1732 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
1733 // -> llvm.amdgcn.[if]cmp(a, b, pred)
1734 //
1735 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
1736 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
1737 if (match(Src1, PatternMatch::m_Zero()) &&
1739 m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
1740 PatternMatch::m_Value(SrcRHS))))) {
1741 if (CCVal == CmpInst::ICMP_EQ)
1742 SrcPred = CmpInst::getInversePredicate(SrcPred);
1743
1744 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
1745 ? Intrinsic::amdgcn_fcmp
1746 : Intrinsic::amdgcn_icmp;
1747
1748 Type *Ty = SrcLHS->getType();
1749 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
1750 // Promote to next legal integer type.
1751 unsigned Width = CmpType->getBitWidth();
1752 unsigned NewWidth = Width;
1753
1754 // Don't do anything for i1 comparisons.
1755 if (Width == 1)
1756 break;
1757
1758 if (Width <= 16)
1759 NewWidth = 16;
1760 else if (Width <= 32)
1761 NewWidth = 32;
1762 else if (Width <= 64)
1763 NewWidth = 64;
1764 else
1765 break; // Can't handle this.
1766
1767 if (Width != NewWidth) {
1768 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
1769 if (CmpInst::isSigned(SrcPred)) {
1770 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
1771 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
1772 } else {
1773 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
1774 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
1775 }
1776 }
1777 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
1778 break;
1779
1780 Value *Args[] = {SrcLHS, SrcRHS,
1781 ConstantInt::get(CC->getType(), SrcPred)};
1782 Value *NewCall = IC.Builder.CreateIntrinsic(
1783 NewIID, {II.getType(), SrcLHS->getType()}, Args);
1784 NewCall->takeName(&II);
1785 return IC.replaceInstUsesWith(II, NewCall);
1786 }
1787
1788 break;
1789 }
1790 case Intrinsic::amdgcn_mbcnt_hi:
1791 // exec_hi is all 0, so this is just a copy.
1792 if (ST->isWave32())
1793 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1794 [[fallthrough]];
1795 case Intrinsic::amdgcn_mbcnt_lo: {
1796 ConstantRange AccRange =
1797 computeConstantRange(II.getArgOperand(1),
1798 /*ForSigned=*/false, IC.getSimplifyQuery());
1799 if (AccRange.isFullSet())
1800 return nullptr;
1801
1802 // TODO: Can raise lower bound by inspecting first argument.
1803 ConstantRange MbcntRange(APInt(32, 0), APInt(32, 32 + 1));
1804 ConstantRange ComputedRange = AccRange.add(MbcntRange);
1805 if (ComputedRange.isFullSet())
1806 return nullptr;
1807
1808 if (std::optional<ConstantRange> ExistingRange = II.getRange()) {
1809 ComputedRange = ComputedRange.intersectWith(*ExistingRange);
1810 if (ComputedRange == *ExistingRange)
1811 return nullptr;
1812 }
1813
1814 II.addRangeRetAttr(ComputedRange);
1815 return nullptr;
1816 }
1817 case Intrinsic::amdgcn_ballot: {
1818 Value *Arg = II.getArgOperand(0);
1819 if (isa<PoisonValue>(Arg))
1820 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1821
1822 if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
1823 if (Src->isZero()) {
1824 // amdgcn.ballot(i1 0) is zero.
1825 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1826 }
1827 }
1828 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1829 // %b64 = call i64 ballot.i64(...)
1830 // =>
1831 // %b32 = call i32 ballot.i32(...)
1832 // %b64 = zext i32 %b32 to i64
1834 IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1835 {IC.Builder.getInt32Ty()},
1836 {II.getArgOperand(0)}),
1837 II.getType());
1838 Call->takeName(&II);
1839 return IC.replaceInstUsesWith(II, Call);
1840 }
1841 break;
1842 }
1843 case Intrinsic::amdgcn_wavefrontsize: {
1844 if (ST->isWaveSizeKnown())
1845 return IC.replaceInstUsesWith(
1846 II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
1847 break;
1848 }
1849 case Intrinsic::amdgcn_wqm_vote: {
1850 // wqm_vote is identity when the argument is constant.
1851 if (!isa<Constant>(II.getArgOperand(0)))
1852 break;
1853
1854 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1855 }
1856 case Intrinsic::amdgcn_kill: {
1857 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1858 if (!C || !C->getZExtValue())
1859 break;
1860
1861 // amdgcn.kill(i1 1) is a no-op
1862 return IC.eraseInstFromFunction(II);
1863 }
1864 case Intrinsic::amdgcn_s_sendmsg:
1865 case Intrinsic::amdgcn_s_sendmsghalt: {
1866 // The second operand is copied to m0, but is only actually used for
1867 // certain message types. For message types that are known to not use m0,
1868 // fold it to poison.
1869 using namespace AMDGPU::SendMsg;
1870
1871 Value *M0Val = II.getArgOperand(1);
1872 if (isa<PoisonValue>(M0Val))
1873 break;
1874
1875 auto *MsgImm = cast<ConstantInt>(II.getArgOperand(0));
1876 uint16_t MsgId, OpId, StreamId;
1877 decodeMsg(MsgImm->getZExtValue(), MsgId, OpId, StreamId, *ST);
1878
1879 if (!msgDoesNotUseM0(MsgId, *ST))
1880 break;
1881
1882 // Drop UB-implying attributes since we're replacing with poison.
1883 II.dropUBImplyingAttrsAndMetadata();
1884 IC.replaceOperand(II, 1, PoisonValue::get(M0Val->getType()));
1885 return nullptr;
1886 }
1887 case Intrinsic::amdgcn_update_dpp: {
1888 Value *Old = II.getArgOperand(0);
1889
1890 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1891 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1892 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1893 if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
1894 BM->getZExtValue() != 0xF || isa<PoisonValue>(Old))
1895 break;
1896
1897 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1898 return IC.replaceOperand(II, 0, PoisonValue::get(Old->getType()));
1899 }
1900 case Intrinsic::amdgcn_permlane16:
1901 case Intrinsic::amdgcn_permlane16_var:
1902 case Intrinsic::amdgcn_permlanex16:
1903 case Intrinsic::amdgcn_permlanex16_var: {
1904 // Discard vdst_in if it's not going to be read.
1905 Value *VDstIn = II.getArgOperand(0);
1906 if (isa<PoisonValue>(VDstIn))
1907 break;
1908
1909 // FetchInvalid operand idx.
1910 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1911 IID == Intrinsic::amdgcn_permlanex16)
1912 ? 4 /* for permlane16 and permlanex16 */
1913 : 3; /* for permlane16_var and permlanex16_var */
1914
1915 // BoundCtrl operand idx.
1916 // For permlane16 and permlanex16 it should be 5
1917 // For Permlane16_var and permlanex16_var it should be 4
1918 unsigned int BcIdx = FiIdx + 1;
1919
1920 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1921 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1922 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1923 break;
1924
1925 return IC.replaceOperand(II, 0, PoisonValue::get(VDstIn->getType()));
1926 }
1927 case Intrinsic::amdgcn_wave_shuffle:
1928 return tryOptimizeShufflePattern(IC, II, *ST);
1929 case Intrinsic::amdgcn_permlane64:
1930 case Intrinsic::amdgcn_readfirstlane:
1931 case Intrinsic::amdgcn_readlane:
1932 case Intrinsic::amdgcn_ds_bpermute: {
1933 // If the data argument is uniform these intrinsics return it unchanged.
1934 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1935 const Use &Src = II.getArgOperandUse(SrcIdx);
1936 if (isTriviallyUniform(Src))
1937 return IC.replaceInstUsesWith(II, Src.get());
1938
1939 if (IID == Intrinsic::amdgcn_readlane &&
1941 return &II;
1942
1943 // If the lane argument of bpermute is uniform, change it to readlane. This
1944 // generates better code and can enable further optimizations because
1945 // readlane is AlwaysUniform.
1946 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1947 const Use &Lane = II.getArgOperandUse(0);
1948 if (isTriviallyUniform(Lane)) {
1949 Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1951 II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1952 II.setCalledFunction(NewDecl);
1953 II.setOperand(0, Src);
1954 II.setOperand(1, NewLane);
1955 return &II;
1956 }
1957 }
1958
1959 if (IID == Intrinsic::amdgcn_ds_bpermute)
1960 return tryOptimizeShufflePattern(IC, II, *ST);
1961
1963 return Res;
1964
1965 return std::nullopt;
1966 }
1967 case Intrinsic::amdgcn_writelane: {
1968 // TODO: Fold bitcast like readlane.
1969 if (simplifyDemandedLaneMaskArg(IC, II, 1))
1970 return &II;
1971 return std::nullopt;
1972 }
1973 case Intrinsic::amdgcn_trig_preop: {
1974 // The intrinsic is declared with name mangling, but currently the
1975 // instruction only exists for f64
1976 if (!II.getType()->isDoubleTy())
1977 break;
1978
1979 Value *Src = II.getArgOperand(0);
1980 Value *Segment = II.getArgOperand(1);
1981 if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1982 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1983
1984 if (isa<UndefValue>(Segment))
1985 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1986
1987 // Sign bit is not used.
1988 Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Src);
1989 if (StrippedSign != Src)
1990 return IC.replaceOperand(II, 0, StrippedSign);
1991
1992 if (II.isStrictFP())
1993 break;
1994
1995 const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src);
1996 if (!CSrc && !isa<UndefValue>(Src))
1997 break;
1998
1999 // The instruction ignores special cases, and literally just extracts the
2000 // exponents. Fold undef to nan, and index the table as normal.
2001 APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
2002 : APFloat::getQNaN(II.getType()->getFltSemantics())
2003 .bitcastToAPInt();
2004
2005 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
2006 if (!Cseg) {
2007 if (isa<UndefValue>(Src))
2008 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
2009 break;
2010 }
2011
2012 unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52);
2013 unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
2014 unsigned Shift = SegmentVal * 53;
2015 if (Exponent > 1077)
2016 Shift += Exponent - 1077;
2017
2018 // 2.0/PI table.
2019 static const uint32_t TwoByPi[] = {
2020 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
2021 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
2022 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
2023 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
2024 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
2025 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
2026 0x56033046};
2027
2028 // Return 0 for outbound segment (hardware behavior).
2029 unsigned Idx = Shift >> 5;
2030 if (Idx + 2 >= std::size(TwoByPi)) {
2031 APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
2032 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
2033 }
2034
2035 unsigned BShift = Shift & 0x1f;
2036 uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
2037 uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
2038 if (BShift)
2039 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
2040 Thi = Thi >> 11;
2041 APFloat Result = APFloat((double)Thi);
2042
2043 int Scale = -53 - Shift;
2044 if (Exponent >= 1968)
2045 Scale += 128;
2046
2047 Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
2048 return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
2049 }
2050 case Intrinsic::amdgcn_fmul_legacy: {
2051 Value *Op0 = II.getArgOperand(0);
2052 Value *Op1 = II.getArgOperand(1);
2053
2054 for (Value *Src : {Op0, Op1}) {
2055 if (isa<PoisonValue>(Src))
2056 return IC.replaceInstUsesWith(II, Src);
2057 }
2058
2059 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2060 // infinity, gives +0.0.
2061 // TODO: Move to InstSimplify?
2062 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2064 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
2065
2066 // If we can prove we don't have one of the special cases then we can use a
2067 // normal fmul instruction instead.
2068 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2069 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
2070 FMul->takeName(&II);
2071 return IC.replaceInstUsesWith(II, FMul);
2072 }
2073 break;
2074 }
2075 case Intrinsic::amdgcn_fma_legacy: {
2076 Value *Op0 = II.getArgOperand(0);
2077 Value *Op1 = II.getArgOperand(1);
2078 Value *Op2 = II.getArgOperand(2);
2079
2080 for (Value *Src : {Op0, Op1, Op2}) {
2081 if (isa<PoisonValue>(Src))
2082 return IC.replaceInstUsesWith(II, Src);
2083 }
2084
2085 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2086 // infinity, gives +0.0.
2087 // TODO: Move to InstSimplify?
2088 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2090 // It's tempting to just return Op2 here, but that would give the wrong
2091 // result if Op2 was -0.0.
2092 auto *Zero = ConstantFP::getZero(II.getType());
2093 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
2094 FAdd->takeName(&II);
2095 return IC.replaceInstUsesWith(II, FAdd);
2096 }
2097
2098 // If we can prove we don't have one of the special cases then we can use a
2099 // normal fma instead.
2100 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2101 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
2102 II.getModule(), Intrinsic::fma, II.getType()));
2103 return &II;
2104 }
2105 break;
2106 }
2107 case Intrinsic::amdgcn_is_shared:
2108 case Intrinsic::amdgcn_is_private: {
2109 Value *Src = II.getArgOperand(0);
2110 if (isa<PoisonValue>(Src))
2111 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2112 if (isa<UndefValue>(Src))
2113 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
2114
2115 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
2116 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
2117 break;
2118 }
2119 case Intrinsic::amdgcn_make_buffer_rsrc: {
2120 Value *Src = II.getArgOperand(0);
2121 if (isa<PoisonValue>(Src))
2122 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2123 return std::nullopt;
2124 }
2125 case Intrinsic::amdgcn_raw_buffer_store_format:
2126 case Intrinsic::amdgcn_struct_buffer_store_format:
2127 case Intrinsic::amdgcn_raw_tbuffer_store:
2128 case Intrinsic::amdgcn_struct_tbuffer_store:
2129 case Intrinsic::amdgcn_image_store_1d:
2130 case Intrinsic::amdgcn_image_store_1darray:
2131 case Intrinsic::amdgcn_image_store_2d:
2132 case Intrinsic::amdgcn_image_store_2darray:
2133 case Intrinsic::amdgcn_image_store_2darraymsaa:
2134 case Intrinsic::amdgcn_image_store_2dmsaa:
2135 case Intrinsic::amdgcn_image_store_3d:
2136 case Intrinsic::amdgcn_image_store_cube:
2137 case Intrinsic::amdgcn_image_store_mip_1d:
2138 case Intrinsic::amdgcn_image_store_mip_1darray:
2139 case Intrinsic::amdgcn_image_store_mip_2d:
2140 case Intrinsic::amdgcn_image_store_mip_2darray:
2141 case Intrinsic::amdgcn_image_store_mip_3d:
2142 case Intrinsic::amdgcn_image_store_mip_cube: {
2143 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
2144 break;
2145
2146 APInt DemandedElts;
2147 if (ST->hasDefaultComponentBroadcast())
2148 DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
2149 else if (ST->hasDefaultComponentZero())
2150 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
2151 else
2152 break;
2153
2154 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
2155 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
2156 false)) {
2157 return IC.eraseInstFromFunction(II);
2158 }
2159
2160 break;
2161 }
2162 case Intrinsic::amdgcn_prng_b32: {
2163 auto *Src = II.getArgOperand(0);
2164 if (isa<UndefValue>(Src)) {
2165 return IC.replaceInstUsesWith(II, Src);
2166 }
2167 return std::nullopt;
2168 }
2169 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
2170 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
2171 Value *Src0 = II.getArgOperand(0);
2172 Value *Src1 = II.getArgOperand(1);
2173 uint64_t CBSZ = cast<ConstantInt>(II.getArgOperand(3))->getZExtValue();
2174 uint64_t BLGP = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
2175 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2176 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2177
2178 auto getFormatNumRegs = [](unsigned FormatVal) {
2179 switch (FormatVal) {
2182 return 6u;
2184 return 4u;
2187 return 8u;
2188 default:
2189 llvm_unreachable("invalid format value");
2190 }
2191 };
2192
2193 bool MadeChange = false;
2194 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
2195 unsigned Src1NumElts = getFormatNumRegs(BLGP);
2196
2197 // Depending on the used format, fewer registers are required so shrink the
2198 // vector type.
2199 if (Src0Ty->getNumElements() > Src0NumElts) {
2200 Src0 = IC.Builder.CreateExtractVector(
2201 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2202 uint64_t(0));
2203 MadeChange = true;
2204 }
2205
2206 if (Src1Ty->getNumElements() > Src1NumElts) {
2207 Src1 = IC.Builder.CreateExtractVector(
2208 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2209 uint64_t(0));
2210 MadeChange = true;
2211 }
2212
2213 if (!MadeChange)
2214 return std::nullopt;
2215
2216 SmallVector<Value *, 10> Args(II.args());
2217 Args[0] = Src0;
2218 Args[1] = Src1;
2219
2220 Value *NewII = IC.Builder.CreateIntrinsic(
2221 IID, {Src0->getType(), Src1->getType()}, Args, &II);
2222 NewII->takeName(&II);
2223 return IC.replaceInstUsesWith(II, NewII);
2224 }
2225 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
2226 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
2227 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
2228 Value *Src0 = II.getArgOperand(1);
2229 Value *Src1 = II.getArgOperand(3);
2230 unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2231 uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
2232 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2233 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2234
2235 bool MadeChange = false;
2236 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
2237 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
2238
2239 // Depending on the used format, fewer registers are required so shrink the
2240 // vector type.
2241 if (Src0Ty->getNumElements() > Src0NumElts) {
2242 Src0 = IC.Builder.CreateExtractVector(
2243 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2244 IC.Builder.getInt64(0));
2245 MadeChange = true;
2246 }
2247
2248 if (Src1Ty->getNumElements() > Src1NumElts) {
2249 Src1 = IC.Builder.CreateExtractVector(
2250 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2251 IC.Builder.getInt64(0));
2252 MadeChange = true;
2253 }
2254
2255 if (!MadeChange)
2256 return std::nullopt;
2257
2258 SmallVector<Value *, 13> Args(II.args());
2259 Args[1] = Src0;
2260 Args[3] = Src1;
2261
2262 Value *NewII = IC.Builder.CreateIntrinsic(
2263 IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
2264 Args, &II);
2265 NewII->takeName(&II);
2266 return IC.replaceInstUsesWith(II, NewII);
2267 }
2268 }
2269 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
2270 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
2271 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
2272 }
2273 return std::nullopt;
2274}
2275
2276/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
2277///
2278/// For image and buffer loads the vector result of the call is narrowed. For
2279/// image and buffer stores there is no result to narrow, so the vector data
2280/// argument (operand 0) is narrowed instead.
2281/// Note: Only image intrinsic calls without TFE/LWE are supported; calls with
2282/// TFE/LWE have struct returns.
///
/// Bit I of \p DemandedElts says whether element I of that vector is needed.
/// \p DMaskIdx is the argument index of the dmask for image intrinsics; a
/// negative value selects the buffer path. \p IsLoad chooses between narrowing
/// the result type (true) and the type of operand 0 (false).
///
/// Returns nullptr when the call is left in place (apart from a possibly
/// updated dmask operand), a poison vector when no element is demanded, and
/// otherwise the replacement: for loads a vector of the original type rebuilt
/// from the narrowed call, for stores the narrowed call itself.
2285 APInt DemandedElts,
2286 int DMaskIdx, bool IsLoad) {
2287
2288 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
2289 : II.getOperand(0)->getType());
2290 unsigned VWidth = IIVTy->getNumElements();
     // A one-element vector has nothing left to trim.
2291 if (VWidth == 1)
2292 return nullptr;
2293 Type *EltTy = IIVTy->getElementType();
2294
2297
2298 // Assume the arguments are unchanged and later override them, if needed.
2299 SmallVector<Value *, 16> Args(II.args());
2300
2301 if (DMaskIdx < 0) {
2302 // Buffer case.
2303
2304 const unsigned ActiveBits = DemandedElts.getActiveBits();
2305 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
2306
2307 // Start assuming the prefix of elements is demanded, but possibly clear
2308 // some other bits if there are trailing zeros (unused components at front)
2309 // and update offset.
2310 DemandedElts = (1 << ActiveBits) - 1;
2311
2312 if (UnusedComponentsAtFront > 0) {
     // Sentinel: keep the offset operand and the leading elements as they are.
2313 static const unsigned InvalidOffsetIdx = 0xf;
2314
     // Pick the index of the offset argument for the intrinsics whose offset
     // is adjusted below; every other intrinsic gets the sentinel.
2315 unsigned OffsetIdx;
2316 switch (II.getIntrinsicID()) {
2317 case Intrinsic::amdgcn_raw_buffer_load:
2318 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2319 OffsetIdx = 1;
2320 break;
2321 case Intrinsic::amdgcn_s_buffer_load:
2322 // If resulting type is vec3, there is no point in trimming the
2323 // load with updated offset, as the vec3 would most likely be widened to
2324 // vec4 anyway during lowering.
2325 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
2326 OffsetIdx = InvalidOffsetIdx;
2327 else
2328 OffsetIdx = 1;
2329 break;
2330 case Intrinsic::amdgcn_struct_buffer_load:
2331 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2332 OffsetIdx = 2;
2333 break;
2334 default:
2335 // TODO: handle tbuffer* intrinsics.
2336 OffsetIdx = InvalidOffsetIdx;
2337 break;
2338 }
2339
2340 if (OffsetIdx != InvalidOffsetIdx) {
2341 // Clear demanded bits and update the offset.
2342 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
2343 auto *Offset = Args[OffsetIdx];
2344 unsigned SingleComponentSizeInBits =
2345 IC.getDataLayout().getTypeSizeInBits(EltTy);
     // Size in bytes of the leading elements that are being skipped.
2346 unsigned OffsetAdd =
2347 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
2348 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
2349 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
2350 }
2351 }
2352 } else {
2353 // Image case.
2354
     // Only the low four bits of the dmask are considered.
2355 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
2356 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
2357
2358 // dmask 0 has special semantics, do not simplify.
2359 if (DMaskVal == 0)
2360 return nullptr;
2361
2362 // Mask off values that are undefined because the dmask doesn't cover them
2363 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
2364
     // Rebuild the dmask from the bits whose vector element is demanded.
     // OrigLdStIdx counts the set dmask bits seen so far and is used as the
     // index of the matching vector element.
2365 unsigned NewDMaskVal = 0;
2366 unsigned OrigLdStIdx = 0;
2367 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
2368 const unsigned Bit = 1 << SrcIdx;
2369 if (!!(DMaskVal & Bit)) {
2370 if (!!DemandedElts[OrigLdStIdx])
2371 NewDMaskVal |= Bit;
2372 OrigLdStIdx++;
2373 }
2374 }
2375
2376 if (DMaskVal != NewDMaskVal)
2377 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
2378 }
2379
     // Nothing is demanded: the whole vector can be poison.
2380 unsigned NewNumElts = DemandedElts.popcount();
2381 if (!NewNumElts)
2382 return PoisonValue::get(IIVTy);
2383
     // All VWidth elements are still demanded, so the vector cannot shrink. For
     // images, write the (possibly unchanged) dmask back into the call.
2384 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
2385 if (DMaskIdx >= 0)
2386 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
2387 return nullptr;
2388 }
2389
2390 // Validate function argument and return types, extracting overloaded types
2391 // along the way.
2392 SmallVector<Type *, 6> OverloadTys;
2393 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
2394 return nullptr;
2395
     // A single remaining element is handled as a scalar, not a one-element
     // vector. The narrowed type replaces overloaded type 0.
2396 Type *NewTy =
2397 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
2398 OverloadTys[0] = NewTy;
2399
2400 if (!IsLoad) {
     // Store: collect the indices of the demanded elements, in order, and
     // build the narrowed data operand from the original operand 0.
2401 SmallVector<int, 8> EltMask;
2402 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
2403 if (DemandedElts[OrigStoreIdx])
2404 EltMask.push_back(OrigStoreIdx);
2405
2406 if (NewNumElts == 1)
2407 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
2408 else
2409 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
2410 }
2411
2413 II.getIntrinsicID(), OverloadTys, Args);
     // Carry the name, metadata and attributes of the original call over.
2414 NewCall->takeName(&II);
2415 NewCall->copyMetadata(II);
2416 AttributeList OldAttrList = II.getAttributes();
2417 NewCall->setAttributes(OldAttrList);
2418
2419 if (IsLoad) {
     // Load: widen the narrowed result back to the original vector type.
2420 if (NewNumElts == 1) {
     // The scalar goes into the lane of the only demanded element.
2421 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
2422 DemandedElts.countr_zero());
2423 }
2424
     // Demanded elements take consecutive indices into NewCall. Every other
     // element gets the index NewNumElts, which no demanded element uses.
2425 SmallVector<int, 8> EltMask;
2426 unsigned NewLoadIdx = 0;
2427 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
2428 if (!!DemandedElts[OrigLoadIdx])
2429 EltMask.push_back(NewLoadIdx++);
2430 else
2431 EltMask.push_back(NewNumElts);
2432 }
2433
2434 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
2435
2436 return Shuffle;
2437 }
2438
     // Store: the narrowed call is the result.
2439 return NewCall;
2440}
2441
2443 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2444 APInt &UndefElts) const {
     // Narrow a vector-typed intrinsic call to the contiguous run of elements
     // that spans every demanded element. The intrinsic is re-declared for the
     // narrower type, called on the matching slice of operand 0, and its result
     // is placed back at the original positions of a vector of the old type.
     // Returns nullptr when the result is not a fixed vector, when the run
     // already covers the whole vector, or when the narrowed type is not legal.
2445 auto *VT = dyn_cast<FixedVectorType>(II.getType());
2446 if (!VT)
2447 return nullptr;
2448
     // [FirstElt, LastElt] is the smallest contiguous range holding all demanded
     // elements; MaskLen is its length.
     // NOTE(review): with no bit set in DemandedElts, getActiveBits() - 1 would
     // wrap around -- presumably callers never pass an empty set; confirm.
2449 const unsigned FirstElt = DemandedElts.countr_zero();
2450 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2451 const unsigned MaskLen = LastElt - FirstElt + 1;
2452
     // The range spans the whole vector, so there is nothing to narrow. A
     // one-element vector is let through and turned into a scalar call below.
2453 unsigned OldNumElts = VT->getNumElements();
2454 if (MaskLen == OldNumElts && MaskLen != 1)
2455 return nullptr;
2456
2457 Type *EltTy = VT->getElementType();
2458 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
2459
2460 // Theoretically we should support these intrinsics for any legal type. Avoid
2461 // introducing cases that aren't direct register types like v3i16.
2462 if (!isTypeLegal(NewVT))
2463 return nullptr;
2464
2465 Value *Src = II.getArgOperand(0);
2466
2467 // Make sure convergence tokens are preserved.
2468 // TODO: CreateIntrinsic should allow directly copying bundles
2470 II.getOperandBundlesAsDefs(OpBundles);
2471
2473 Function *Remangled =
2474 Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
2475
2476 if (MaskLen == 1) {
     // Scalar case: extract the one demanded element, call the scalar form,
     // and insert the result into a poison vector at the same index.
2477 Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
2478
2479 // TODO: Preserve callsite attributes?
2480 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2481
2482 return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
2483 NewCall, FirstElt);
2484 }
2485
     // Slice the source: lane I of the narrow vector reads element FirstElt + I
     // when that element is demanded and keeps the -1 mask entry otherwise.
2486 SmallVector<int> ExtractMask(MaskLen, -1);
2487 for (unsigned I = 0; I != MaskLen; ++I) {
2488 if (DemandedElts[FirstElt + I])
2489 ExtractMask[I] = FirstElt + I;
2490 }
2491
2492 Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
2493
2494 // TODO: Preserve callsite attributes?
2495 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2496
     // Inverse mapping: demanded element FirstElt + I of the old-width result
     // reads lane I of the narrow call; all other lanes keep the -1 mask entry.
2497 SmallVector<int> InsertMask(OldNumElts, -1);
2498 for (unsigned I = 0; I != MaskLen; ++I) {
2499 if (DemandedElts[FirstElt + I])
2500 InsertMask[FirstElt + I] = I;
2501 }
2502
2503 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2504 // call behind.
2505 return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
2506}
2507
2509 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2510 APInt &UndefElts2, APInt &UndefElts3,
2511 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2512 SimplifyAndSetOp) const {
     // Dispatch demanded-vector-element simplification by intrinsic ID:
     // readfirstlane goes to the lane helper, buffer loads and dmask image
     // intrinsics go to the memory helper. Any other intrinsic yields
     // std::nullopt.
2513 switch (II.getIntrinsicID()) {
2514 case Intrinsic::amdgcn_readfirstlane:
     // Hand operand 0 to the callback with the same demanded elements before
     // trying to narrow the call itself (presumably this simplifies the
     // operand in place -- confirm against the caller).
2515 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2516 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
2517 case Intrinsic::amdgcn_raw_buffer_load:
2518 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2519 case Intrinsic::amdgcn_raw_buffer_load_format:
2520 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2521 case Intrinsic::amdgcn_raw_tbuffer_load:
2522 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2523 case Intrinsic::amdgcn_s_buffer_load:
2524 case Intrinsic::amdgcn_struct_buffer_load:
2525 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2526 case Intrinsic::amdgcn_struct_buffer_load_format:
2527 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2528 case Intrinsic::amdgcn_struct_tbuffer_load:
2529 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
     // Buffer loads: the helper's DMaskIdx and IsLoad arguments are left at
     // their defaults.
2530 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2531 default: {
     // Intrinsics found in the image dmask table pass 0 as the dmask argument
     // index.
2532 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
2533 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
2534 }
2535 break;
2536 }
2537 }
2538 return std::nullopt;
2539}
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * createPermlane16(IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
Emit v_permlane16 with the precomputed lane-select halves.
static std::optional< unsigned > matchRowSharePattern(ArrayRef< uint8_t > Ids)
Match a row-share pattern: all 16 lanes of each row read the same source lane.
static bool matchMirrorPattern(ArrayRef< uint8_t > Ids)
Match an N-lane reversal (mirror) pattern.
static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST, SmallVectorImpl< uint8_t > &Ids, const DataLayout &DL)
Build the per-lane shuffle map by evaluating Index for every lane in the wave.
static std::optional< unsigned > matchQuadPermPattern(ArrayRef< uint8_t > Ids)
Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp QUAD_PERM control word: bits[1:0]=Ids...
static std::optional< unsigned > matchDsSwizzleRotatePattern(ArrayRef< uint8_t > Ids)
Match a GFX9+ DS_SWIZZLE rotate-mode permutation: a cyclic left-rotation of all 32 lanes within each ...
static std::optional< unsigned > matchHalfRowPermPattern(ArrayRef< uint8_t > Ids)
Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8 24-bit selector (three bits per ...
static std::optional< unsigned > matchRowXMaskPattern(ArrayRef< uint8_t > Ids)
Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J, with Mask in [1,...
static constexpr auto matchHalfRowMirrorPattern
static Value * createPermlaneX16(IRBuilderBase &B, Value *Val, uint32_t Lo, uint32_t Hi)
Emit v_permlanex16 with the precomputed lane-select halves.
static bool isRowPattern(ArrayRef< uint8_t > Ids)
Match an N-lane row pattern: each lane in [0, N) reads from a source lane in the same N-lane row,...
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp)
Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool isTriviallyUniform(const Use &U)
Return true if we can easily prove that use U is uniform.
static CallInst * rewriteCall(IRBuilderBase &B, CallInst &Old, Function &NewCallee, ArrayRef< Value * > Ops)
static Value * convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder)
static constexpr auto isFullRowPattern
static constexpr auto isQuadPattern
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, Instruction *I)
static uint64_t computePermlane16Masks(ArrayRef< uint8_t > Ids)
Pack a 16-lane permutation into a single 64-bit value: four bits per output lane, lane J in bits [J*4...
static bool matchHalfWaveSwapPattern(ArrayRef< uint8_t > Ids)
Match a half-wave swap: lane J reads from lane J ^ 32.
static bool hasPeriodicLayout(ArrayRef< uint8_t > Ids)
Lanes are partitioned into groups of Period; each group is a translated copy of the first: Ids[I] = I...
static std::optional< Instruction * > tryOptimizeShufflePattern(InstCombiner &IC, IntrinsicInst &II, const GCNSubtarget &ST)
Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant function of the lane ID into a ...
static constexpr auto isHalfRowPattern
static APInt defaultComponentBroadcast(Value *V)
static std::optional< unsigned > matchDsSwizzleBitmaskPattern(ArrayRef< uint8_t > Ids)
Match a DS_SWIZZLE bitmask-mode permutation: dst_lane = ((src_lane & AND) | OR) ^ XOR with each mask ...
static Value * createDsSwizzle(IRBuilderBase &B, Value *Val, unsigned Offset, const DataLayout &DL)
Emit ds_swizzle with the given immediate, bitcasting/converting between pointer/float types and i32 a...
static std::optional< Instruction * > modifyIntrinsicCall(IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr, InstCombiner &IC, std::function< void(SmallVectorImpl< Value * > &, SmallVectorImpl< Type * > &)> Func)
Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with modified arguments (based on ...
static Value * matchShuffleToHWIntrinsic(IRBuilderBase &B, Value *Src, ArrayRef< uint8_t > Ids, const GCNSubtarget &ST, const DataLayout &DL)
Given a shuffle map, try to emit the best hardware intrinsic.
static std::optional< unsigned > matchRowRotatePattern(ArrayRef< uint8_t > Ids)
Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
static bool isCrossRowPattern(ArrayRef< uint8_t > Ids)
Match a cross-row permutation suitable for v_permlanex16: every lane in the low 16-lane half reads fr...
static bool isThreadID(const GCNSubtarget &ST, Value *V)
static Value * createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl)
Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and bound_ctrl=1 so out-of-bounds...
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, const APFloat &Src2)
static Value * simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, int DMaskIdx=-1, bool IsLoad=true)
Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
static std::optional< Instruction * > simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, IntrinsicInst &II, InstCombiner &IC)
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat)
static Value * createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector)
Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
static Value * matchFPExtFromF16(Value *Arg)
Match an fpext from half to float, or a constant we can convert.
static constexpr auto matchFullRowMirrorPattern
static std::optional< unsigned > evalLaneExpr(Value *V, unsigned Lane, const GCNSubtarget &ST, const DataLayout &DL, unsigned Depth=0)
Evaluate V as a function of the lane ID and return its value on Lane, or std::nullopt if V is not a c...
static Value * createPermlane64(IRBuilderBase &B, Value *Val)
Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
This file defines a TargetTransformInfoImplBase-conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
This file provides the interface for the instcombine pass implementation.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define I(x, y, z)
Definition MD5.cpp:57
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
Provides some synthesis utilities to produce sequences of values.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Value * RHS
Value * LHS
static constexpr roundingMode rmTowardZero
Definition APFloat.h:349
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:345
static const fltSemantics & IEEEhalf()
Definition APFloat.h:295
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1185
opStatus divide(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1273
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5901
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1509
bool isPosInfinity() const
Definition APFloat.h:1557
const fltSemantics & getSemantics() const
Definition APFloat.h:1552
APFloat makeQuiet() const
Assuming this is an IEEE-754 NaN value, quiet its signaling bit.
Definition APFloat.h:1381
bool isNaN() const
Definition APFloat.h:1542
bool isSignaling() const
Definition APFloat.h:1546
APInt bitcastToAPInt() const
Definition APFloat.h:1436
bool isNegInfinity() const
Definition APFloat.h:1558
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1144
bool isInfinity() const
Definition APFloat.h:1541
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1429
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:521
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
bool isMask(unsigned numBits) const
Definition APInt.h:489
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:218
size_t size() const
Get the array size.
Definition ArrayRef.h:141
static LLVM_ABI Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
bool isTypeLegal(Type *Ty) const override
LLVM_ABI void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
void setAttributes(AttributeList A)
Set the attributes for this call.
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
AttributeList getAttributes() const
Return the attributes for this call.
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_NE
not equal
Definition InstrTypes.h:762
bool isSigned() const
Definition InstrTypes.h:993
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:890
bool isFPPredicate() const
Definition InstrTypes.h:845
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
const APFloat & getValueAPF() const
Definition Constants.h:463
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
static LLVM_ABI ConstantFP * getNaN(Type *Ty, bool Negative=false, uint64_t Payload=0)
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange add(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an addition of a value in this ran...
LLVM_ABI bool isFullSet() const
Return true if this set contains all of the elements possible for this data-type.
LLVM_ABI ConstantRange intersectWith(const ConstantRange &CR, PreferredRangeType Type=Smallest) const
Return the range that results from the intersection of this range with another range.
This is an important base class in LLVM.
Definition Constant.h:43
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition Constant.h:64
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Tagged union holding either a T or a Error.
Definition Error.h:485
This class represents an extension of floating point types.
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
bool hasApproxFunc() const
Test if this operation allows approximations of math library functions or intrinsics.
Definition Operator.h:288
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool allowContract() const
Definition FMF.h:69
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II, unsigned LaneAgIdx) const
Simplify a lane index operand (e.g.
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
Instruction * hoistLaneIntrinsicThroughOperand(InstCombiner &IC, IntrinsicInst &II) const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know that the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
Value * simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts, APInt &UndefElts) const
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
LLVM_ABI CallInst * CreateIntrinsicWithoutFolding(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2669
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2657
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2180
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:509
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2174
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1584
Value * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition IRBuilder.h:1164
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2420
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
Value * CreateMaxNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the maxnum intrinsic.
Definition IRBuilder.h:1095
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1563
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2162
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2691
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
Value * CreateMaximumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the maximumnum intrinsic.
Definition IRBuilder.h:1123
Value * CreateMinNum(Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create call to the minnum intrinsic.
Definition IRBuilder.h:1083
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1474
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2595
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFAddFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1693
Value * CreateMinimumNum(Value *LHS, Value *RHS, const Twine &Name="")
Create call to the minimumnum intrinsic.
Definition IRBuilder.h:1117
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1603
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1731
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2900
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
IRBuilder< TargetFolder, IRBuilderInstCombineInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
static Value * stripSignOnlyFPOps(Value *Val)
Ignore all operations which only change the sign of a value, returning the underlying magnitude value...
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
const SimplifyQuery & getSimplifyQuery() const
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:308
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:284
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:163
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:400
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READONLY const MIMGOffsetMappingInfo * getMIMGOffsetMappingInfo(unsigned Offset)
uint8_t wmmaScaleF8F6F4FormatToNumRegs(unsigned Fmt)
const ImageDimIntrinsicInfo * getImageDimIntrinsicByBaseOpcode(unsigned BaseOpcode, unsigned Dim)
LLVM_READONLY const MIMGMIPMappingInfo * getMIMGMIPMappingInfo(unsigned MIP)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY const MIMGBiasMappingInfo * getMIMGBiasMappingInfo(unsigned Bias)
LLVM_READONLY const MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI bool isSignatureValid(Intrinsic::ID ID, FunctionType *FT, SmallVectorImpl< Type * > &OverloadTys, raw_ostream &OS=nulls())
Returns true if FT is a valid function type for intrinsic ID.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
auto m_Cmp()
Matches any compare instruction and ignores it.
bool match(Val *V, const Pattern &P)
cstfp_pred_ty< is_any_zero_fp > m_AnyZeroFP()
Match a floating-point negative zero or positive zero.
ap_match< APFloat > m_APFloat(const APFloat *&Res)
Match a ConstantFP or splatted ConstantVector, binding the specified pointer to the contained APFloat...
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
CastInst_match< OpTy, FPExtInst > m_FPExt(const OpTy &Op)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_finitenonzero > m_FiniteNonZero()
Match a finite non-zero FP constant.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_ConstantFP()
Match an arbitrary ConstantFP and ignore it.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Constant * ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS, Constant *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, const Instruction *I=nullptr)
Attempt to constant fold a compare instruction (icmp/fcmp) with the specified operands.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
APFloat frexp(const APFloat &X, int &Exp, APFloat::roundingMode RM)
Equivalent of C standard library function.
Definition APFloat.h:1658
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
LLVM_READONLY APFloat maxnum(const APFloat &A, const APFloat &B)
Implements IEEE-754 2008 maxNum semantics.
Definition APFloat.h:1701
constexpr unsigned MaxAnalysisRecursionDepth
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
APFloat scalbn(APFloat X, int Exp, APFloat::roundingMode RM)
Returns: X * 2^Exp for integral exponents.
Definition APFloat.h:1646
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
@ FMul
Product of floats.
@ FAdd
Sum of floats.
LLVM_ABI Value * findScalarElement(Value *V, unsigned EltNo)
Given a vector and an element number, see if the scalar value is already around as a register,...
@ NearestTiesToEven
roundTiesToEven.
LLVM_ABI bool isKnownNeverInfOrNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point value can never contain a NaN or infinity.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
LLVM_ABI Constant * ConstantFoldInstOperands(const Instruction *I, ArrayRef< Constant * > Ops, const DataLayout &DL, const TargetLibraryInfo *TLI=nullptr, bool AllowNonDeterministic=true)
ConstantFoldInstOperands - Attempt to constant fold an instruction with the specified operands.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
Definition MathExtras.h:160
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, const SimplifyQuery &SQ, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:58
SimplifyQuery getWithInstruction(const Instruction *I) const
LLVM_ABI bool isUndefValue(Value *V) const
If CanUseUndef is true, returns whether V is undef.