File: llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
Warning: line 237, column 16: The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
1 | //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | ||||
9 | #include "ARMTargetTransformInfo.h" | |||
10 | #include "ARMSubtarget.h" | |||
11 | #include "MCTargetDesc/ARMAddressingModes.h" | |||
12 | #include "llvm/ADT/APInt.h" | |||
13 | #include "llvm/ADT/SmallVector.h" | |||
14 | #include "llvm/Analysis/LoopInfo.h" | |||
15 | #include "llvm/CodeGen/CostTable.h" | |||
16 | #include "llvm/CodeGen/ISDOpcodes.h" | |||
17 | #include "llvm/CodeGen/ValueTypes.h" | |||
18 | #include "llvm/IR/BasicBlock.h" | |||
19 | #include "llvm/IR/DataLayout.h" | |||
20 | #include "llvm/IR/DerivedTypes.h" | |||
21 | #include "llvm/IR/Instruction.h" | |||
22 | #include "llvm/IR/Instructions.h" | |||
23 | #include "llvm/IR/Intrinsics.h" | |||
24 | #include "llvm/IR/IntrinsicInst.h" | |||
25 | #include "llvm/IR/IntrinsicsARM.h" | |||
26 | #include "llvm/IR/PatternMatch.h" | |||
27 | #include "llvm/IR/Type.h" | |||
28 | #include "llvm/MC/SubtargetFeature.h" | |||
29 | #include "llvm/Support/Casting.h" | |||
30 | #include "llvm/Support/KnownBits.h" | |||
31 | #include "llvm/Support/MachineValueType.h" | |||
32 | #include "llvm/Target/TargetMachine.h" | |||
33 | #include "llvm/Transforms/InstCombine/InstCombiner.h" | |||
34 | #include "llvm/Transforms/Utils/Local.h" | |||
35 | #include "llvm/Transforms/Utils/LoopUtils.h" | |||
36 | #include <algorithm> | |||
37 | #include <cassert> | |||
38 | #include <cstdint> | |||
39 | #include <utility> | |||
40 | ||||
41 | using namespace llvm; | |||
42 | ||||
43 | #define DEBUG_TYPE "armtti" | |||
44 | ||||
45 | static cl::opt<bool> EnableMaskedLoadStores( | |||
46 | "enable-arm-maskedldst", cl::Hidden, cl::init(true), | |||
47 | cl::desc("Enable the generation of masked loads and stores")); | |||
48 | ||||
49 | static cl::opt<bool> DisableLowOverheadLoops( | |||
50 | "disable-arm-loloops", cl::Hidden, cl::init(false), | |||
51 | cl::desc("Disable the generation of low-overhead loops")); | |||
52 | ||||
53 | static cl::opt<bool> | |||
54 | AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), | |||
55 | cl::desc("Enable the generation of WLS loops")); | |||
56 | ||||
57 | extern cl::opt<TailPredication::Mode> EnableTailPredication; | |||
58 | ||||
59 | extern cl::opt<bool> EnableMaskedGatherScatters; | |||
60 | ||||
61 | extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor; | |||
62 | ||||
63 | /// Convert a vector load intrinsic into a simple llvm load instruction. | |||
64 | /// This is beneficial when the underlying object being addressed comes | |||
65 | /// from a constant, since we get constant-folding for free. | |||
66 | static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, | |||
67 | InstCombiner::BuilderTy &Builder) { | |||
68 | auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1)); | |||
69 | ||||
70 | if (!IntrAlign) | |||
71 | return nullptr; | |||
72 | ||||
73 | unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign | |||
74 | ? MemAlign | |||
75 | : IntrAlign->getLimitedValue(); | |||
76 | ||||
77 | if (!isPowerOf2_32(Alignment)) | |||
78 | return nullptr; | |||
79 | ||||
80 | auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), | |||
81 | PointerType::get(II.getType(), 0)); | |||
82 | return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); | |||
83 | } | |||
84 | ||||
85 | bool ARMTTIImpl::areInlineCompatible(const Function *Caller, | |||
86 | const Function *Callee) const { | |||
87 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
88 | const FeatureBitset &CallerBits = | |||
89 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | |||
90 | const FeatureBitset &CalleeBits = | |||
91 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | |||
92 | ||||
93 | // To inline a callee, all features not in the allowed list must match exactly. | |||
94 | bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) == | |||
95 | (CalleeBits & ~InlineFeaturesAllowed); | |||
96 | // For features in the allowed list, the callee's features must be a subset of | |||
97 | // the caller's. | |||
98 | bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) == | |||
99 | (CalleeBits & InlineFeaturesAllowed); | |||
100 | return MatchExact && MatchSubset; | |||
101 | } | |||
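// A worked illustration of the two checks above (the feature names are chosen
// for the example, not taken from InlineFeaturesAllowed itself): suppose the
// allowed set is {+neon}. A callee built with {+neon, +thumb2} may inline into
// a caller built with {+neon, +vfp4, +thumb2} only if the non-allowed bits
// match exactly ({+thumb2} vs {+vfp4, +thumb2} -- here they do not, so
// MatchExact fails) and the callee's allowed bits ({+neon}) are a subset of
// the caller's ({+neon} -- MatchSubset holds).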
102 | ||||
103 | TTI::AddressingModeKind | |||
104 | ARMTTIImpl::getPreferredAddressingMode(const Loop *L, | |||
105 | ScalarEvolution *SE) const { | |||
106 | if (ST->hasMVEIntegerOps()) | |||
107 | return TTI::AMK_PostIndexed; | |||
108 | ||||
109 | if (L->getHeader()->getParent()->hasOptSize()) | |||
110 | return TTI::AMK_None; | |||
111 | ||||
112 | if (ST->isMClass() && ST->isThumb2() && | |||
113 | L->getNumBlocks() == 1) | |||
114 | return TTI::AMK_PreIndexed; | |||
115 | ||||
116 | return TTI::AMK_None; | |||
117 | } | |||
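// For illustration (assembly forms assumed from the architecture, not derived
// from this file): AMK_PostIndexed corresponds to MVE loops stepping pointers
// with forms like "vldrw.u32 q0, [r0], #16", while AMK_PreIndexed maps to
// Thumb2 writeback forms like "ldr r1, [r0, #4]!".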
118 | ||||
119 | Optional<Instruction *> | |||
120 | ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { | |||
121 | using namespace PatternMatch; | |||
122 | Intrinsic::ID IID = II.getIntrinsicID(); | |||
123 | switch (IID) { | |||
124 | default: | |||
125 | break; | |||
126 | case Intrinsic::arm_neon_vld1: { | |||
127 | Align MemAlign = | |||
128 | getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, | |||
129 | &IC.getAssumptionCache(), &IC.getDominatorTree()); | |||
130 | if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) { | |||
131 | return IC.replaceInstUsesWith(II, V); | |||
132 | } | |||
133 | break; | |||
134 | } | |||
135 | ||||
136 | case Intrinsic::arm_neon_vld2: | |||
137 | case Intrinsic::arm_neon_vld3: | |||
138 | case Intrinsic::arm_neon_vld4: | |||
139 | case Intrinsic::arm_neon_vld2lane: | |||
140 | case Intrinsic::arm_neon_vld3lane: | |||
141 | case Intrinsic::arm_neon_vld4lane: | |||
142 | case Intrinsic::arm_neon_vst1: | |||
143 | case Intrinsic::arm_neon_vst2: | |||
144 | case Intrinsic::arm_neon_vst3: | |||
145 | case Intrinsic::arm_neon_vst4: | |||
146 | case Intrinsic::arm_neon_vst2lane: | |||
147 | case Intrinsic::arm_neon_vst3lane: | |||
148 | case Intrinsic::arm_neon_vst4lane: { | |||
149 | Align MemAlign = | |||
150 | getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, | |||
151 | &IC.getAssumptionCache(), &IC.getDominatorTree()); | |||
152 | unsigned AlignArg = II.getNumArgOperands() - 1; | |||
153 | Value *AlignArgOp = II.getArgOperand(AlignArg); | |||
154 | MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue(); | |||
155 | if (Align && *Align < MemAlign) { | |||
156 | return IC.replaceOperand( | |||
157 | II, AlignArg, | |||
158 | ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(), | |||
159 | false)); | |||
160 | } | |||
161 | break; | |||
162 | } | |||
163 | ||||
164 | case Intrinsic::arm_mve_pred_i2v: { | |||
165 | Value *Arg = II.getArgOperand(0); | |||
166 | Value *ArgArg; | |||
167 | if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( | |||
168 | PatternMatch::m_Value(ArgArg))) && | |||
169 | II.getType() == ArgArg->getType()) { | |||
170 | return IC.replaceInstUsesWith(II, ArgArg); | |||
171 | } | |||
172 | Constant *XorMask; | |||
173 | if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( | |||
174 | PatternMatch::m_Value(ArgArg)), | |||
175 | PatternMatch::m_Constant(XorMask))) && | |||
176 | II.getType() == ArgArg->getType()) { | |||
177 | if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { | |||
178 | if (CI->getValue().trunc(16).isAllOnesValue()) { | |||
179 | auto TrueVector = IC.Builder.CreateVectorSplat( | |||
180 | cast<FixedVectorType>(II.getType())->getNumElements(), | |||
181 | IC.Builder.getTrue()); | |||
182 | return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); | |||
183 | } | |||
184 | } | |||
185 | } | |||
186 | KnownBits ScalarKnown(32); | |||
187 | if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16), | |||
188 | ScalarKnown, 0)) { | |||
189 | return &II; | |||
190 | } | |||
191 | break; | |||
192 | } | |||
193 | case Intrinsic::arm_mve_pred_v2i: { | |||
194 | Value *Arg = II.getArgOperand(0); | |||
195 | Value *ArgArg; | |||
196 | if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>( | |||
197 | PatternMatch::m_Value(ArgArg)))) { | |||
198 | return IC.replaceInstUsesWith(II, ArgArg); | |||
199 | } | |||
200 | if (!II.getMetadata(LLVMContext::MD_range)) { | |||
201 | Type *IntTy32 = Type::getInt32Ty(II.getContext()); | |||
202 | Metadata *M[] = { | |||
203 | ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), | |||
204 | ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0x10000))}; | |||
205 | II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); | |||
206 | return &II; | |||
207 | } | |||
208 | break; | |||
209 | } | |||
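// For reference (illustrative IR, matching the constants used above): the
// attached metadata reads !range !{i32 0, i32 65536}, i.e. the v2i result is
// known to fit in the low 16 bits.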
210 | case Intrinsic::arm_mve_vadc: | |||
211 | case Intrinsic::arm_mve_vadc_predicated: { | |||
212 | unsigned CarryOp = | |||
213 | (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; | |||
214 | assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && | |||
215 | "Bad type for intrinsic!"); | |||
216 | ||||
217 | KnownBits CarryKnown(32); | |||
218 | if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29), | |||
219 | CarryKnown)) { | |||
220 | return &II; | |||
221 | } | |||
222 | break; | |||
223 | } | |||
224 | case Intrinsic::arm_mve_vmldava: { | |||
225 | Instruction *I = cast<Instruction>(&II); | |||
226 | if (I->hasOneUse()) { | |||
227 | auto *User = cast<Instruction>(*I->user_begin()); | |||
228 | Value *OpZ; | |||
229 | if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) && | |||
230 | match(I->getOperand(3), m_Zero())) { | |||
231 | Value *OpX = I->getOperand(4); | |||
232 | Value *OpY = I->getOperand(5); | |||
233 | Type *OpTy = OpX->getType(); | |||
234 | ||||
235 | IC.Builder.SetInsertPoint(User); | |||
236 | Value *V = | |||
237 | IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy}, | |||
238 | {I->getOperand(0), I->getOperand(1), | |||
239 | I->getOperand(2), OpZ, OpX, OpY}); | |||
240 | ||||
241 | IC.replaceInstUsesWith(*User, V); | |||
242 | return IC.eraseInstFromFunction(*User); | |||
243 | } | |||
244 | } | |||
245 | return None; | |||
246 | } | |||
247 | } | |||
248 | return None; | |||
249 | } | |||
250 | ||||
251 | InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | |||
252 | TTI::TargetCostKind CostKind) { | |||
253 | assert(Ty->isIntegerTy()); | |||
254 | ||||
255 | unsigned Bits = Ty->getPrimitiveSizeInBits(); | |||
256 | if (Bits == 0 || Imm.getActiveBits() >= 64) | |||
257 | return 4; | |||
258 | ||||
259 | int64_t SImmVal = Imm.getSExtValue(); | |||
260 | uint64_t ZImmVal = Imm.getZExtValue(); | |||
261 | if (!ST->isThumb()) { | |||
262 | if ((SImmVal >= 0 && SImmVal < 65536) || | |||
263 | (ARM_AM::getSOImmVal(ZImmVal) != -1) || | |||
264 | (ARM_AM::getSOImmVal(~ZImmVal) != -1)) | |||
265 | return 1; | |||
266 | return ST->hasV6T2Ops() ? 2 : 3; | |||
267 | } | |||
268 | if (ST->isThumb2()) { | |||
269 | if ((SImmVal >= 0 && SImmVal < 65536) || | |||
270 | (ARM_AM::getT2SOImmVal(ZImmVal) != -1) || | |||
271 | (ARM_AM::getT2SOImmVal(~ZImmVal) != -1)) | |||
272 | return 1; | |||
273 | return ST->hasV6T2Ops() ? 2 : 3; | |||
274 | } | |||
275 | // Thumb1: any i8 immediate costs 1. | |||
276 | if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256)) | |||
277 | return 1; | |||
278 | if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal)) | |||
279 | return 2; | |||
280 | // Load from constantpool. | |||
281 | return 3; | |||
282 | } | |||
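// Illustrative data points for the cost model above (encodings assumed from
// the architecture, not computed here): 0xFF000000 is an ARM-mode rotated
// immediate (getSOImmVal != -1) -> cost 1; 0x00AB00AB is a Thumb2 modified
// immediate (getT2SOImmVal != -1) -> cost 1; an arbitrary constant such as
// 0x12345678 is materialised with MOVW+MOVT on v6T2 -> cost 2, or loaded
// from a constant pool otherwise -> cost 3.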
283 | ||||
284 | // Constants smaller than 256 fit in the immediate field of | |||
285 | // Thumb1 instructions, so we return a cost of zero; otherwise we return 1. | |||
286 | InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, | |||
287 | const APInt &Imm, Type *Ty) { | |||
288 | if (Imm.isNonNegative() && Imm.getLimitedValue() < 256) | |||
289 | return 0; | |||
290 | ||||
291 | return 1; | |||
292 | } | |||
293 | ||||
294 | // Checks whether Inst is part of a min(max()) or max(min()) pattern | |||
295 | // that will match to an SSAT instruction | |||
296 | static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { | |||
297 | Value *LHS, *RHS; | |||
298 | ConstantInt *C; | |||
299 | SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor; | |||
300 | ||||
301 | if (InstSPF == SPF_SMAX && | |||
302 | PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) && | |||
303 | C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) { | |||
304 | ||||
305 | auto isSSatMin = [&](Value *MinInst) { | |||
306 | if (isa<SelectInst>(MinInst)) { | |||
307 | Value *MinLHS, *MinRHS; | |||
308 | ConstantInt *MinC; | |||
309 | SelectPatternFlavor MinSPF = | |||
310 | matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor; | |||
311 | if (MinSPF == SPF_SMIN && | |||
312 | PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) && | |||
313 | MinC->getValue() == ((-Imm) - 1)) | |||
314 | return true; | |||
315 | } | |||
316 | return false; | |||
317 | }; | |||
318 | ||||
319 | if (isSSatMin(Inst->getOperand(1)) || | |||
320 | (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) || | |||
321 | isSSatMin(*(++Inst->user_begin()))))) | |||
322 | return true; | |||
323 | } | |||
324 | return false; | |||
325 | } | |||
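// Shape of the pattern recognised above, for Imm == -128 (illustrative IR):
//   %min = select (icmp slt %x, 127), %x, 127        ; SPF_SMIN, MinC == 127
//   %res = select (icmp sgt %min, -128), %min, -128  ; SPF_SMAX, C == -128
// This clamps %x to [-128, 127], which codegen can select as "ssat r0, #8, r1".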
326 | ||||
327 | InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, | |||
328 | const APInt &Imm, Type *Ty, | |||
329 | TTI::TargetCostKind CostKind, | |||
330 | Instruction *Inst) { | |||
331 | // Division by a constant can be turned into multiplication, but only if we | |||
332 | // know it's constant. So it's not so much that the immediate is cheap (it's | |||
333 | // not), but that the alternative is worse. | |||
334 | // FIXME: this is probably unneeded with GlobalISel. | |||
335 | if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv || | |||
| ||||
336 | Opcode == Instruction::SRem || Opcode == Instruction::URem) && | |||
337 | Idx == 1) | |||
338 | return 0; | |||
339 | ||||
340 | // Leave any gep offsets for the CodeGenPrepare, which will do a better job at | |||
341 | // splitting any large offsets. | |||
342 | if (Opcode == Instruction::GetElementPtr && Idx != 0) | |||
343 | return 0; | |||
344 | ||||
345 | if (Opcode == Instruction::And) { | |||
346 | // UXTB/UXTH | |||
347 | if (Imm == 255 || Imm == 65535) | |||
348 | return 0; | |||
349 | // Conversion to BIC is free, and means we can use ~Imm instead. | |||
350 | return std::min(getIntImmCost(Imm, Ty, CostKind), | |||
351 | getIntImmCost(~Imm, Ty, CostKind)); | |||
352 | } | |||
353 | ||||
354 | if (Opcode == Instruction::Add) | |||
355 | // Conversion to SUB is free, and means we can use -Imm instead. | |||
356 | return std::min(getIntImmCost(Imm, Ty, CostKind), | |||
357 | getIntImmCost(-Imm, Ty, CostKind)); | |||
358 | ||||
359 | if (Opcode == Instruction::ICmp && Imm.isNegative() && | |||
360 | Ty->getIntegerBitWidth() == 32) { | |||
361 | int64_t NegImm = -Imm.getSExtValue(); | |||
362 | if (ST->isThumb2() && NegImm < 1<<12) | |||
363 | // icmp X, #-C -> cmn X, #C | |||
364 | return 0; | |||
365 | if (ST->isThumb() && NegImm < 1<<8) | |||
366 | // icmp X, #-C -> adds X, #C | |||
367 | return 0; | |||
368 | } | |||
369 | ||||
370 | // xor a, -1 can always be folded to MVN | |||
371 | if (Opcode == Instruction::Xor && Imm.isAllOnesValue()) | |||
372 | return 0; | |||
373 | ||||
374 | // Ensure the negative constant of min(max()) or max(min()) patterns that | |||
375 | // match to SSAT instructions doesn't get hoisted. | |||
376 | if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) && | |||
377 | Ty->getIntegerBitWidth() <= 32) { | |||
378 | if (isSSATMinMaxPattern(Inst, Imm) || | |||
379 | (isa<ICmpInst>(Inst) && Inst->hasOneUse() && | |||
380 | isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm))) | |||
381 | return 0; | |||
382 | } | |||
383 | ||||
384 | return getIntImmCost(Imm, Ty, CostKind); | |||
385 | } | |||
386 | ||||
387 | InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode, | |||
388 | TTI::TargetCostKind CostKind, | |||
389 | const Instruction *I) { | |||
390 | if (CostKind == TTI::TCK_RecipThroughput && | |||
391 | (ST->hasNEON() || ST->hasMVEIntegerOps())) { | |||
392 | // FIXME: The vectorizer is highly sensitive to the cost of these | |||
393 | // instructions, which suggests that it may be using the costs incorrectly. | |||
394 | // But, for now, just make them free to avoid performance regressions for | |||
395 | // vector targets. | |||
396 | return 0; | |||
397 | } | |||
398 | return BaseT::getCFInstrCost(Opcode, CostKind, I); | |||
399 | } | |||
400 | ||||
401 | InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | |||
402 | Type *Src, | |||
403 | TTI::CastContextHint CCH, | |||
404 | TTI::TargetCostKind CostKind, | |||
405 | const Instruction *I) { | |||
406 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
407 | assert(ISD && "Invalid opcode"); | |||
408 | ||||
409 | // TODO: Allow non-throughput costs that aren't binary. | |||
410 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | |||
411 | if (CostKind != TTI::TCK_RecipThroughput) | |||
412 | return Cost == 0 ? 0 : 1; | |||
413 | return Cost; | |||
414 | }; | |||
415 | auto IsLegalFPType = [this](EVT VT) { | |||
416 | EVT EltVT = VT.getScalarType(); | |||
417 | return (EltVT == MVT::f32 && ST->hasVFP2Base()) || | |||
418 | (EltVT == MVT::f64 && ST->hasFP64()) || | |||
419 | (EltVT == MVT::f16 && ST->hasFullFP16()); | |||
420 | }; | |||
421 | ||||
422 | EVT SrcTy = TLI->getValueType(DL, Src); | |||
423 | EVT DstTy = TLI->getValueType(DL, Dst); | |||
424 | ||||
425 | if (!SrcTy.isSimple() || !DstTy.isSimple()) | |||
426 | return AdjustCost( | |||
427 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
428 | ||||
429 | // Extending masked loads / truncating masked stores is expensive because we | |||
430 | // currently don't split them. This means that we'll likely end up | |||
431 | // loading/storing each element individually (hence the high cost). | |||
432 | if ((ST->hasMVEIntegerOps() && | |||
433 | (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt || | |||
434 | Opcode == Instruction::SExt)) || | |||
435 | (ST->hasMVEFloatOps() && | |||
436 | (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) && | |||
437 | IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))) | |||
438 | if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128) | |||
439 | return 2 * DstTy.getVectorNumElements() * | |||
440 | ST->getMVEVectorCostFactor(CostKind); | |||
441 | ||||
442 | // The extend of other kinds of load is free | |||
443 | if (CCH == TTI::CastContextHint::Normal || | |||
444 | CCH == TTI::CastContextHint::Masked) { | |||
445 | static const TypeConversionCostTblEntry LoadConversionTbl[] = { | |||
446 | {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, | |||
447 | {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, | |||
448 | {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0}, | |||
449 | {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0}, | |||
450 | {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0}, | |||
451 | {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0}, | |||
452 | {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1}, | |||
453 | {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1}, | |||
454 | {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1}, | |||
455 | {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1}, | |||
456 | {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1}, | |||
457 | {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1}, | |||
458 | }; | |||
459 | if (const auto *Entry = ConvertCostTableLookup( | |||
460 | LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) | |||
461 | return AdjustCost(Entry->Cost); | |||
462 | ||||
463 | static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { | |||
464 | {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0}, | |||
465 | {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0}, | |||
466 | {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0}, | |||
467 | {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0}, | |||
468 | {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0}, | |||
469 | {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0}, | |||
470 | // The following extend from a legal type to an illegal type, so they need | |||
471 | // to split the load. This introduces an extra load operation, but the | |||
472 | // extend is still "free". | |||
473 | {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1}, | |||
474 | {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1}, | |||
475 | {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3}, | |||
476 | {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3}, | |||
477 | {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1}, | |||
478 | {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1}, | |||
479 | }; | |||
480 | if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { | |||
481 | if (const auto *Entry = | |||
482 | ConvertCostTableLookup(MVELoadConversionTbl, ISD, | |||
483 | DstTy.getSimpleVT(), SrcTy.getSimpleVT())) | |||
484 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); | |||
485 | } | |||
486 | ||||
487 | static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { | |||
488 | // FPExtends are similar but also require the VCVT instructions. | |||
489 | {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, | |||
490 | {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3}, | |||
491 | }; | |||
492 | if (SrcTy.isVector() && ST->hasMVEFloatOps()) { | |||
493 | if (const auto *Entry = | |||
494 | ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, | |||
495 | DstTy.getSimpleVT(), SrcTy.getSimpleVT())) | |||
496 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); | |||
497 | } | |||
498 | ||||
499 | // The truncate of a store is free. This is the mirror of extends above. | |||
500 | static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = { | |||
501 | {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0}, | |||
502 | {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0}, | |||
503 | {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0}, | |||
504 | {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1}, | |||
505 | {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1}, | |||
506 | {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3}, | |||
507 | {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1}, | |||
508 | }; | |||
509 | if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { | |||
510 | if (const auto *Entry = | |||
511 | ConvertCostTableLookup(MVEStoreConversionTbl, ISD, | |||
512 | SrcTy.getSimpleVT(), DstTy.getSimpleVT())) | |||
513 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); | |||
514 | } | |||
515 | ||||
516 | static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = { | |||
517 | {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1}, | |||
518 | {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3}, | |||
519 | }; | |||
520 | if (SrcTy.isVector() && ST->hasMVEFloatOps()) { | |||
521 | if (const auto *Entry = | |||
522 | ConvertCostTableLookup(MVEFStoreConversionTbl, ISD, | |||
523 | SrcTy.getSimpleVT(), DstTy.getSimpleVT())) | |||
524 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); | |||
525 | } | |||
526 | } | |||
527 | ||||
528 | // NEON vector operations that can extend their inputs. | |||
529 | if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) && | |||
530 | I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) { | |||
531 | static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = { | |||
532 | // vaddl | |||
533 | { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 }, | |||
534 | { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 }, | |||
535 | // vsubl | |||
536 | { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 }, | |||
537 | { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 }, | |||
538 | // vmull | |||
539 | { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 }, | |||
540 | { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 }, | |||
541 | // vshll | |||
542 | { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 }, | |||
543 | { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 }, | |||
544 | }; | |||
545 | ||||
546 | auto *User = cast<Instruction>(*I->user_begin()); | |||
547 | int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode()); | |||
548 | if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD, | |||
549 | DstTy.getSimpleVT(), | |||
550 | SrcTy.getSimpleVT())) { | |||
551 | return AdjustCost(Entry->Cost); | |||
552 | } | |||
553 | } | |||
554 | ||||
555 | // Single to/from double precision conversions. | |||
556 | if (Src->isVectorTy() && ST->hasNEON() && | |||
557 | ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 && | |||
558 | DstTy.getScalarType() == MVT::f32) || | |||
559 | (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 && | |||
560 | DstTy.getScalarType() == MVT::f64))) { | |||
561 | static const CostTblEntry NEONFltDblTbl[] = { | |||
562 | // Vector fptrunc/fpext conversions. | |||
563 | {ISD::FP_ROUND, MVT::v2f64, 2}, | |||
564 | {ISD::FP_EXTEND, MVT::v2f32, 2}, | |||
565 | {ISD::FP_EXTEND, MVT::v4f32, 4}}; | |||
566 | ||||
567 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); | |||
568 | if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) | |||
569 | return AdjustCost(LT.first * Entry->Cost); | |||
570 | } | |||
571 | ||||
572 | // Some arithmetic, load and store operations have specific instructions | |||
573 | // to cast up/down their types automatically at no extra cost. | |||
574 | // TODO: Get these tables to know at least what the related operations are. | |||
575 | static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { | |||
576 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | |||
577 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | |||
578 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, | |||
579 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, | |||
580 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, | |||
581 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, | |||
582 | ||||
583 | // The number of vmovl instructions for the extension. | |||
584 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | |||
585 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | |||
586 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, | |||
587 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, | |||
588 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, | |||
589 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, | |||
590 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, | |||
591 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, | |||
592 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
593 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
594 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
595 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
596 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | |||
597 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | |||
598 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | |||
599 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | |||
600 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | |||
601 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | |||
602 | ||||
603 | // Operations that we legalize using splitting. | |||
604 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, | |||
605 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, | |||
606 | ||||
607 | // Vector float <-> i32 conversions. | |||
608 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
609 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
610 | ||||
611 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | |||
612 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | |||
613 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, | |||
614 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 }, | |||
615 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
616 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
617 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, | |||
618 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, | |||
619 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, | |||
620 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, | |||
621 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
622 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
623 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
624 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
625 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, | |||
626 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, | |||
627 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, | |||
628 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 }, | |||
629 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, | |||
630 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 }, | |||
631 | ||||
632 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
633 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
634 | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 }, | |||
635 | { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 }, | |||
636 | { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
637 | { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
638 | ||||
639 | // Vector double <-> i32 conversions. | |||
640 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
641 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
642 | ||||
643 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | |||
644 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | |||
645 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, | |||
646 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 }, | |||
647 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
648 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
649 | ||||
650 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, | |||
651 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, | |||
652 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 }, | |||
653 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 }, | |||
654 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 }, | |||
655 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 } | |||
656 | }; | |||
657 | ||||
658 | if (SrcTy.isVector() && ST->hasNEON()) { | |||
659 | if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, | |||
660 | DstTy.getSimpleVT(), | |||
661 | SrcTy.getSimpleVT())) | |||
662 | return AdjustCost(Entry->Cost); | |||
663 | } | |||
664 | ||||
665 | // Scalar float to integer conversions. | |||
666 | static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { | |||
667 | { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, | |||
668 | { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, | |||
669 | { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, | |||
670 | { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 }, | |||
671 | { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 }, | |||
672 | { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 }, | |||
673 | { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 }, | |||
674 | { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 }, | |||
675 | { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 }, | |||
676 | { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 }, | |||
677 | { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 }, | |||
678 | { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 }, | |||
679 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 }, | |||
680 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 }, | |||
681 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 }, | |||
682 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 }, | |||
683 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 }, | |||
684 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 }, | |||
685 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 }, | |||
686 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } | |||
687 | }; | |||
688 | if (SrcTy.isFloatingPoint() && ST->hasNEON()) { | |||
689 | if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, | |||
690 | DstTy.getSimpleVT(), | |||
691 | SrcTy.getSimpleVT())) | |||
692 | return AdjustCost(Entry->Cost); | |||
693 | } | |||
694 | ||||
695 | // Scalar integer to float conversions. | |||
696 | static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { | |||
697 | { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, | |||
698 | { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, | |||
699 | { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, | |||
700 | { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 }, | |||
701 | { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 }, | |||
702 | { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 }, | |||
703 | { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 }, | |||
704 | { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 }, | |||
705 | { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 }, | |||
706 | { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 }, | |||
707 | { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 }, | |||
708 | { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 }, | |||
709 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 }, | |||
710 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 }, | |||
711 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 }, | |||
712 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 }, | |||
713 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 }, | |||
714 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 }, | |||
715 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 }, | |||
716 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 } | |||
717 | }; | |||
718 | ||||
719 | if (SrcTy.isInteger() && ST->hasNEON()) { | |||
720 | if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, | |||
721 | ISD, DstTy.getSimpleVT(), | |||
722 | SrcTy.getSimpleVT())) | |||
723 | return AdjustCost(Entry->Cost); | |||
724 | } | |||
725 | ||||
726 | // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one | |||
727 | // instruction, i8->i32 is two. i64 zexts are a VAND with a constant; sexts | |||
728 | // are linearised so take more. | |||
729 | static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = { | |||
730 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | |||
731 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, | |||
732 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, | |||
733 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, | |||
734 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 }, | |||
735 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 }, | |||
736 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | |||
737 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, | |||
738 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 }, | |||
739 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, | |||
740 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 }, | |||
741 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 }, | |||
742 | }; | |||
743 | ||||
744 | if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { | |||
745 | if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl, | |||
746 | ISD, DstTy.getSimpleVT(), | |||
747 | SrcTy.getSimpleVT())) | |||
748 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind); | |||
749 | } | |||
750 | ||||
751 | if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) { | |||
752 | // As a general rule, fp converts that were not matched above are scalarized | |||
753 | // and cost 1 vcvt per lane, so long as the instruction is available. | |||
754 | // If not, they become a series of function calls. | |||
755 | const InstructionCost CallCost = | |||
756 | getCallInstrCost(nullptr, Dst, {Src}, CostKind); | |||
757 | int Lanes = 1; | |||
758 | if (SrcTy.isFixedLengthVector()) | |||
759 | Lanes = SrcTy.getVectorNumElements(); | |||
760 | ||||
761 | if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)) | |||
762 | return Lanes; | |||
763 | else | |||
764 | return Lanes * CallCost; | |||
765 | } | |||
766 | ||||
767 | if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() && | |||
768 | SrcTy.isFixedLengthVector()) { | |||
769 | // Treat a truncate with a larger-than-legal source (128 bits for MVE) as | |||
770 | // expensive: 2 instructions per lane. | |||
771 | if ((SrcTy.getScalarType() == MVT::i8 || | |||
772 | SrcTy.getScalarType() == MVT::i16 || | |||
773 | SrcTy.getScalarType() == MVT::i32) && | |||
774 | SrcTy.getSizeInBits() > 128 && | |||
775 | SrcTy.getSizeInBits() > DstTy.getSizeInBits()) | |||
776 | return SrcTy.getVectorNumElements() * 2; | |||
777 | } | |||
778 | ||||
779 | // Scalar integer conversion costs. | |||
780 | static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { | |||
781 | // i16 -> i64 requires two dependent operations. | |||
782 | { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 }, | |||
783 | ||||
784 | // Truncates on i64 are assumed to be free. | |||
785 | { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 }, | |||
786 | { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 }, | |||
787 | { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 }, | |||
788 | { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 } | |||
789 | }; | |||
790 | ||||
791 | if (SrcTy.isInteger()) { | |||
792 | if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, | |||
793 | DstTy.getSimpleVT(), | |||
794 | SrcTy.getSimpleVT())) | |||
795 | return AdjustCost(Entry->Cost); | |||
796 | } | |||
797 | ||||
798 | int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() | |||
799 | ? ST->getMVEVectorCostFactor(CostKind) | |||
800 | : 1; | |||
801 | return AdjustCost( | |||
802 | BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
803 | } | |||
804 | ||||
805 | InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, | |||
806 | unsigned Index) { | |||
807 | // Penalize inserting into a D-subregister. We end up with a three times | |||
808 | // lower estimated throughput on Swift. | |||
809 | if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement && | |||
810 | ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32) | |||
811 | return 3; | |||
812 | ||||
813 | if (ST->hasNEON() && (Opcode == Instruction::InsertElement || | |||
814 | Opcode == Instruction::ExtractElement)) { | |||
815 | // Cross-class copies are expensive on many microarchitectures, | |||
816 | // so assume they are expensive by default. | |||
817 | if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy()) | |||
818 | return 3; | |||
819 | ||||
820 | // Even if it's not a cross-class copy, this likely leads to mixing | |||
821 | // of NEON and VFP code and should therefore be penalized. | |||
822 | if (ValTy->isVectorTy() && | |||
823 | ValTy->getScalarSizeInBits() <= 32) | |||
824 | return std::max<InstructionCost>( | |||
825 | BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); | |||
826 | } | |||
827 | ||||
828 | if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement || | |||
829 | Opcode == Instruction::ExtractElement)) { | |||
830 | // Integer cross-lane moves are more expensive than float, which can | |||
831 | // sometimes just be vmovs. Integers involve being passed to GPR registers, | |||
832 | // causing more of a delay. | |||
833 | std::pair<InstructionCost, MVT> LT = | |||
834 | getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType()); | |||
835 | return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1); | |||
836 | } | |||
837 | ||||
838 | return BaseT::getVectorInstrCost(Opcode, ValTy, Index); | |||
839 | } | |||
840 | ||||
841 | InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, | |||
842 | Type *CondTy, | |||
843 | CmpInst::Predicate VecPred, | |||
844 | TTI::TargetCostKind CostKind, | |||
845 | const Instruction *I) { | |||
846 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
847 | ||||
848 | // Thumb scalar code size cost for select. | |||
849 | if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT && | |||
850 | ST->isThumb() && !ValTy->isVectorTy()) { | |||
851 | // Assume expensive structs. | |||
852 | if (TLI->getValueType(DL, ValTy, true) == MVT::Other) | |||
853 | return TTI::TCC_Expensive; | |||
854 | ||||
855 | // Select costs can vary because they: | |||
856 | // - may require one or more conditional mov (including an IT), | |||
857 | // - can't operate directly on immediates, | |||
858 | // - require live flags, which we can't copy around easily. | |||
859 | InstructionCost Cost = TLI->getTypeLegalizationCost(DL, ValTy).first; | |||
860 | ||||
861 | // Possible IT instruction for Thumb2, or more for Thumb1. | |||
862 | ++Cost; | |||
863 | ||||
864 | // i1 values may need rematerialising by using mov immediates and/or | |||
865 | // flag setting instructions. | |||
866 | if (ValTy->isIntegerTy(1)) | |||
867 | ++Cost; | |||
868 | ||||
869 | return Cost; | |||
870 | } | |||
871 | ||||
872 | // If this is a vector min/max/abs, use the cost of that intrinsic directly | |||
873 | // instead. Hopefully when min/max intrinsics are more prevalent this code | |||
874 | // will not be needed. | |||
875 | const Instruction *Sel = I; | |||
876 | if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel && | |||
877 | Sel->hasOneUse()) | |||
878 | Sel = cast<Instruction>(Sel->user_back()); | |||
879 | if (Sel && ValTy->isVectorTy() && | |||
880 | (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) { | |||
881 | const Value *LHS, *RHS; | |||
882 | SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor; | |||
883 | unsigned IID = 0; | |||
884 | switch (SPF) { | |||
885 | case SPF_ABS: | |||
886 | IID = Intrinsic::abs; | |||
887 | break; | |||
888 | case SPF_SMIN: | |||
889 | IID = Intrinsic::smin; | |||
890 | break; | |||
891 | case SPF_SMAX: | |||
892 | IID = Intrinsic::smax; | |||
893 | break; | |||
894 | case SPF_UMIN: | |||
895 | IID = Intrinsic::umin; | |||
896 | break; | |||
897 | case SPF_UMAX: | |||
898 | IID = Intrinsic::umax; | |||
899 | break; | |||
900 | case SPF_FMINNUM: | |||
901 | IID = Intrinsic::minnum; | |||
902 | break; | |||
903 | case SPF_FMAXNUM: | |||
904 | IID = Intrinsic::maxnum; | |||
905 | break; | |||
906 | default: | |||
907 | break; | |||
908 | } | |||
909 | if (IID) { | |||
910 | // The ICmp is free; the select gets the cost of the min/max/etc. | |||
911 | if (Sel != I) | |||
912 | return 0; | |||
913 | IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy}); | |||
914 | return getIntrinsicInstrCost(CostAttrs, CostKind); | |||
915 | } | |||
916 | } | |||
917 | ||||
918 | // On NEON a vector select gets lowered to vbsl. | |||
919 | if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) { | |||
920 | // Lowering of some vector selects is currently far from perfect. | |||
921 | static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = { | |||
922 | { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 }, | |||
923 | { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 }, | |||
924 | { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 } | |||
925 | }; | |||
926 | ||||
927 | EVT SelCondTy = TLI->getValueType(DL, CondTy); | |||
928 | EVT SelValTy = TLI->getValueType(DL, ValTy); | |||
929 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { | |||
930 | if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, | |||
931 | SelCondTy.getSimpleVT(), | |||
932 | SelValTy.getSimpleVT())) | |||
933 | return Entry->Cost; | |||
934 | } | |||
935 | ||||
936 | std::pair<InstructionCost, MVT> LT = | |||
937 | TLI->getTypeLegalizationCost(DL, ValTy); | |||
938 | return LT.first; | |||
939 | } | |||
940 | ||||
941 | if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() && | |||
942 | (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && | |||
943 | cast<FixedVectorType>(ValTy)->getNumElements() > 1) { | |||
944 | FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy); | |||
945 | FixedVectorType *VecCondTy = dyn_cast_or_null<FixedVectorType>(CondTy); | |||
946 | if (!VecCondTy) | |||
947 | VecCondTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecValTy)); | |||
948 | ||||
949 | // If we don't have mve.fp, any fp operations will need to be scalarized. | |||
950 | if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) { | |||
951 | // One scalarization insert, one scalarization extract and the cost of the | |||
952 | // fcmps. | |||
953 | return BaseT::getScalarizationOverhead(VecValTy, false, true) + | |||
954 | BaseT::getScalarizationOverhead(VecCondTy, true, false) + | |||
955 | VecValTy->getNumElements() * | |||
956 | getCmpSelInstrCost(Opcode, ValTy->getScalarType(), | |||
957 | VecCondTy->getScalarType(), VecPred, CostKind, | |||
958 | I); | |||
959 | } | |||
960 | ||||
961 | std::pair<InstructionCost, MVT> LT = | |||
962 | TLI->getTypeLegalizationCost(DL, ValTy); | |||
963 | int BaseCost = ST->getMVEVectorCostFactor(CostKind); | |||
964 | // There are two types - the input that specifies the type of the compare | |||
965 | // and the output vXi1 type. Because we don't know how the output will be | |||
966 | // split, we may need an expensive shuffle to get two in sync. This has the | |||
967 | // effect of making larger than legal compares (v8i32 for example) | |||
968 | // expensive. | |||
969 | if (LT.second.getVectorNumElements() > 2) { | |||
970 | if (LT.first > 1) | |||
971 | return LT.first * BaseCost + | |||
972 | BaseT::getScalarizationOverhead(VecCondTy, true, false); | |||
973 | return BaseCost; | |||
974 | } | |||
975 | } | |||
976 | ||||
977 | // Default to cheap (throughput/size of 1 instruction) but adjust throughput | |||
978 | // for "multiple beats" potentially needed by MVE instructions. | |||
979 | int BaseCost = 1; | |||
980 | if (ST->hasMVEIntegerOps() && ValTy->isVectorTy()) | |||
981 | BaseCost = ST->getMVEVectorCostFactor(CostKind); | |||
982 | ||||
983 | return BaseCost * | |||
984 | BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); | |||
985 | } | |||
986 | ||||
987 | InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty, | |||
988 | ScalarEvolution *SE, | |||
989 | const SCEV *Ptr) { | |||
990 | // Address computations in vectorized code with non-consecutive addresses will | |||
991 | // likely result in more instructions compared to scalar code where the | |||
992 | // computation can more often be merged into the index mode. The resulting | |||
993 | // extra micro-ops can significantly decrease throughput. | |||
994 | unsigned NumVectorInstToHideOverhead = 10; | |||
995 | int MaxMergeDistance = 64; | |||
996 | ||||
997 | if (ST->hasNEON()) { | |||
998 | if (Ty->isVectorTy() && SE && | |||
999 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) | |||
1000 | return NumVectorInstToHideOverhead; | |||
1001 | ||||
1002 | // In many cases the address computation is not merged into the instruction | |||
1003 | // addressing mode. | |||
1004 | return 1; | |||
1005 | } | |||
1006 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); | |||
1007 | } | |||
1008 | ||||
1009 | bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) { | |||
1010 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { | |||
1011 | // If a VCTP is part of a chain, it's already profitable and shouldn't be | |||
1012 | // optimized, else LSR may block tail-predication. | |||
1013 | switch (II->getIntrinsicID()) { | |||
1014 | case Intrinsic::arm_mve_vctp8: | |||
1015 | case Intrinsic::arm_mve_vctp16: | |||
1016 | case Intrinsic::arm_mve_vctp32: | |||
1017 | case Intrinsic::arm_mve_vctp64: | |||
1018 | return true; | |||
1019 | default: | |||
1020 | break; | |||
1021 | } | |||
1022 | } | |||
1023 | return false; | |||
1024 | } | |||
1025 | ||||
1026 | bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { | |||
1027 | if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) | |||
1028 | return false; | |||
1029 | ||||
1030 | if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) { | |||
1031 | // Don't support v2i1 yet. | |||
1032 | if (VecTy->getNumElements() == 2) | |||
1033 | return false; | |||
1034 | ||||
1035 | // We don't support extending fp types. | |||
1036 | unsigned VecWidth = DataTy->getPrimitiveSizeInBits(); | |||
1037 | if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy()) | |||
1038 | return false; | |||
1039 | } | |||
1040 | ||||
1041 | unsigned EltWidth = DataTy->getScalarSizeInBits(); | |||
1042 | return (EltWidth == 32 && Alignment >= 4) || | |||
1043 | (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8); | |||
1044 | } | |||
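// Consequences of the rule above, for illustration: a masked load of
// <4 x i32> with 4-byte alignment or of <16 x i8> with any alignment is
// legal (lowerable to a predicated MVE load such as vldrwt.u32 / vldrbt.u8),
// whereas an <8 x i16> load with 1-byte alignment is not.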
1045 | ||||
1046 | bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { | |||
1047 | if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) | |||
1048 | return false; | |||
1049 | ||||
1050 | // This method is called in 2 places: | |||
1051 | // - from the vectorizer with a scalar type, in which case we need to get | |||
1052 | // this as good as we can with the limited info we have (and rely on the cost | |||
1053 | // model for the rest). | |||
1054 | // - from the masked intrinsic lowering pass with the actual vector type. | |||
1055 | // For MVE, we have a custom lowering pass that will already have custom | |||
1056 | // legalised any gathers that we can to MVE intrinsics, and want to expand all | |||
1057 | // the rest. The pass runs before the masked intrinsic lowering pass, so if we | |||
1058 | // are here, we know we want to expand. | |||
1059 | if (isa<VectorType>(Ty)) | |||
1060 | return false; | |||
1061 | ||||
1062 | unsigned EltWidth = Ty->getScalarSizeInBits(); | |||
1063 | return ((EltWidth == 32 && Alignment >= 4) || | |||
1064 | (EltWidth == 16 && Alignment >= 2) || EltWidth == 8); | |||
1065 | } | |||
1066 | ||||
1067 | /// Given a memcpy/memset/memmove instruction, return the number of memory | |||
1068 | /// operations performed, by querying findOptimalMemOpLowering. Returns -1 if | |||
1069 | /// a library call is used instead. | |||
1070 | int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const { | |||
1071 | MemOp MOp; | |||
1072 | unsigned DstAddrSpace = ~0u; | |||
1073 | unsigned SrcAddrSpace = ~0u; | |||
1074 | const Function *F = I->getParent()->getParent(); | |||
1075 | ||||
1076 | if (const auto *MC = dyn_cast<MemTransferInst>(I)) { | |||
1077 | ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength()); | |||
1078 | // If 'size' is not a constant, a library call will be generated. | |||
1079 | if (!C) | |||
1080 | return -1; | |||
1081 | ||||
1082 | const unsigned Size = C->getValue().getZExtValue(); | |||
1083 | const Align DstAlign = *MC->getDestAlign(); | |||
1084 | const Align SrcAlign = *MC->getSourceAlign(); | |||
1085 | ||||
1086 | MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign, | |||
1087 | /*IsVolatile*/ false); | |||
1088 | DstAddrSpace = MC->getDestAddressSpace(); | |||
1089 | SrcAddrSpace = MC->getSourceAddressSpace(); | |||
1090 | } | |||
1091 | else if (const auto *MS = dyn_cast<MemSetInst>(I)) { | |||
1092 | ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength()); | |||
1093 | // If 'size' is not a constant, a library call will be generated. | |||
1094 | if (!C) | |||
1095 | return -1; | |||
1096 | ||||
1097 | const unsigned Size = C->getValue().getZExtValue(); | |||
1098 | const Align DstAlign = *MS->getDestAlign(); | |||
1099 | ||||
1100 | MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign, | |||
1101 | /*IsZeroMemset*/ false, /*IsVolatile*/ false); | |||
1102 | DstAddrSpace = MS->getDestAddressSpace(); | |||
1103 | } | |||
1104 | else | |||
1105 | llvm_unreachable("Expected a memcpy/move or memset!"); | |||
1106 | ||||
1107 | unsigned Limit, Factor = 2; | |||
1108 | switch(I->getIntrinsicID()) { | |||
1109 | case Intrinsic::memcpy: | |||
1110 | Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize()); | |||
1111 | break; | |||
1112 | case Intrinsic::memmove: | |||
1113 | Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize()); | |||
1114 | break; | |||
1115 | case Intrinsic::memset: | |||
1116 | Limit = TLI->getMaxStoresPerMemset(F->hasMinSize()); | |||
1117 | Factor = 1; | |||
1118 | break; | |||
1119 | default: | |||
1120 | llvm_unreachable("Expected a memcpy/move or memset!"); | |||
1121 | } | |||
1122 | ||||
1123 | // MemOps will be populated with a list of data types that need to be | |||
1124 | // loaded and stored. That's why we multiply the number of elements by 2 to | |||
1125 | // get the cost for this memcpy. | |||
1126 | std::vector<EVT> MemOps; | |||
1127 | if (getTLI()->findOptimalMemOpLowering( | |||
1128 | MemOps, Limit, MOp, DstAddrSpace, | |||
1129 | SrcAddrSpace, F->getAttributes())) | |||
1130 | return MemOps.size() * Factor; | |||
1131 | ||||
1132 | // If we can't find an optimal memop lowering, return the default cost | |||
1133 | return -1; | |||
1134 | } | |||
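// A worked example (sizes and resulting types are illustrative; the real
// split comes from findOptimalMemOpLowering): a memcpy of 16 bytes between
// 4-byte-aligned pointers might lower to MemOps = {i32, i32, i32, i32};
// with Factor == 2 (each element is loaded and stored) this returns 8.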
1135 | ||||
1136 | InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) { | |||
1137 | int NumOps = getNumMemOps(cast<IntrinsicInst>(I)); | |||
1138 | ||||
1139 | // To model the cost of a library call, we assume 1 for the call, and | |||
1140 | // 3 for the argument setup. | |||
1141 | if (NumOps == -1) | |||
1142 | return 4; | |||
1143 | return NumOps; | |||
1144 | } | |||
1145 | ||||
1146 | InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
1147 | VectorType *Tp, ArrayRef<int> Mask, | |||
1148 | int Index, VectorType *SubTp) { | |||
1149 | Kind = improveShuffleKindFromMask(Kind, Mask); | |||
1150 | if (ST->hasNEON()) { | |||
1151 | if (Kind == TTI::SK_Broadcast) { | |||
1152 | static const CostTblEntry NEONDupTbl[] = { | |||
1153 | // VDUP handles these cases. | |||
1154 | {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, | |||
1155 | {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, | |||
1156 | {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, | |||
1157 | {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, | |||
1158 | {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, | |||
1159 | {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, | |||
1160 | ||||
1161 | {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, | |||
1162 | {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, | |||
1163 | {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, | |||
1164 | {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}}; | |||
1165 | ||||
1166 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); | |||
1167 | if (const auto *Entry = | |||
1168 | CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) | |||
1169 | return LT.first * Entry->Cost; | |||
1170 | } | |||
1171 | if (Kind == TTI::SK_Reverse) { | |||
1172 | static const CostTblEntry NEONShuffleTbl[] = { | |||
1173 | // Reverse shuffle costs one instruction if we are shuffling within a | |||
1174 | // double word (vrev) or two if we shuffle a quad word (vrev, vext). | |||
1175 | {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, | |||
1176 | {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, | |||
1177 | {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, | |||
1178 | {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, | |||
1179 | {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1}, | |||
1180 | {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1}, | |||
1181 | ||||
1182 | {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, | |||
1183 | {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, | |||
1184 | {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, | |||
1185 | {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; | |||
1186 | ||||
1187 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); | |||
1188 | if (const auto *Entry = | |||
1189 | CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) | |||
1190 | return LT.first * Entry->Cost; | |||
1191 | } | |||
1192 | if (Kind == TTI::SK_Select) { | |||
1193 | static const CostTblEntry NEONSelShuffleTbl[] = { | |||
1194 | // Select shuffle cost table for ARM. Cost is the number of instructions
1195 | // required to create the shuffled vector.
1196 |
1197 | ||||
1198 | {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1}, | |||
1199 | {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, | |||
1200 | {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, | |||
1201 | {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, | |||
1202 | ||||
1203 | {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, | |||
1204 | {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, | |||
1205 | {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2}, | |||
1206 | ||||
1207 | {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16}, | |||
1208 | ||||
1209 | {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; | |||
1210 | ||||
1211 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); | |||
1212 | if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl, | |||
1213 | ISD::VECTOR_SHUFFLE, LT.second)) | |||
1214 | return LT.first * Entry->Cost; | |||
1215 | } | |||
1216 | } | |||
1217 | if (ST->hasMVEIntegerOps()) { | |||
1218 | if (Kind == TTI::SK_Broadcast) { | |||
1219 | static const CostTblEntry MVEDupTbl[] = { | |||
1220 | // VDUP handles these cases. | |||
1221 | {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, | |||
1222 | {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, | |||
1223 | {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}, | |||
1224 | {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, | |||
1225 | {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}}; | |||
1226 | ||||
1227 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); | |||
1228 | if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE, | |||
1229 | LT.second)) | |||
1230 | return LT.first * Entry->Cost * | |||
1231 | ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput); | |||
1232 | } | |||
1233 | ||||
1234 | if (!Mask.empty()) { | |||
1235 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); | |||
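| // A VREV mask reverses elements within fixed-size blocks; e.g. on v8i16 the
| // mask <1,0,3,2,5,4,7,6> swaps the 16-bit lanes inside each 32-bit block
| // and corresponds to a single VREV32.16.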
1236 | if (Mask.size() <= LT.second.getVectorNumElements() && | |||
1237 | (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) || | |||
1238 | isVREVMask(Mask, LT.second, 64))) | |||
1239 | return ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first; | |||
1240 | } | |||
1241 | } | |||
1242 | ||||
1243 | int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy() | |||
1244 | ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) | |||
1245 | : 1; | |||
1246 | return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); | |||
1247 | } | |||
1248 | ||||
1249 | InstructionCost ARMTTIImpl::getArithmeticInstrCost( | |||
1250 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | |||
1251 | TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, | |||
1252 | TTI::OperandValueProperties Opd1PropInfo, | |||
1253 | TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, | |||
1254 | const Instruction *CxtI) { | |||
1255 | int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); | |||
1256 | if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) { | |||
1257 | // Make operations on i1 relatively expensive as this often involves | |||
1258 | // combining predicates. AND and XOR should be easier to handle with IT | |||
1259 | // blocks. | |||
1260 | switch (ISDOpcode) { | |||
1261 | default: | |||
1262 | break; | |||
1263 | case ISD::AND: | |||
1264 | case ISD::XOR: | |||
1265 | return 2; | |||
1266 | case ISD::OR: | |||
1267 | return 3; | |||
1268 | } | |||
1269 | } | |||
1270 | ||||
1271 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | |||
1272 | ||||
1273 | if (ST->hasNEON()) { | |||
1274 | const unsigned FunctionCallDivCost = 20; | |||
1275 | const unsigned ReciprocalDivCost = 10; | |||
1276 | static const CostTblEntry CostTbl[] = { | |||
1277 | // Division. | |||
1278 | // These costs are somewhat arbitrary. Choose a cost of 20 to indicate that
1279 | // vectorizing division (an added function call) is going to be very expensive.
1280 | // Double register types.
1281 | { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost}, | |||
1282 | { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost}, | |||
1283 | { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost}, | |||
1284 | { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost}, | |||
1285 | { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost}, | |||
1286 | { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost}, | |||
1287 | { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost}, | |||
1288 | { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost}, | |||
1289 | { ISD::SDIV, MVT::v4i16, ReciprocalDivCost}, | |||
1290 | { ISD::UDIV, MVT::v4i16, ReciprocalDivCost}, | |||
1291 | { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost}, | |||
1292 | { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost}, | |||
1293 | { ISD::SDIV, MVT::v8i8, ReciprocalDivCost}, | |||
1294 | { ISD::UDIV, MVT::v8i8, ReciprocalDivCost}, | |||
1295 | { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost}, | |||
1296 | { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost}, | |||
1297 | // Quad register types. | |||
1298 | { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost}, | |||
1299 | { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost}, | |||
1300 | { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost}, | |||
1301 | { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost}, | |||
1302 | { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost}, | |||
1303 | { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost}, | |||
1304 | { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost}, | |||
1305 | { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost}, | |||
1306 | { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost}, | |||
1307 | { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost}, | |||
1308 | { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost}, | |||
1309 | { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost}, | |||
1310 | { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost}, | |||
1311 | { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost}, | |||
1312 | { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost}, | |||
1313 | { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, | |||
1314 | // Multiplication. | |||
1315 | }; | |||
1316 | ||||
1317 | if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) | |||
1318 | return LT.first * Entry->Cost; | |||
1319 | ||||
1320 | InstructionCost Cost = BaseT::getArithmeticInstrCost( | |||
1321 | Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo); | |||
1322 | ||||
1323 | // This is somewhat of a hack. The problem that we are facing is that SROA | |||
1324 | // creates sequences of shift/and/or instructions to construct values.
1325 | // These sequences are recognized by ISel and have zero cost. Not so for
1326 | // the vectorized code. Because we have support for v2i64 but not i64, those
1327 | // sequences look particularly beneficial to vectorize. | |||
1328 | // To work around this we increase the cost of v2i64 operations to make them | |||
1329 | // seem less beneficial. | |||
1330 | if (LT.second == MVT::v2i64 && | |||
1331 | Op2Info == TargetTransformInfo::OK_UniformConstantValue) | |||
1332 | Cost += 4; | |||
1333 | ||||
1334 | return Cost; | |||
1335 | } | |||
1336 | ||||
1337 | // If this operation is a shift on arm/thumb2, it might well be folded into | |||
1338 | // the following instruction, hence having a cost of 0. | |||
1339 | auto LooksLikeAFreeShift = [&]() { | |||
1340 | if (ST->isThumb1Only() || Ty->isVectorTy()) | |||
1341 | return false; | |||
1342 | ||||
1343 | if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift()) | |||
1344 | return false; | |||
1345 | if (Op2Info != TargetTransformInfo::OK_UniformConstantValue) | |||
1346 | return false; | |||
1347 | ||||
1348 | // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB.
1349 | switch (cast<Instruction>(CxtI->user_back())->getOpcode()) { | |||
1350 | case Instruction::Add: | |||
1351 | case Instruction::Sub: | |||
1352 | case Instruction::And: | |||
1353 | case Instruction::Xor: | |||
1354 | case Instruction::Or: | |||
1355 | case Instruction::ICmp: | |||
1356 | return true; | |||
1357 | default: | |||
1358 | return false; | |||
1359 | } | |||
1360 | }; | |||
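| // e.g. "(a << 2) + b" can be selected as a single "add r0, r1, r2, lsl #2",
| // making the shift itself effectively free.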
1361 | if (LooksLikeAFreeShift()) | |||
1362 | return 0; | |||
1363 | ||||
1364 | // Default to cheap (throughput/size of 1 instruction) but adjust throughput | |||
1365 | // for "multiple beats" potentially needed by MVE instructions. | |||
1366 | int BaseCost = 1; | |||
1367 | if (ST->hasMVEIntegerOps() && Ty->isVectorTy()) | |||
1368 | BaseCost = ST->getMVEVectorCostFactor(CostKind); | |||
1369 | ||||
1370 | // The rest of this mostly follows what is done in
1371 | // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1372 | // than scalars or increasing the costs for custom operations. The result
1373 | // is also multiplied by the MVEVectorCostFactor where appropriate.
1374 | if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second)) | |||
1375 | return LT.first * BaseCost; | |||
1376 | ||||
1377 | // Else this is expand, assume that we need to scalarize this op. | |||
1378 | if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { | |||
1379 | unsigned Num = VTy->getNumElements(); | |||
1380 | InstructionCost Cost = | |||
1381 | getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind); | |||
1382 | // Return the cost of multiple scalar invocations plus the cost of
1383 | // inserting and extracting the values. | |||
1384 | SmallVector<Type *> Tys(Args.size(), Ty); | |||
1385 | return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; | |||
1386 | } | |||
1387 | ||||
1388 | return BaseCost; | |||
1389 | } | |||
1390 | ||||
1391 | InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, | |||
1392 | MaybeAlign Alignment, | |||
1393 | unsigned AddressSpace, | |||
1394 | TTI::TargetCostKind CostKind, | |||
1395 | const Instruction *I) { | |||
1396 | // TODO: Handle other cost kinds. | |||
1397 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1398 | return 1; | |||
1399 | ||||
1400 | // Type legalization can't handle structs | |||
1401 | if (TLI->getValueType(DL, Src, true) == MVT::Other) | |||
1402 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
1403 | CostKind); | |||
1404 | ||||
1405 | if (ST->hasNEON() && Src->isVectorTy() && | |||
1406 | (Alignment && *Alignment != Align(16)) && | |||
1407 | cast<VectorType>(Src)->getElementType()->isDoubleTy()) { | |||
1408 | // Unaligned loads/stores are extremely inefficient. | |||
1409 | // We need 4 uops for vst.1/vld.1 vs 1 uop for vldr/vstr.
1410 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); | |||
1411 | return LT.first * 4; | |||
1412 | } | |||
1413 | ||||
1414 | // MVE can optimize an fpext(load(4xhalf)) using an extending integer load.
1415 | // Same for stores. | |||
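| // The pattern being matched is, roughly:
| //   %l = load <4 x half>, <4 x half>* %p
| //   %e = fpext <4 x half> %l to <4 x float>   ; costed as one MVE memory op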
1416 | if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I && | |||
1417 | ((Opcode == Instruction::Load && I->hasOneUse() && | |||
1418 | isa<FPExtInst>(*I->user_begin())) || | |||
1419 | (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) { | |||
1420 | FixedVectorType *SrcVTy = cast<FixedVectorType>(Src); | |||
1421 | Type *DstTy = | |||
1422 | Opcode == Instruction::Load | |||
1423 | ? (*I->user_begin())->getType() | |||
1424 | : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType(); | |||
1425 | if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() && | |||
1426 | DstTy->getScalarType()->isFloatTy()) | |||
1427 | return ST->getMVEVectorCostFactor(CostKind); | |||
1428 | } | |||
1429 | ||||
1430 | int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() | |||
1431 | ? ST->getMVEVectorCostFactor(CostKind) | |||
1432 | : 1; | |||
1433 | return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
1434 | CostKind, I); | |||
1435 | } | |||
1436 | ||||
1437 | InstructionCost | |||
1438 | ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, | |||
1439 | unsigned AddressSpace, | |||
1440 | TTI::TargetCostKind CostKind) { | |||
1441 | if (ST->hasMVEIntegerOps()) { | |||
1442 | if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment)) | |||
1443 | return ST->getMVEVectorCostFactor(CostKind); | |||
1444 | if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment)) | |||
1445 | return ST->getMVEVectorCostFactor(CostKind); | |||
1446 | } | |||
1447 | if (!isa<FixedVectorType>(Src)) | |||
1448 | return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
1449 | CostKind); | |||
1450 | // Scalar cost, which is currently very high due to the inefficiency of the
1451 | // generated code. | |||
1452 | return cast<FixedVectorType>(Src)->getNumElements() * 8; | |||
1453 | } | |||
1454 | ||||
1455 | InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost( | |||
1456 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, | |||
1457 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | |||
1458 | bool UseMaskForCond, bool UseMaskForGaps) { | |||
1459 | assert(Factor >= 2 && "Invalid interleave factor");
1460 | assert(isa<VectorType>(VecTy) && "Expect a vector type");
1461 | ||||
1462 | // vldN/vstN doesn't support vector types of i64/f64 element. | |||
1463 | bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; | |||
1464 | ||||
1465 | if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && | |||
1466 | !UseMaskForCond && !UseMaskForGaps) { | |||
1467 | unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements(); | |||
1468 | auto *SubVecTy = | |||
1469 | FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); | |||
1470 | ||||
1471 | // vldN/vstN only support legal vector types of size 64 or 128 in bits. | |||
1472 | // Accesses having vector types that are a multiple of 128 bits can be | |||
1473 | // matched to more than one vldN/vstN instruction. | |||
1474 | int BaseCost = | |||
1475 | ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1; | |||
1476 | if (NumElts % Factor == 0 && | |||
1477 | TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL)) | |||
1478 | return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL); | |||
1479 | ||||
1480 | // Some smaller-than-legal interleaved patterns are cheap, as we can make
1481 | // use of the vmovn or vrev patterns to interleave a standard load. This is
1482 | // true for v4i8, v8i8 and v4i16 at least (but not for v4f16, as it is
1483 | // promoted differently). The cost of 2 here is then a load and vrev or
1484 | // vmovn.
1485 | if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 && | |||
1486 | VecTy->isIntOrIntVectorTy() && | |||
1487 | DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64) | |||
1488 | return 2 * BaseCost; | |||
1489 | } | |||
1490 | ||||
1491 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
1492 | Alignment, AddressSpace, CostKind, | |||
1493 | UseMaskForCond, UseMaskForGaps); | |||
1494 | } | |||
1495 | ||||
1496 | InstructionCost ARMTTIImpl::getGatherScatterOpCost( | |||
1497 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, | |||
1498 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { | |||
1499 | using namespace PatternMatch; | |||
1500 | if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters) | |||
1501 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, | |||
1502 | Alignment, CostKind, I); | |||
1503 | ||||
1504 | assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1505 | auto *VTy = cast<FixedVectorType>(DataTy); | |||
1506 | ||||
1507 | // TODO: Splitting, once we do that. | |||
1508 | ||||
1509 | unsigned NumElems = VTy->getNumElements(); | |||
1510 | unsigned EltSize = VTy->getScalarSizeInBits(); | |||
1511 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy); | |||
1512 | ||||
1513 | // For now, it is assumed that for the MVE gather instructions the loads are | |||
1514 | // all effectively serialised. This means the cost is the scalar cost | |||
1515 | // multiplied by the number of elements being loaded. This is possibly very | |||
1516 | // conservative, but even so we still end up vectorising loops because the | |||
1517 | // cost per iteration for many loops is lower than for scalar loops. | |||
1518 | InstructionCost VectorCost = | |||
1519 | NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind); | |||
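| // e.g. an already-legal v4i32 gather is costed as 4 serialised element
| // loads, scaled by the MVE cost factor.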
1520 | // The scalarization cost should be a lot higher. We use the number of vector | |||
1521 | // elements plus the scalarization overhead. | |||
1522 | InstructionCost ScalarCost = | |||
1523 | NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) + | |||
1524 | BaseT::getScalarizationOverhead(VTy, false, true); | |||
1525 | ||||
1526 | if (EltSize < 8 || Alignment < EltSize / 8) | |||
1527 | return ScalarCost; | |||
1528 | ||||
1529 | unsigned ExtSize = EltSize; | |||
1530 | // Check whether there's a single user that asks for an extended type | |||
1531 | if (I != nullptr) { | |||
1532 | // Depending on the caller of this function, a gather instruction will
1533 | // either have opcode Instruction::Load or be a call to the masked_gather
1534 | // intrinsic.
1535 | if ((I->getOpcode() == Instruction::Load || | |||
1536 | match(I, m_Intrinsic<Intrinsic::masked_gather>())) && | |||
1537 | I->hasOneUse()) { | |||
1538 | const User *Us = *I->users().begin(); | |||
1539 | if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) { | |||
1540 | // Only allow valid type combinations.
1541 | unsigned TypeSize = | |||
1542 | cast<Instruction>(Us)->getType()->getScalarSizeInBits(); | |||
1543 | if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) || | |||
1544 | (TypeSize == 16 && EltSize == 8)) && | |||
1545 | TypeSize * NumElems == 128) { | |||
1546 | ExtSize = TypeSize; | |||
1547 | } | |||
1548 | } | |||
1549 | } | |||
1550 | // Check whether the input data needs to be truncated | |||
1551 | TruncInst *T; | |||
1552 | if ((I->getOpcode() == Instruction::Store || | |||
1553 | match(I, m_Intrinsic<Intrinsic::masked_scatter>())) && | |||
1554 | (T = dyn_cast<TruncInst>(I->getOperand(0)))) { | |||
1555 | // Only allow valid type combinations | |||
1556 | unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits(); | |||
1557 | if (((EltSize == 16 && TypeSize == 32) || | |||
1558 | (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) && | |||
1559 | TypeSize * NumElems == 128) | |||
1560 | ExtSize = TypeSize; | |||
1561 | } | |||
1562 | } | |||
1563 | ||||
1564 | if (ExtSize * NumElems != 128 || NumElems < 4) | |||
1565 | return ScalarCost; | |||
1566 | ||||
1567 | // Any (aligned) i32 gather will not need to be scalarised. | |||
1568 | if (ExtSize == 32) | |||
1569 | return VectorCost; | |||
1570 | // For smaller types, we need to ensure that the gep's inputs are correctly | |||
1571 | // extended from a small enough value. Other sizes (including i64) are | |||
1572 | // scalarized for now. | |||
1573 | if (ExtSize != 8 && ExtSize != 16) | |||
1574 | return ScalarCost; | |||
1575 | ||||
1576 | if (const auto *BC = dyn_cast<BitCastInst>(Ptr)) | |||
1577 | Ptr = BC->getOperand(0); | |||
1578 | if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { | |||
1579 | if (GEP->getNumOperands() != 2) | |||
1580 | return ScalarCost; | |||
1581 | unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType()); | |||
1582 | // Scale needs to be correct (which is only relevant for i16s). | |||
1583 | if (Scale != 1 && Scale * 8 != ExtSize) | |||
1584 | return ScalarCost; | |||
1585 | // And we need to zext (not sext) the indexes from a small enough type. | |||
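| // e.g. for a v8i16 gather, offsets zext'd from <8 x i8> are acceptable
| // (8 <= 16), while sext'd or wider offsets fall back to the scalar cost.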
1586 | if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) { | |||
1587 | if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize) | |||
1588 | return VectorCost; | |||
1589 | } | |||
1590 | return ScalarCost; | |||
1591 | } | |||
1592 | return ScalarCost; | |||
1593 | } | |||
1594 | ||||
1595 | InstructionCost | |||
1596 | ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | |||
1597 | Optional<FastMathFlags> FMF, | |||
1598 | TTI::TargetCostKind CostKind) { | |||
1599 | if (TTI::requiresOrderedReduction(FMF)) | |||
1600 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
1601 | ||||
1602 | EVT ValVT = TLI->getValueType(DL, ValTy); | |||
1603 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1604 | if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) | |||
1605 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
1606 | ||||
1607 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
1608 | ||||
1609 | static const CostTblEntry CostTblAdd[]{ | |||
1610 | {ISD::ADD, MVT::v16i8, 1}, | |||
1611 | {ISD::ADD, MVT::v8i16, 1}, | |||
1612 | {ISD::ADD, MVT::v4i32, 1}, | |||
1613 | }; | |||
1614 | if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) | |||
1615 | return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first; | |||
1616 | ||||
1617 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
1618 | } | |||
1619 | ||||
1620 | InstructionCost | |||
1621 | ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, | |||
1622 | Type *ResTy, VectorType *ValTy, | |||
1623 | TTI::TargetCostKind CostKind) { | |||
1624 | EVT ValVT = TLI->getValueType(DL, ValTy); | |||
1625 | EVT ResVT = TLI->getValueType(DL, ResTy); | |||
1626 | ||||
1627 | if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { | |||
1628 | std::pair<InstructionCost, MVT> LT = | |||
1629 | TLI->getTypeLegalizationCost(DL, ValTy); | |||
1630 | ||||
1631 | // The legal cases are: | |||
1632 | // VADDV u/s 8/16/32 | |||
1633 | // VMLAV u/s 8/16/32 | |||
1634 | // VADDLV u/s 32 | |||
1635 | // VMLALV u/s 16/32 | |||
1636 | // Codegen currently cannot always handle larger than legal vectors very | |||
1637 | // well, especially for predicated reductions where the mask needs to be | |||
1638 | // split, so restrict to 128bit or smaller input types. | |||
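| // e.g. a v8i16 multiply-accumulate reduction to i64 maps onto VMLALV,
| // whereas a plain v8i16 add reduction to i64 does not (VADDLV only takes
| // 32-bit elements), matching the (IsMLA ? 64u : 32u) bound below.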
1639 | unsigned RevVTSize = ResVT.getSizeInBits(); | |||
1640 | if (ValVT.getSizeInBits() <= 128 && | |||
1641 | ((LT.second == MVT::v16i8 && RevVTSize <= 32) || | |||
1642 | (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) || | |||
1643 | (LT.second == MVT::v4i32 && RevVTSize <= 64))) | |||
1644 | return ST->getMVEVectorCostFactor(CostKind) * LT.first; | |||
1645 | } | |||
1646 | ||||
1647 | return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy, | |||
1648 | CostKind); | |||
1649 | } | |||
1650 | ||||
1651 | InstructionCost | |||
1652 | ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |||
1653 | TTI::TargetCostKind CostKind) { | |||
1654 | switch (ICA.getID()) { | |||
1655 | case Intrinsic::get_active_lane_mask: | |||
1656 | // Currently we make a somewhat optimistic assumption that | |||
1657 | // active_lane_mask's are always free. In reality it may be freely folded | |||
1658 | // into a tail predicated loop, expanded into a VCPT or expanded into a lot | |||
1659 | // of add/icmp code. We may need to improve this in the future, but being | |||
1660 | // able to detect if it is free or not involves looking at a lot of other | |||
1661 | // code. We currently assume that the vectorizer inserted these, and knew | |||
1662 | // what it was doing in adding one. | |||
1663 | if (ST->hasMVEIntegerOps()) | |||
1664 | return 0; | |||
1665 | break; | |||
1666 | case Intrinsic::sadd_sat: | |||
1667 | case Intrinsic::ssub_sat: | |||
1668 | case Intrinsic::uadd_sat: | |||
1669 | case Intrinsic::usub_sat: { | |||
1670 | if (!ST->hasMVEIntegerOps()) | |||
1671 | break; | |||
1672 | Type *VT = ICA.getReturnType(); | |||
1673 | ||||
1674 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT); | |||
1675 | if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || | |||
1676 | LT.second == MVT::v16i8) { | |||
1677 | // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we | |||
1678 | // need to extend the type, as it uses shr(qadd(shl, shl)). | |||
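| // e.g. a <4 x i16> sadd_sat promoted to v4i32 is emitted roughly as
| //   ashr(vqadd(shl(a, 16), shl(b, 16)), 16)
| // which is where the 1-vs-4 instruction count comes from.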
1679 | unsigned Instrs = | |||
1680 | LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4; | |||
1681 | return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs; | |||
1682 | } | |||
1683 | break; | |||
1684 | } | |||
1685 | case Intrinsic::abs: | |||
1686 | case Intrinsic::smin: | |||
1687 | case Intrinsic::smax: | |||
1688 | case Intrinsic::umin: | |||
1689 | case Intrinsic::umax: { | |||
1690 | if (!ST->hasMVEIntegerOps()) | |||
1691 | break; | |||
1692 | Type *VT = ICA.getReturnType(); | |||
1693 | ||||
1694 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT); | |||
1695 | if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || | |||
1696 | LT.second == MVT::v16i8) | |||
1697 | return LT.first * ST->getMVEVectorCostFactor(CostKind); | |||
1698 | break; | |||
1699 | } | |||
1700 | case Intrinsic::minnum: | |||
1701 | case Intrinsic::maxnum: { | |||
1702 | if (!ST->hasMVEFloatOps()) | |||
1703 | break; | |||
1704 | Type *VT = ICA.getReturnType(); | |||
1705 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, VT); | |||
1706 | if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) | |||
1707 | return LT.first * ST->getMVEVectorCostFactor(CostKind); | |||
1708 | break; | |||
1709 | } | |||
1710 | } | |||
1711 | ||||
1712 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
1713 | } | |||
1714 | ||||
1715 | bool ARMTTIImpl::isLoweredToCall(const Function *F) { | |||
1716 | if (!F->isIntrinsic()) | |||
1717 | return BaseT::isLoweredToCall(F);
1718 | ||||
1719 | // Assume all Arm-specific intrinsics map to an instruction. | |||
1720 | if (F->getName().startswith("llvm.arm")) | |||
1721 | return false; | |||
1722 | ||||
1723 | switch (F->getIntrinsicID()) { | |||
1724 | default: break; | |||
1725 | case Intrinsic::powi: | |||
1726 | case Intrinsic::sin: | |||
1727 | case Intrinsic::cos: | |||
1728 | case Intrinsic::pow: | |||
1729 | case Intrinsic::log: | |||
1730 | case Intrinsic::log10: | |||
1731 | case Intrinsic::log2: | |||
1732 | case Intrinsic::exp: | |||
1733 | case Intrinsic::exp2: | |||
1734 | return true; | |||
1735 | case Intrinsic::sqrt: | |||
1736 | case Intrinsic::fabs: | |||
1737 | case Intrinsic::copysign: | |||
1738 | case Intrinsic::floor: | |||
1739 | case Intrinsic::ceil: | |||
1740 | case Intrinsic::trunc: | |||
1741 | case Intrinsic::rint: | |||
1742 | case Intrinsic::nearbyint: | |||
1743 | case Intrinsic::round: | |||
1744 | case Intrinsic::canonicalize: | |||
1745 | case Intrinsic::lround: | |||
1746 | case Intrinsic::llround: | |||
1747 | case Intrinsic::lrint: | |||
1748 | case Intrinsic::llrint: | |||
1749 | if (F->getReturnType()->isDoubleTy() && !ST->hasFP64()) | |||
1750 | return true; | |||
1751 | if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16()) | |||
1752 | return true; | |||
1753 | // Some operations can be handled by vector instructions; assume that
1754 | // unsupported vectors will be expanded into supported scalar ones.
1755 | // TODO: Handle scalar operations properly.
1756 | return !ST->hasFPARMv8Base() && !ST->hasVFP2Base(); | |||
1757 | case Intrinsic::masked_store: | |||
1758 | case Intrinsic::masked_load: | |||
1759 | case Intrinsic::masked_gather: | |||
1760 | case Intrinsic::masked_scatter: | |||
1761 | return !ST->hasMVEIntegerOps(); | |||
1762 | case Intrinsic::sadd_with_overflow: | |||
1763 | case Intrinsic::uadd_with_overflow: | |||
1764 | case Intrinsic::ssub_with_overflow: | |||
1765 | case Intrinsic::usub_with_overflow: | |||
1766 | case Intrinsic::sadd_sat: | |||
1767 | case Intrinsic::uadd_sat: | |||
1768 | case Intrinsic::ssub_sat: | |||
1769 | case Intrinsic::usub_sat: | |||
1770 | return false; | |||
1771 | } | |||
1772 | ||||
1773 | return BaseT::isLoweredToCall(F); | |||
1774 | } | |||
1775 | ||||
1776 | bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) { | |||
1777 | unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode()); | |||
1778 | EVT VT = TLI->getValueType(DL, I.getType(), true); | |||
1779 | if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall) | |||
1780 | return true; | |||
1781 | ||||
1782 | // Check if an intrinsic will be lowered to a call and assume that any | |||
1783 | // other CallInst will generate a bl. | |||
1784 | if (auto *Call = dyn_cast<CallInst>(&I)) { | |||
1785 | if (auto *II = dyn_cast<IntrinsicInst>(Call)) { | |||
1786 | switch(II->getIntrinsicID()) { | |||
1787 | case Intrinsic::memcpy: | |||
1788 | case Intrinsic::memset: | |||
1789 | case Intrinsic::memmove: | |||
1790 | return getNumMemOps(II) == -1; | |||
1791 | default: | |||
1792 | if (const Function *F = Call->getCalledFunction()) | |||
1793 | return isLoweredToCall(F); | |||
1794 | } | |||
1795 | } | |||
1796 | return true; | |||
1797 | } | |||
1798 | ||||
1799 | // FPv5 provides conversions between integer, double-precision, | |||
1800 | // single-precision, and half-precision formats. | |||
1801 | switch (I.getOpcode()) { | |||
1802 | default: | |||
1803 | break; | |||
1804 | case Instruction::FPToSI: | |||
1805 | case Instruction::FPToUI: | |||
1806 | case Instruction::SIToFP: | |||
1807 | case Instruction::UIToFP: | |||
1808 | case Instruction::FPTrunc: | |||
1809 | case Instruction::FPExt: | |||
1810 | return !ST->hasFPARMv8Base(); | |||
1811 | } | |||
1812 | ||||
1813 | // FIXME: Unfortunately the approach of checking the Operation Action does | |||
1814 | // not catch all cases of Legalization that use library calls. Our | |||
1815 | // Legalization step categorizes some transformations into library calls as | |||
1816 | // Custom, Expand or even Legal when doing type legalization. So for now | |||
1817 | // we have to special-case, for instance, the SDIV of 64-bit integers and
1818 | // the use of floating-point emulation.
1819 | if (VT.isInteger() && VT.getSizeInBits() >= 64) { | |||
1820 | switch (ISD) { | |||
1821 | default: | |||
1822 | break; | |||
1823 | case ISD::SDIV: | |||
1824 | case ISD::UDIV: | |||
1825 | case ISD::SREM: | |||
1826 | case ISD::UREM: | |||
1827 | case ISD::SDIVREM: | |||
1828 | case ISD::UDIVREM: | |||
1829 | return true; | |||
1830 | } | |||
1831 | } | |||
1832 | ||||
1833 | // Assume all other non-float operations are supported. | |||
1834 | if (!VT.isFloatingPoint()) | |||
1835 | return false; | |||
1836 | ||||
1837 | // We'll need a library call to handle most floats when using soft-float.
1838 | if (TLI->useSoftFloat()) { | |||
1839 | switch (I.getOpcode()) { | |||
1840 | default: | |||
1841 | return true; | |||
1842 | case Instruction::Alloca: | |||
1843 | case Instruction::Load: | |||
1844 | case Instruction::Store: | |||
1845 | case Instruction::Select: | |||
1846 | case Instruction::PHI: | |||
1847 | return false; | |||
1848 | } | |||
1849 | } | |||
1850 | ||||
1851 | // We'll need a libcall to perform double-precision operations on a
1852 | // single-precision-only FPU.
1853 | if (I.getType()->isDoubleTy() && !ST->hasFP64()) | |||
1854 | return true; | |||
1855 | ||||
1856 | // Likewise for half precision arithmetic. | |||
1857 | if (I.getType()->isHalfTy() && !ST->hasFullFP16()) | |||
1858 | return true; | |||
1859 | ||||
1860 | return false; | |||
1861 | } | |||
1862 | ||||
1863 | bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, | |||
1864 | AssumptionCache &AC, | |||
1865 | TargetLibraryInfo *LibInfo, | |||
1866 | HardwareLoopInfo &HWLoopInfo) { | |||
1867 | // Low-overhead branches are only supported in the 'low-overhead branch' | |||
1868 | // extension of v8.1-m. | |||
1869 | if (!ST->hasLOB() || DisableLowOverheadLoops) { | |||
1870 | LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
1871 | return false; | |||
1872 | } | |||
1873 | ||||
1874 | if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { | |||
1875 | LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
1876 | return false; | |||
1877 | } | |||
1878 | ||||
1879 | const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); | |||
1880 | if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { | |||
1881 | LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
1882 | return false; | |||
1883 | } | |||
1884 | ||||
1885 | const SCEV *TripCountSCEV = | |||
1886 | SE.getAddExpr(BackedgeTakenCount, | |||
1887 | SE.getOne(BackedgeTakenCount->getType())); | |||
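| // i.e. TripCount = BackedgeTakenCount + 1: a loop whose backedge is taken
| // 99 times executes 100 iterations.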
1888 | ||||
1889 | // We need to store the trip count in LR, a 32-bit register. | |||
1890 | if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) { | |||
1891 | LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
1892 | return false; | |||
1893 | } | |||
1894 | ||||
1895 | // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little | |||
1896 | // point in generating a hardware loop if that's going to happen. | |||
1897 | ||||
1898 | auto IsHardwareLoopIntrinsic = [](Instruction &I) { | |||
1899 | if (auto *Call = dyn_cast<IntrinsicInst>(&I)) { | |||
1900 | switch (Call->getIntrinsicID()) { | |||
1901 | default: | |||
1902 | break; | |||
1903 | case Intrinsic::start_loop_iterations: | |||
1904 | case Intrinsic::test_start_loop_iterations: | |||
1905 | case Intrinsic::loop_decrement: | |||
1906 | case Intrinsic::loop_decrement_reg: | |||
1907 | return true; | |||
1908 | } | |||
1909 | } | |||
1910 | return false; | |||
1911 | }; | |||
1912 | ||||
1913 | // Scan the instructions to see if there's any that we know will turn into a | |||
1914 | // call or if this loop is already a low-overhead loop or will become a tail | |||
1915 | // predicated loop. | |||
1916 | bool IsTailPredLoop = false; | |||
1917 | auto ScanLoop = [&](Loop *L) { | |||
1918 | for (auto *BB : L->getBlocks()) { | |||
1919 | for (auto &I : *BB) { | |||
1920 | if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) || | |||
1921 | isa<InlineAsm>(I)) { | |||
1922 | LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
1923 | return false; | |||
1924 | } | |||
1925 | if (auto *II = dyn_cast<IntrinsicInst>(&I)) | |||
1926 | IsTailPredLoop |= | |||
1927 | II->getIntrinsicID() == Intrinsic::get_active_lane_mask || | |||
1928 | II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 || | |||
1929 | II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 || | |||
1930 | II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 || | |||
1931 | II->getIntrinsicID() == Intrinsic::arm_mve_vctp64; | |||
1932 | } | |||
1933 | } | |||
1934 | return true; | |||
1935 | }; | |||
1936 | ||||
1937 | // Visit inner loops. | |||
1938 | for (auto Inner : *L) | |||
1939 | if (!ScanLoop(Inner)) | |||
1940 | return false; | |||
1941 | ||||
1942 | if (!ScanLoop(L)) | |||
1943 | return false; | |||
1944 | ||||
1945 | // TODO: Check whether the trip count calculation is expensive. If L is the | |||
1946 | // inner loop but we know it has a low trip count, calculating that trip | |||
1947 | // count (in the parent loop) may be detrimental. | |||
1948 | ||||
1949 | LLVMContext &C = L->getHeader()->getContext(); | |||
1950 | HWLoopInfo.CounterInReg = true; | |||
1951 | HWLoopInfo.IsNestingLegal = false; | |||
1952 | HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop; | |||
1953 | HWLoopInfo.CountType = Type::getInt32Ty(C); | |||
1954 | HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1); | |||
1955 | return true; | |||
1956 | } | |||
1957 | ||||
1958 | static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { | |||
1959 | // We don't allow icmps, and because we only look at single-block loops,
1960 | // we simply count the icmps, i.e. there should only be 1 for the backedge. | |||
1961 | if (isa<ICmpInst>(&I) && ++ICmpCount > 1) | |||
1962 | return false; | |||
1963 | ||||
1964 | if (isa<FCmpInst>(&I)) | |||
1965 | return false; | |||
1966 | ||||
1967 | // We could allow extending/narrowing FP loads/stores, but codegen is | |||
1968 | // too inefficient so reject this for now. | |||
1969 | if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I)) | |||
1970 | return false; | |||
1971 | ||||
1972 | // Extends have to be extending loads.
1973 | if (isa<SExtInst>(&I) || isa<ZExtInst>(&I))
1974 | if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0))) | |||
1975 | return false; | |||
1976 | ||||
1977 | // Truncs have to be narrowing-stores | |||
1978 | if (isa<TruncInst>(&I) ) | |||
1979 | if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin())) | |||
1980 | return false; | |||
1981 | ||||
1982 | return true; | |||
1983 | } | |||
1984 | ||||
1985 | // To set up a tail-predicated loop, we need to know the total number of | |||
1986 | // elements processed by that loop. Thus, we need to determine the element | |||
1987 | // size and: | |||
1988 | // 1) it should be uniform for all operations in the vector loop, so we | |||
1989 | // e.g. don't want any widening/narrowing operations. | |||
1990 | // 2) it should be smaller than i64s because we don't have vector operations | |||
1991 | // that work on i64s. | |||
1992 | // 3) we don't want elements to be reversed or shuffled, to make sure the | |||
1993 | // tail-predication masks/predicates the right lanes. | |||
1994 | // | |||
1995 | static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, | |||
1996 | const DataLayout &DL, | |||
1997 | const LoopAccessInfo *LAI) { | |||
1998 | LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
1999 | ||||
2000 | // If there are live-out values, it is probably a reduction. We can predicate | |||
2001 | // most reduction operations freely under MVE using a combination of | |||
2002 | // prefer-predicated-reduction-select and inloop reductions. We limit this to | |||
2003 | // floating point and integer reductions, but don't check for operators | |||
2004 | // specifically here. If the value ends up not being a reduction (and so the | |||
2005 | // vectorizer cannot tailfold the loop), we should fall back to standard | |||
2006 | // vectorization automatically. | |||
2007 | SmallVector<Instruction *, 8> LiveOuts =
2008 |     llvm::findDefsUsedOutsideOfLoop(L);
2009 | bool ReductionsDisabled = | |||
2010 | EnableTailPredication == TailPredication::EnabledNoReductions || | |||
2011 | EnableTailPredication == TailPredication::ForceEnabledNoReductions; | |||
2012 | ||||
2013 | for (auto *I : LiveOuts) { | |||
2014 | if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() && | |||
2015 | !I->getType()->isHalfTy()) { | |||
2016 | LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2017 |                      "live-out value\n");
2018 | return false; | |||
2019 | } | |||
2020 | if (ReductionsDisabled) { | |||
2021 | LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2022 | return false; | |||
2023 | } | |||
2024 | } | |||
2025 | ||||
2026 | // Next, check that all instructions can be tail-predicated. | |||
2027 | PredicatedScalarEvolution PSE = LAI->getPSE(); | |||
2028 | SmallVector<Instruction *, 16> LoadStores; | |||
2029 | int ICmpCount = 0; | |||
2030 | ||||
2031 | for (BasicBlock *BB : L->blocks()) { | |||
2032 | for (Instruction &I : BB->instructionsWithoutDebug()) { | |||
2033 | if (isa<PHINode>(&I)) | |||
2034 | continue; | |||
2035 | if (!canTailPredicateInstruction(I, ICmpCount)) { | |||
2036 | LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2037 | return false; | |||
2038 | } | |||
2039 | ||||
2040 | Type *T = I.getType(); | |||
2041 | if (T->isPointerTy()) | |||
2042 | T = T->getPointerElementType(); | |||
2043 | ||||
2044 | if (T->getScalarSizeInBits() > 32) { | |||
2045 | LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2046 | return false; | |||
2047 | } | |||
2048 | if (isa<StoreInst>(I) || isa<LoadInst>(I)) { | |||
2049 | Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1); | |||
2050 | int64_t NextStride = getPtrStride(PSE, Ptr, L); | |||
2051 | if (NextStride == 1) { | |||
2052 | // TODO: for now only allow consecutive strides of 1. We could support
2053 | // other strides as long as they are uniform, but let's keep it simple
2054 | // for now.
2055 | continue; | |||
2056 | } else if (NextStride == -1 || | |||
2057 | (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) || | |||
2058 | (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) { | |||
2059 | LLVM_DEBUG(dbgs()
2060 |            << "Consecutive strides of 2 found, vld2/vst2 can't "
2061 |               "be tail-predicated.\n");
2062 | return false; | |||
2063 | // TODO: don't tail predicate if there is a reversed load? | |||
2064 | } else if (EnableMaskedGatherScatters) { | |||
2065 | // Gather/scatters do allow loading from arbitrary strides, at | |||
2066 | // least if they are loop invariant. | |||
2067 | // TODO: Loop variant strides should in theory work, too, but | |||
2068 | // this requires further testing. | |||
2069 | const SCEV *PtrScev = | |||
2070 | replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr); | |||
2071 | if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) { | |||
2072 | const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); | |||
2073 | if (PSE.getSE()->isLoopInvariant(Step, L)) | |||
2074 | continue; | |||
2075 | } | |||
2076 | } | |||
2077 | LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2078 |                      "tail-predicate.\n");
2079 | return false; | |||
2080 | } | |||
2081 | } | |||
2082 | } | |||
2083 | ||||
2084 | LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2085 | return true; | |||
2086 | } | |||
2087 | ||||
2088 | bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, | |||
2089 | ScalarEvolution &SE, | |||
2090 | AssumptionCache &AC, | |||
2091 | TargetLibraryInfo *TLI, | |||
2092 | DominatorTree *DT, | |||
2093 | const LoopAccessInfo *LAI) { | |||
2094 | if (!EnableTailPredication) { | |||
2095 | LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
2096 | return false; | |||
2097 | } | |||
2098 | ||||
2099 | // Creating a predicated vector loop is the first step for generating a | |||
2100 | // tail-predicated hardware loop, for which we need the MVE masked | |||
2101 | // load/store instructions:
2102 | if (!ST->hasMVEIntegerOps()) | |||
2103 | return false; | |||
2104 | ||||
2105 | // For now, restrict this to single block loops. | |||
2106 | if (L->getNumBlocks() > 1) { | |||
2107 | LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
2108 |                      "loop.\n");
2109 | return false; | |||
2110 | } | |||
2111 | ||||
2112 | assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
2113 | ||||
2114 | HardwareLoopInfo HWLoopInfo(L); | |||
2115 | if (!HWLoopInfo.canAnalyze(*LI)) { | |||
2116 | LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2117 |                      "analyzable.\n");
2118 | return false; | |||
2119 | } | |||
2120 | ||||
2121 | // This checks if we have the low-overhead branch architecture | |||
2122 | // extension, and if we will create a hardware-loop: | |||
2123 | if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) { | |||
2124 | LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2125 |                      "profitable.\n");
2126 | return false; | |||
2127 | } | |||
2128 | ||||
2129 | if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) { | |||
2130 | LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
2131 |                      "a candidate.\n");
2132 | return false; | |||
2133 | } | |||
2134 | ||||
2135 | return canTailPredicateLoop(L, LI, SE, DL, LAI); | |||
2136 | } | |||
2137 | ||||
2138 | bool ARMTTIImpl::emitGetActiveLaneMask() const { | |||
2139 | if (!ST->hasMVEIntegerOps() || !EnableTailPredication) | |||
2140 | return false; | |||
2141 | ||||
2142 | // Intrinsic @llvm.get.active.lane.mask is supported. | |||
2143 | // It is used in the MVETailPredication pass, which requires the number of | |||
2144 | // elements processed by this vector loop to set up the tail-predicated
2145 | // loop. | |||
2146 | return true; | |||
2147 | } | |||
2148 | void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, | |||
2149 | TTI::UnrollingPreferences &UP, | |||
2150 | OptimizationRemarkEmitter *ORE) { | |||
2151 | // Enable upper-bound unrolling universally, not dependent upon the conditions
2152 | // below. | |||
2153 | UP.UpperBound = true; | |||
2154 | ||||
2155 | // Currently, only enable these preferences for M-Class cores.
2156 | if (!ST->isMClass()) | |||
2157 | return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE); | |||
2158 | ||||
2159 | // Disable loop unrolling for Oz and Os. | |||
2160 | UP.OptSizeThreshold = 0; | |||
2161 | UP.PartialOptSizeThreshold = 0; | |||
2162 | if (L->getHeader()->getParent()->hasOptSize()) | |||
2163 | return; | |||
2164 | ||||
2165 | SmallVector<BasicBlock*, 4> ExitingBlocks; | |||
2166 | L->getExitingBlocks(ExitingBlocks); | |||
2167 | LLVM_DEBUG(dbgs() << "Loop has:\n"
2168 |                   << "Blocks: " << L->getNumBlocks() << "\n"
2169 |                   << "Exit blocks: " << ExitingBlocks.size() << "\n");
2170 | ||||
2171 | // Allow at most one exit other than the latch. This acts as an early exit
2172 | // as it mirrors the profitability calculation of the runtime unroller. | |||
2173 | if (ExitingBlocks.size() > 2) | |||
2174 | return; | |||
2175 | ||||
2176 | // Limit the CFG of the loop body for targets with a branch predictor. | |||
2177 | // Allowing 4 blocks permits if-then-else diamonds in the body. | |||
2178 | if (ST->hasBranchPredictor() && L->getNumBlocks() > 4) | |||
2179 | return; | |||
2180 | ||||
2181 | // Don't unroll vectorized loops, including the remainder loop | |||
2182 | if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) | |||
2183 | return; | |||
2184 | ||||
2185 | // Scan the loop: don't unroll loops with calls as this could prevent | |||
2186 | // inlining. | |||
2187 | InstructionCost Cost = 0; | |||
2188 | for (auto *BB : L->getBlocks()) { | |||
2189 | for (auto &I : *BB) { | |||
2190 | // Don't unroll vectorised loops. MVE does not benefit from it as much as
2191 | // scalar code does.
2192 | if (I.getType()->isVectorTy()) | |||
2193 | return; | |||
2194 | ||||
2195 | if (isa<CallInst>(I) || isa<InvokeInst>(I)) { | |||
2196 | if (const Function *F = cast<CallBase>(I).getCalledFunction()) { | |||
2197 | if (!isLoweredToCall(F)) | |||
2198 | continue; | |||
2199 | } | |||
2200 | return; | |||
2201 | } | |||
2202 | ||||
2203 | SmallVector<const Value*, 4> Operands(I.operand_values()); | |||
2204 | Cost += | |||
2205 | getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency); | |||
2206 | } | |||
2207 | } | |||
2208 | ||||
2209 | // On v6m cores, there are very few registers available. We can easily end up | |||
2210 | // spilling and reloading more registers in an unrolled loop. Look at the | |||
2211 | // number of LCSSA phis as a rough measure of how many registers will need to | |||
2212 | // be live out of the loop, reducing the default unroll count if more than 1 | |||
2213 | // value is needed. In the long run, all of this should be learnt by a
2214 | // machine. | |||
2215 | unsigned UnrollCount = 4; | |||
2216 | if (ST->isThumb1Only()) { | |||
2217 | unsigned ExitingValues = 0; | |||
2218 | SmallVector<BasicBlock *, 4> ExitBlocks; | |||
2219 | L->getExitBlocks(ExitBlocks); | |||
2220 | for (auto *Exit : ExitBlocks) { | |||
2221 | // Count the number of LCSSA phis. Exclude values coming from GEPs, as
2222 | // only the last is expected to be needed for address operands. | |||
2223 | unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) { | |||
2224 | return PH.getNumOperands() != 1 || | |||
2225 | !isa<GetElementPtrInst>(PH.getOperand(0)); | |||
2226 | }); | |||
2227 | ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues; | |||
2228 | } | |||
2229 | if (ExitingValues) | |||
2230 | UnrollCount /= ExitingValues; | |||
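| // e.g. two live-out values halve the default count to 2; three or more
| // reduce it to 1, and the check below then gives up on unrolling.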
2231 | if (UnrollCount <= 1) | |||
2232 | return; | |||
2233 | } | |||
2234 | ||||
2235 | LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2236 | LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2237 | ||||
2238 | UP.Partial = true; | |||
2239 | UP.Runtime = true; | |||
2240 | UP.UnrollRemainder = true; | |||
2241 | UP.DefaultUnrollRuntimeCount = UnrollCount; | |||
2242 | UP.UnrollAndJam = true; | |||
2243 | UP.UnrollAndJamInnerLoopThreshold = 60; | |||
2244 | ||||
2245 | // Forcing unrolling of small loops can be very useful because of the
2246 | // branch-taken cost of the backedge.
2247 | if (Cost < 12) | |||
2248 | UP.Force = true; | |||
2249 | } | |||
2250 | ||||
2251 | void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, | |||
2252 | TTI::PeelingPreferences &PP) { | |||
2253 | BaseT::getPeelingPreferences(L, SE, PP); | |||
2254 | } | |||
2255 | ||||
2256 | bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty, | |||
2257 | TTI::ReductionFlags Flags) const { | |||
2258 | if (!ST->hasMVEIntegerOps()) | |||
2259 | return false; | |||
2260 | ||||
2261 | unsigned ScalarBits = Ty->getScalarSizeInBits(); | |||
2262 | switch (Opcode) { | |||
2263 | case Instruction::Add: | |||
2264 | return ScalarBits <= 64; | |||
2265 | default: | |||
2266 | return false; | |||
2267 | } | |||
2268 | } | |||
2269 | ||||
2270 | bool ARMTTIImpl::preferPredicatedReductionSelect( | |||
2271 | unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { | |||
2272 | if (!ST->hasMVEIntegerOps()) | |||
2273 | return false; | |||
2274 | return true; | |||
2275 | } |
1 | //===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | // This file contains the ARM addressing mode implementation stuff. | |||
10 | // | |||
11 | //===----------------------------------------------------------------------===// | |||
12 | ||||
13 | #ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H | |||
14 | #define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H | |||
15 | ||||
16 | #include "llvm/ADT/APFloat.h" | |||
17 | #include "llvm/ADT/APInt.h" | |||
18 | #include "llvm/ADT/bit.h" | |||
19 | #include "llvm/Support/ErrorHandling.h" | |||
20 | #include "llvm/Support/MathExtras.h" | |||
21 | #include <cassert> | |||
22 | ||||
23 | namespace llvm { | |||
24 | ||||
25 | /// ARM_AM - ARM Addressing Mode Stuff | |||
26 | namespace ARM_AM { | |||
27 | enum ShiftOpc { | |||
28 | no_shift = 0, | |||
29 | asr, | |||
30 | lsl, | |||
31 | lsr, | |||
32 | ror, | |||
33 | rrx, | |||
34 | uxtw | |||
35 | }; | |||
36 | ||||
37 | enum AddrOpc { | |||
38 | sub = 0, | |||
39 | add | |||
40 | }; | |||
41 | ||||
42 | inline const char *getAddrOpcStr(AddrOpc Op) { return Op == sub ? "-" : ""; } | |||
43 | ||||
44 | inline const char *getShiftOpcStr(ShiftOpc Op) { | |||
45 | switch (Op) { | |||
46 | default: llvm_unreachable("Unknown shift opc!"); | |||
47 | case ARM_AM::asr: return "asr"; | |||
48 | case ARM_AM::lsl: return "lsl"; | |||
49 | case ARM_AM::lsr: return "lsr"; | |||
50 | case ARM_AM::ror: return "ror"; | |||
51 | case ARM_AM::rrx: return "rrx"; | |||
52 | case ARM_AM::uxtw: return "uxtw"; | |||
53 | } | |||
54 | } | |||
55 | ||||
56 | inline unsigned getShiftOpcEncoding(ShiftOpc Op) { | |||
57 | switch (Op) { | |||
58 | default: llvm_unreachable("Unknown shift opc!"); | |||
59 | case ARM_AM::asr: return 2; | |||
60 | case ARM_AM::lsl: return 0; | |||
61 | case ARM_AM::lsr: return 1; | |||
62 | case ARM_AM::ror: return 3; | |||
63 | } | |||
64 | } | |||
65 | ||||
66 | enum AMSubMode { | |||
67 | bad_am_submode = 0, | |||
68 | ia, | |||
69 | ib, | |||
70 | da, | |||
71 | db | |||
72 | }; | |||
73 | ||||
74 | inline const char *getAMSubModeStr(AMSubMode Mode) { | |||
75 | switch (Mode) { | |||
76 | default: llvm_unreachable("Unknown addressing sub-mode!"); | |||
77 | case ARM_AM::ia: return "ia"; | |||
78 | case ARM_AM::ib: return "ib"; | |||
79 | case ARM_AM::da: return "da"; | |||
80 | case ARM_AM::db: return "db"; | |||
81 | } | |||
82 | } | |||
83 | ||||
84 | /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits. | |||
85 | /// | |||
86 | inline unsigned rotr32(unsigned Val, unsigned Amt) { | |||
87 | assert(Amt < 32 && "Invalid rotate amount"); | |||
88 | return (Val >> Amt) | (Val << ((32-Amt)&31)); | |||
89 | } | |||
90 | ||||
91 | /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits. | |||
92 | /// | |||
93 | inline unsigned rotl32(unsigned Val, unsigned Amt) { | |||
94 | assert(Amt < 32 && "Invalid rotate amount"); | |||
95 | return (Val << Amt) | (Val >> ((32-Amt)&31)); | |||
96 | } | |||
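 | // For illustration (a worked example, not part of the original header): | |||
 | //   rotr32(0x000000F0, 8) == 0xF0000000 | |||
 | //   rotl32(0xF0000000, 8) == 0x000000F0 | |||
 | // so rotl32(rotr32(V, A), A) == V for any A < 32. | |||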
97 | ||||
98 | //===--------------------------------------------------------------------===// | |||
99 | // Addressing Mode #1: shift_operand with registers | |||
100 | //===--------------------------------------------------------------------===// | |||
101 | // | |||
102 | // This 'addressing mode' is used for arithmetic instructions. It can | |||
103 | // represent things like: | |||
104 | // reg | |||
105 | // reg [asr|lsl|lsr|ror|rrx] reg | |||
106 | // reg [asr|lsl|lsr|ror|rrx] imm | |||
107 | // | |||
108 | // This is stored as three operands [rega, regb, opc]. The first is the base | |||
109 | // reg, the second is the shift-amount reg (or reg0 if absent or immediate). | |||
110 | // The third operand encodes the shift opcode and the imm if a reg isn't present. | |||
111 | // | |||
112 | inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) { | |||
113 | return ShOp | (Imm << 3); | |||
114 | } | |||
115 | inline unsigned getSORegOffset(unsigned Op) { return Op >> 3; } | |||
116 | inline ShiftOpc getSORegShOp(unsigned Op) { return (ShiftOpc)(Op & 7); } | |||
117 | ||||
118 | /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return | |||
119 | /// the 8-bit imm value. | |||
120 | inline unsigned getSOImmValImm(unsigned Imm) { return Imm & 0xFF; } | |||
121 | /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return | |||
122 | /// the rotate amount. | |||
123 | inline unsigned getSOImmValRot(unsigned Imm) { return (Imm >> 8) * 2; } | |||
124 | ||||
125 | /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand, | |||
126 | /// computing the rotate amount to use. If this immediate value cannot be | |||
127 | /// handled with a single shifter-op, determine a good rotate amount that will | |||
128 | /// take a maximal chunk of bits out of the immediate. | |||
129 | inline unsigned getSOImmValRotate(unsigned Imm) { | |||
130 | // 8-bit (or less) immediates are trivially shifter_operands with a rotate | |||
131 | // of zero. | |||
132 | if ((Imm & ~255U) == 0) return 0; | |||
133 | ||||
134 | // Use CTZ to compute the rotate amount. | |||
135 | unsigned TZ = countTrailingZeros(Imm); | |||
136 | ||||
137 | // Rotate amount must be even. Something like 0x200 must be rotated 8 bits, | |||
138 | // not 9. | |||
139 | unsigned RotAmt = TZ & ~1; | |||
140 | ||||
141 | // If we can handle this spread, return it. | |||
142 | if ((rotr32(Imm, RotAmt) & ~255U) == 0) | |||
143 | return (32-RotAmt)&31; // HW rotates right, not left. | |||
144 | ||||
145 | // For values like 0xF000000F, we should ignore the low 6 bits, then | |||
146 | // retry the hunt. | |||
147 | if (Imm & 63U) { | |||
148 | unsigned TZ2 = countTrailingZeros(Imm & ~63U); | |||
149 | unsigned RotAmt2 = TZ2 & ~1; | |||
150 | if ((rotr32(Imm, RotAmt2) & ~255U) == 0) | |||
151 | return (32-RotAmt2)&31; // HW rotates right, not left. | |||
152 | } | |||
153 | ||||
154 | // Otherwise, we have no way to cover this span of bits with a single | |||
155 | // shifter_op immediate. Return a chunk of bits that will be useful to | |||
156 | // handle. | |||
157 | return (32-RotAmt)&31; // HW rotates right, not left. | |||
158 | } | |||
159 | ||||
160 | /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit | |||
161 | /// into a shifter_operand immediate operand, return the 12-bit encoding for | |||
162 | /// it. If not, return -1. | |||
163 | inline int getSOImmVal(unsigned Arg) { | |||
164 | // 8-bit (or less) immediates are trivially shifter_operands with a rotate | |||
165 | // of zero. | |||
166 | if ((Arg & ~255U) == 0) return Arg; | |||
167 | ||||
168 | unsigned RotAmt = getSOImmValRotate(Arg); | |||
169 | ||||
170 | // If this cannot be handled with a single shifter_op, bail out. | |||
171 | if (rotr32(~255U, RotAmt) & Arg) | |||
172 | return -1; | |||
173 | ||||
174 | // Encode this correctly. | |||
175 | return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8); | |||
176 | } | |||
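 | // Worked example (illustrative, not in the original header): | |||
 | //   getSOImmVal(0xF000): getSOImmValRotate(0xF000) == 20, and | |||
 | //   rotl32(0xF000, 20) == 0x0F, so the encoding is 0x0F | ((20 >> 1) << 8) | |||
 | //   == 0xA0F. Decoding: rotr32(0x0F, (0xA0F >> 8) * 2) == rotr32(0x0F, 20) | |||
 | //   == 0xF000, recovering the value. | |||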
177 | ||||
178 | /// isSOImmTwoPartVal - Return true if the specified value can be obtained by | |||
179 | /// or'ing together two SOImmVal's. | |||
180 | inline bool isSOImmTwoPartVal(unsigned V) { | |||
181 | // If this can be handled with a single shifter_op, bail out. | |||
182 | V = rotr32(~255U, getSOImmValRotate(V)) & V; | |||
183 | if (V == 0) | |||
184 | return false; | |||
185 | ||||
186 | // If this can be handled with two shifter_op's, accept. | |||
187 | V = rotr32(~255U, getSOImmValRotate(V)) & V; | |||
188 | return V == 0; | |||
189 | } | |||
190 | ||||
191 | /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal, | |||
192 | /// return the first chunk of it. | |||
193 | inline unsigned getSOImmTwoPartFirst(unsigned V) { | |||
194 | return rotr32(255U, getSOImmValRotate(V)) & V; | |||
195 | } | |||
196 | ||||
197 | /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal, | |||
198 | /// return the second chunk of it. | |||
199 | inline unsigned getSOImmTwoPartSecond(unsigned V) { | |||
200 | // Mask out the first hunk. | |||
201 | V = rotr32(~255U, getSOImmValRotate(V)) & V; | |||
202 | ||||
203 | // Take what's left. | |||
204 | assert(V == (rotr32(255U, getSOImmValRotate(V)) & V)); | |||
205 | return V; | |||
206 | } | |||
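 | // Worked example (illustrative): V == 0x00FF00FF is not a single SOImmVal, | |||
 | // but isSOImmTwoPartVal(V) holds: getSOImmTwoPartFirst(V) == 0x000000FF and | |||
 | // getSOImmTwoPartSecond(V) == 0x00FF0000 (0xFF rotated right by 16), so an | |||
 | // ORR of the two parts rebuilds V. | |||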
207 | ||||
208 | /// isSOImmTwoPartValNeg - Return true if the specified value can be obtained | |||
209 | /// by two SOImmVals such that -V = First + Second. | |||
210 | /// "R+V" can be optimized to (sub (sub R, First), Second). | |||
211 | /// "R=V" can be optimized to (sub (mvn R, ~(-First)), Second). | |||
212 | inline bool isSOImmTwoPartValNeg(unsigned V) { | |||
213 | unsigned First; | |||
214 | if (!isSOImmTwoPartVal(-V)) | |||
215 | return false; | |||
216 | // Return false if ~(-First) is not an SOImmVal. | |||
217 | First = getSOImmTwoPartFirst(-V); | |||
218 | First = ~(-First); | |||
219 | return !(rotr32(~255U, getSOImmValRotate(First)) & First); | |||
220 | } | |||
221 | ||||
222 | /// getThumbImmValShift - Try to handle Imm with an 8-bit immediate followed | |||
223 | /// by a left shift. Returns the shift amount to use. | |||
224 | inline unsigned getThumbImmValShift(unsigned Imm) { | |||
225 | // 8-bit (or less) immediates are trivially immediate operands with a shift | |||
226 | // of zero. | |||
227 | if ((Imm & ~255U) == 0) return 0; | |||
228 | ||||
229 | // Use CTZ to compute the shift amount. | |||
230 | return countTrailingZeros(Imm); | |||
231 | } | |||
232 | ||||
233 | /// isThumbImmShiftedVal - Return true if the specified value can be obtained | |||
234 | /// by left-shifting an 8-bit immediate. | |||
235 | inline bool isThumbImmShiftedVal(unsigned V) { | |||
236 | // If V can be produced by left-shifting an 8-bit immediate, this mask is 0. | |||
237 | V = (~255U << getThumbImmValShift(V)) & V; | |||
238 | return V == 0; | |||
239 | } | |||
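 | // Example (illustrative): isThumbImmShiftedVal(0x1200) is true, since | |||
 | // 0x1200 == 0x09 << 9; isThumbImmShiftedVal(0x101) is false. Note on the | |||
 | // analyzer warning for line 237: when V has bits above the low byte, | |||
 | // getThumbImmValShift(V) is countTrailingZeros of a nonzero value, which is | |||
 | // always < 32, so the shift above stays defined on every feasible path. | |||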
240 | ||||
241 | /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed | |||
242 | /// by a left shift. Returns the shift amount to use. | |||
243 | inline unsigned getThumbImm16ValShift(unsigned Imm) { | |||
244 | // 16-bit (or less) immediates are trivially immediate operands with a shift | |||
245 | // of zero. | |||
246 | if ((Imm & ~65535U) == 0) return 0; | |||
247 | ||||
248 | // Use CTZ to compute the shift amount. | |||
249 | return countTrailingZeros(Imm); | |||
250 | } | |||
251 | ||||
252 | /// isThumbImm16ShiftedVal - Return true if the specified value can be | |||
253 | /// obtained by left shifting a 16-bit immediate. | |||
254 | inline bool isThumbImm16ShiftedVal(unsigned V) { | |||
255 | // If V can be produced by left-shifting a 16-bit immediate, this mask is 0. | |||
256 | V = (~65535U << getThumbImm16ValShift(V)) & V; | |||
257 | return V == 0; | |||
258 | } | |||
259 | ||||
260 | /// getThumbImmNonShiftedVal - If V is a value that satisfies | |||
261 | /// isThumbImmShiftedVal, return the non-shifted value. | |||
262 | inline unsigned getThumbImmNonShiftedVal(unsigned V) { | |||
263 | return V >> getThumbImmValShift(V); | |||
264 | } | |||
265 | ||||
266 | ||||
267 | /// getT2SOImmValSplatVal - Return the 12-bit encoded representation | |||
268 | /// if the specified value can be obtained by splatting the low 8 bits | |||
269 | /// into every other byte or every byte of a 32-bit value. i.e., | |||
270 | /// 00000000 00000000 00000000 abcdefgh control = 0 | |||
271 | /// 00000000 abcdefgh 00000000 abcdefgh control = 1 | |||
272 | /// abcdefgh 00000000 abcdefgh 00000000 control = 2 | |||
273 | /// abcdefgh abcdefgh abcdefgh abcdefgh control = 3 | |||
274 | /// Return -1 if none of the above apply. | |||
275 | /// See ARM Reference Manual A6.3.2. | |||
276 | inline int getT2SOImmValSplatVal(unsigned V) { | |||
277 | unsigned u, Vs, Imm; | |||
278 | // control = 0 | |||
279 | if ((V & 0xffffff00) == 0) | |||
280 | return V; | |||
281 | ||||
282 | // If the low byte of the value is zero, just shift it off. | |||
283 | Vs = ((V & 0xff) == 0) ? V >> 8 : V; | |||
284 | // Any passing value only has 8 bits of payload, splatted across the word | |||
285 | Imm = Vs & 0xff; | |||
286 | // Likewise, any passing values have the payload splatted into the 3rd byte | |||
287 | u = Imm | (Imm << 16); | |||
288 | ||||
289 | // control = 1 or 2 | |||
290 | if (Vs == u) | |||
291 | return (((Vs == V) ? 1 : 2) << 8) | Imm; | |||
292 | ||||
293 | // control = 3 | |||
294 | if (Vs == (u | (u << 8))) | |||
295 | return (3 << 8) | Imm; | |||
296 | ||||
297 | return -1; | |||
298 | } | |||
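 | // Worked examples (illustrative) for the four control values: | |||
 | //   getT2SOImmValSplatVal(0x000000AB) == 0x0AB  (control = 0) | |||
 | //   getT2SOImmValSplatVal(0x00AB00AB) == 0x1AB  (control = 1) | |||
 | //   getT2SOImmValSplatVal(0xAB00AB00) == 0x2AB  (control = 2) | |||
 | //   getT2SOImmValSplatVal(0xABABABAB) == 0x3AB  (control = 3) | |||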
299 | ||||
300 | /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the | |||
301 | /// specified value is a rotated 8-bit value. Return -1 if no rotation | |||
302 | /// encoding is possible. | |||
303 | /// See ARM Reference Manual A6.3.2. | |||
304 | inline int getT2SOImmValRotateVal(unsigned V) { | |||
305 | unsigned RotAmt = countLeadingZeros(V); | |||
306 | if (RotAmt >= 24) | |||
307 | return -1; | |||
308 | ||||
309 | // If 'V' can be handled with a single shifter_op, return the value. | |||
310 | if ((rotr32(0xff000000U, RotAmt) & V) == V) | |||
311 | return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7); | |||
312 | ||||
313 | return -1; | |||
314 | } | |||
315 | ||||
316 | /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit | |||
317 | /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit | |||
318 | /// encoding for it. If not, return -1. | |||
319 | /// See ARM Reference Manual A6.3.2. | |||
320 | inline int getT2SOImmVal(unsigned Arg) { | |||
321 | // If 'Arg' is an 8-bit splat, then get the encoded value. | |||
322 | int Splat = getT2SOImmValSplatVal(Arg); | |||
323 | if (Splat != -1) | |||
324 | return Splat; | |||
325 | ||||
326 | // If 'Arg' can be handled with a single shifter_op return the value. | |||
327 | int Rot = getT2SOImmValRotateVal(Arg); | |||
328 | if (Rot != -1) | |||
329 | return Rot; | |||
330 | ||||
331 | return -1; | |||
332 | } | |||
333 | ||||
334 | inline unsigned getT2SOImmValRotate(unsigned V) { | |||
335 | if ((V & ~255U) == 0) return 0; | |||
336 | // Use CTZ to compute the rotate amount. | |||
337 | unsigned RotAmt = countTrailingZeros(V); | |||
338 | return (32 - RotAmt) & 31; | |||
339 | } | |||
340 | ||||
341 | inline bool isT2SOImmTwoPartVal(unsigned Imm) { | |||
342 | unsigned V = Imm; | |||
343 | // Passing values can be any combination of splat values and shifter | |||
344 | // values. If this can be handled with a single shifter or splat, bail | |||
345 | // out. Those should be handled directly, not with a two-part val. | |||
346 | if (getT2SOImmValSplatVal(V) != -1) | |||
347 | return false; | |||
348 | V = rotr32(~255U, getT2SOImmValRotate(V)) & V; | |||
349 | if (V == 0) | |||
350 | return false; | |||
351 | ||||
352 | // If this can be handled as an immediate, accept. | |||
353 | if (getT2SOImmVal(V) != -1) return true; | |||
354 | ||||
355 | // Likewise, try masking out a splat value first. | |||
356 | V = Imm; | |||
357 | if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1) | |||
358 | V &= ~0xff00ff00U; | |||
359 | else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1) | |||
360 | V &= ~0x00ff00ffU; | |||
361 | // If what's left can be handled as an immediate, accept. | |||
362 | if (getT2SOImmVal(V) != -1) return true; | |||
363 | ||||
364 | // Otherwise, do not accept. | |||
365 | return false; | |||
366 | } | |||
367 | ||||
368 | inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) { | |||
369 | assert(isT2SOImmTwoPartVal(Imm) && | |||
370 | "Immediate cannot be encoded as two part immediate!"); | |||
371 | // Try a shifter operand as one part | |||
372 | unsigned V = rotr32(~255U, getT2SOImmValRotate(Imm)) & Imm; | |||
373 | // If the rest is encodable as an immediate, then return it. | |||
374 | if (getT2SOImmVal(V) != -1) return V; | |||
375 | ||||
376 | // Try masking out a splat value first. | |||
377 | if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1) | |||
378 | return Imm & 0xff00ff00U; | |||
379 | ||||
380 | // The other splat is all that's left as an option. | |||
381 | assert(getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1); | |||
382 | return Imm & 0x00ff00ffU; | |||
383 | } | |||
384 | ||||
385 | inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) { | |||
386 | // Mask out the first hunk | |||
387 | Imm ^= getT2SOImmTwoPartFirst(Imm); | |||
388 | // Return what's left | |||
389 | assert(getT2SOImmVal(Imm) != -1 && | |||
390 | "Unable to encode second part of T2 two part SO immediate"); | |||
391 | return Imm; | |||
392 | } | |||
393 | ||||
394 | ||||
395 | //===--------------------------------------------------------------------===// | |||
396 | // Addressing Mode #2 | |||
397 | //===--------------------------------------------------------------------===// | |||
398 | // | |||
399 | // This is used for most simple load/store instructions. | |||
400 | // | |||
401 | // addrmode2 := reg +/- reg shop imm | |||
402 | // addrmode2 := reg +/- imm12 | |||
403 | // | |||
404 | // The first operand is always a Reg. The second operand is a reg if in | |||
405 | // reg/reg form, otherwise it's reg#0. The third field encodes the operation | |||
406 | // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The | |||
407 | // fourth operand 16-17 encodes the index mode. | |||
408 | // | |||
409 | // If this addressing mode is a frame index (before prolog/epilog insertion | |||
410 | // and code rewriting), this operand will have the form: FI#, reg0, <offs> | |||
411 | // with no shift amount for the frame offset. | |||
412 | // | |||
413 | inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO, | |||
414 | unsigned IdxMode = 0) { | |||
415 | assert(Imm12 < (1 << 12) && "Imm too large!"); | |||
416 | bool isSub = Opc == sub; | |||
417 | return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16); | |||
418 | } | |||
419 | inline unsigned getAM2Offset(unsigned AM2Opc) { | |||
420 | return AM2Opc & ((1 << 12)-1); | |||
421 | } | |||
422 | inline AddrOpc getAM2Op(unsigned AM2Opc) { | |||
423 | return ((AM2Opc >> 12) & 1) ? sub : add; | |||
424 | } | |||
425 | inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { | |||
426 | return (ShiftOpc)((AM2Opc >> 13) & 7); | |||
427 | } | |||
428 | inline unsigned getAM2IdxMode(unsigned AM2Opc) { return (AM2Opc >> 16); } | |||
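 | // Packing example (illustrative): getAM2Opc(sub, 4, lsl) == 0x5004, i.e. | |||
 | // imm12 = 4, bit 12 set for 'sub', and lsl (opc 2) in bits 13-15. The | |||
 | // accessors invert it: getAM2Offset(0x5004) == 4, getAM2Op == sub, | |||
 | // getAM2ShiftOpc == lsl, getAM2IdxMode == 0. | |||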
429 | ||||
430 | //===--------------------------------------------------------------------===// | |||
431 | // Addressing Mode #3 | |||
432 | //===--------------------------------------------------------------------===// | |||
433 | // | |||
434 | // This is used for sign-extending loads, and load/store-pair instructions. | |||
435 | // | |||
436 | // addrmode3 := reg +/- reg | |||
437 | // addrmode3 := reg +/- imm8 | |||
438 | // | |||
439 | // The first operand is always a Reg. The second operand is a reg if in | |||
440 | // reg/reg form, otherwise it's reg#0. The third field encodes the operation | |||
441 | // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the | |||
442 | // index mode. | |||
443 | ||||
444 | /// getAM3Opc - This function encodes the addrmode3 opc field. | |||
445 | inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset, | |||
446 | unsigned IdxMode = 0) { | |||
447 | bool isSub = Opc == sub; | |||
448 | return ((int)isSub << 8) | Offset | (IdxMode << 9); | |||
449 | } | |||
450 | inline unsigned char getAM3Offset(unsigned AM3Opc) { return AM3Opc & 0xFF; } | |||
451 | inline AddrOpc getAM3Op(unsigned AM3Opc) { | |||
452 | return ((AM3Opc >> 8) & 1) ? sub : add; | |||
453 | } | |||
454 | inline unsigned getAM3IdxMode(unsigned AM3Opc) { return (AM3Opc >> 9); } | |||
455 | ||||
456 | //===--------------------------------------------------------------------===// | |||
457 | // Addressing Mode #4 | |||
458 | //===--------------------------------------------------------------------===// | |||
459 | // | |||
460 | // This is used for load / store multiple instructions. | |||
461 | // | |||
462 | // addrmode4 := reg, <mode> | |||
463 | // | |||
464 | // The four modes are: | |||
465 | // IA - Increment after | |||
466 | // IB - Increment before | |||
467 | // DA - Decrement after | |||
468 | // DB - Decrement before | |||
469 | // For VFP instructions, only the IA and DB modes are valid. | |||
470 | ||||
471 | inline AMSubMode getAM4SubMode(unsigned Mode) { | |||
472 | return (AMSubMode)(Mode & 0x7); | |||
473 | } | |||
474 | ||||
475 | inline unsigned getAM4ModeImm(AMSubMode SubMode) { return (int)SubMode; } | |||
476 | ||||
477 | //===--------------------------------------------------------------------===// | |||
478 | // Addressing Mode #5 | |||
479 | //===--------------------------------------------------------------------===// | |||
480 | // | |||
481 | // This is used for coprocessor instructions, such as FP load/stores. | |||
482 | // | |||
483 | // addrmode5 := reg +/- imm8*4 | |||
484 | // | |||
485 | // The first operand is always a Reg. The second operand encodes the | |||
486 | // operation (add or subtract) in bit 8 and the immediate in bits 0-7. | |||
487 | ||||
488 | /// getAM5Opc - This function encodes the addrmode5 opc field. | |||
489 | inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) { | |||
490 | bool isSub = Opc == sub; | |||
491 | return ((int)isSub << 8) | Offset; | |||
492 | } | |||
493 | inline unsigned char getAM5Offset(unsigned AM5Opc) { return AM5Opc & 0xFF; } | |||
494 | inline AddrOpc getAM5Op(unsigned AM5Opc) { | |||
495 | return ((AM5Opc >> 8) & 1) ? sub : add; | |||
496 | } | |||
497 | ||||
498 | //===--------------------------------------------------------------------===// | |||
499 | // Addressing Mode #5 FP16 | |||
500 | //===--------------------------------------------------------------------===// | |||
501 | // | |||
502 | // This is used for coprocessor instructions, such as 16-bit FP load/stores. | |||
503 | // | |||
504 | // addrmode5fp16 := reg +/- imm8*2 | |||
505 | // | |||
506 | // The first operand is always a Reg. The second operand encodes the | |||
507 | // operation (add or subtract) in bit 8 and the immediate in bits 0-7. | |||
508 | ||||
509 | /// getAM5FP16Opc - This function encodes the addrmode5fp16 opc field. | |||
510 | inline unsigned getAM5FP16Opc(AddrOpc Opc, unsigned char Offset) { | |||
511 | bool isSub = Opc == sub; | |||
512 | return ((int)isSub << 8) | Offset; | |||
513 | } | |||
514 | inline unsigned char getAM5FP16Offset(unsigned AM5Opc) { | |||
515 | return AM5Opc & 0xFF; | |||
516 | } | |||
517 | inline AddrOpc getAM5FP16Op(unsigned AM5Opc) { | |||
518 | return ((AM5Opc >> 8) & 1) ? sub : add; | |||
519 | } | |||
520 | ||||
521 | //===--------------------------------------------------------------------===// | |||
522 | // Addressing Mode #6 | |||
523 | //===--------------------------------------------------------------------===// | |||
524 | // | |||
525 | // This is used for NEON load / store instructions. | |||
526 | // | |||
527 | // addrmode6 := reg with optional alignment | |||
528 | // | |||
529 | // This is stored in two operands [regaddr, align]. The first is the | |||
530 | // address register. The second operand is the value of the alignment | |||
531 | // specifier in bytes or zero if no explicit alignment. | |||
532 | // Valid alignments depend on the specific instruction. | |||
533 | ||||
534 | //===--------------------------------------------------------------------===// | |||
535 | // NEON/MVE Modified Immediates | |||
536 | //===--------------------------------------------------------------------===// | |||
537 | // | |||
538 | // Several NEON and MVE instructions (e.g., VMOV) take a "modified immediate" | |||
539 | // vector operand, where a small immediate encoded in the instruction | |||
540 | // specifies a full NEON vector value. These modified immediates are | |||
541 | // represented here as encoded integers. The low 8 bits hold the immediate | |||
542 | // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold | |||
543 | // the "Cmode" field of the instruction. The interfaces below treat the | |||
544 | // Op and Cmode values as a single 5-bit value. | |||
545 | ||||
546 | inline unsigned createVMOVModImm(unsigned OpCmode, unsigned Val) { | |||
547 | return (OpCmode << 8) | Val; | |||
548 | } | |||
549 | inline unsigned getVMOVModImmOpCmode(unsigned ModImm) { | |||
550 | return (ModImm >> 8) & 0x1f; | |||
551 | } | |||
552 | inline unsigned getVMOVModImmVal(unsigned ModImm) { return ModImm & 0xff; } | |||
553 | ||||
554 | /// decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the | |||
555 | /// element value and the element size in bits. (If the element size is | |||
556 | /// smaller than the vector, it is splatted into all the elements.) | |||
557 | inline uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits) { | |||
558 | unsigned OpCmode = getVMOVModImmOpCmode(ModImm); | |||
559 | unsigned Imm8 = getVMOVModImmVal(ModImm); | |||
560 | uint64_t Val = 0; | |||
561 | ||||
562 | if (OpCmode == 0xe) { | |||
563 | // 8-bit vector elements | |||
564 | Val = Imm8; | |||
565 | EltBits = 8; | |||
566 | } else if ((OpCmode & 0xc) == 0x8) { | |||
567 | // 16-bit vector elements | |||
568 | unsigned ByteNum = (OpCmode & 0x6) >> 1; | |||
569 | Val = Imm8 << (8 * ByteNum); | |||
570 | EltBits = 16; | |||
571 | } else if ((OpCmode & 0x8) == 0) { | |||
572 | // 32-bit vector elements, zero with one byte set | |||
573 | unsigned ByteNum = (OpCmode & 0x6) >> 1; | |||
574 | Val = Imm8 << (8 * ByteNum); | |||
575 | EltBits = 32; | |||
576 | } else if ((OpCmode & 0xe) == 0xc) { | |||
577 | // 32-bit vector elements, one byte with low bits set | |||
578 | unsigned ByteNum = 1 + (OpCmode & 0x1); | |||
579 | Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum))); | |||
580 | EltBits = 32; | |||
581 | } else if (OpCmode == 0x1e) { | |||
582 | // 64-bit vector elements | |||
583 | for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { | |||
584 | if ((ModImm >> ByteNum) & 1) | |||
585 | Val |= (uint64_t)0xff << (8 * ByteNum); | |||
586 | } | |||
587 | EltBits = 64; | |||
588 | } else { | |||
589 | llvm_unreachable("Unsupported VMOV immediate")::llvm::llvm_unreachable_internal("Unsupported VMOV immediate" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h" , 589); | |||
590 | } | |||
591 | return Val; | |||
592 | } | |||
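 | // Decoding example (illustrative): createVMOVModImm(0x8, 0xAB) == 0x8AB; | |||
 | // decodeVMOVModImm(0x8AB, EltBits) hits the (OpCmode & 0xc) == 0x8 case with | |||
 | // ByteNum == 0, returning 0x00AB with EltBits == 16, i.e. every i16 lane of | |||
 | // the vector is 0x00AB. | |||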
593 | ||||
594 | // Generic validation for single-byte immediates (0X00, 00X0, etc.). | |||
595 | inline bool isNEONBytesplat(unsigned Value, unsigned Size) { | |||
596 | assert(Size >= 1 && Size <= 4 && "Invalid size"); | |||
597 | unsigned count = 0; | |||
598 | for (unsigned i = 0; i < Size; ++i) { | |||
599 | if (Value & 0xff) count++; | |||
600 | Value >>= 8; | |||
601 | } | |||
602 | return count == 1; | |||
603 | } | |||
604 | ||||
605 | /// Checks if Value is a correct immediate for instructions like VBIC/VORR. | |||
606 | inline bool isNEONi16splat(unsigned Value) { | |||
607 | if (Value > 0xffff) | |||
608 | return false; | |||
609 | // i16 value with set bits only in one byte X0 or 0X. | |||
610 | return Value == 0 || isNEONBytesplat(Value, 2); | |||
611 | } | |||
612 | ||||
613 | // Encode a NEON 16-bit splat immediate for instructions like VBIC/VORR. | |||
614 | inline unsigned encodeNEONi16splat(unsigned Value) { | |||
615 | assert(isNEONi16splat(Value) && "Invalid NEON splat value"); | |||
616 | if (Value >= 0x100) | |||
617 | Value = (Value >> 8) | 0xa00; | |||
618 | else | |||
619 | Value |= 0x800; | |||
620 | return Value; | |||
621 | } | |||
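 | // Example (illustrative): 0xAB00 is a valid i16 splat (one nonzero byte); | |||
 | // encodeNEONi16splat(0xAB00) == 0xAAB (value 0xAB, cmode for the high byte) | |||
 | // and encodeNEONi16splat(0x00AB) == 0x8AB (same value, low-byte cmode). | |||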
622 | ||||
623 | /// Checks if Value is a correct immediate for instructions like VBIC/VORR. | |||
624 | inline bool isNEONi32splat(unsigned Value) { | |||
625 | // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X. | |||
626 | return Value == 0 || isNEONBytesplat(Value, 4); | |||
627 | } | |||
628 | ||||
629 | /// Encode a NEON 32-bit splat immediate for instructions like VBIC/VORR. | |||
630 | inline unsigned encodeNEONi32splat(unsigned Value) { | |||
631 | assert(isNEONi32splat(Value) && "Invalid NEON splat value"); | |||
632 | if (Value >= 0x100 && Value <= 0xff00) | |||
633 | Value = (Value >> 8) | 0x200; | |||
634 | else if (Value > 0xffff && Value <= 0xff0000) | |||
635 | Value = (Value >> 16) | 0x400; | |||
636 | else if (Value > 0xffffff) | |||
637 | Value = (Value >> 24) | 0x600; | |||
638 | return Value; | |||
639 | } | |||
640 | ||||
641 | //===--------------------------------------------------------------------===// | |||
642 | // Floating-point Immediates | |||
643 | // | |||
644 | inline float getFPImmFloat(unsigned Imm) { | |||
645 | // We expect an 8-bit binary encoding of a floating-point number here. | |||
646 | ||||
647 | uint8_t Sign = (Imm >> 7) & 0x1; | |||
648 | uint8_t Exp = (Imm >> 4) & 0x7; | |||
649 | uint8_t Mantissa = Imm & 0xf; | |||
650 | ||||
651 | // 8-bit FP IEEE Float Encoding | |||
652 | // abcd efgh aBbbbbbc defgh000 00000000 00000000 | |||
653 | // | |||
654 | // where B = NOT(b); | |||
655 | uint32_t I = 0; | |||
656 | I |= Sign << 31; | |||
657 | I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; | |||
658 | I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; | |||
659 | I |= (Exp & 0x3) << 23; | |||
660 | I |= Mantissa << 19; | |||
661 | return bit_cast<float>(I); | |||
662 | } | |||
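 | // Example (illustrative): getFPImmFloat(0x70) == 1.0f. Sign = 0, Exp = 7, | |||
 | // Mantissa = 0, so I == (0x1f << 25) | (3 << 23) == 0x3F800000, the IEEE-754 | |||
 | // single-precision bit pattern of 1.0. | |||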
663 | ||||
664 | /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit | |||
665 | /// floating-point value. If the value cannot be represented as an 8-bit | |||
666 | /// floating-point value, then return -1. | |||
667 | inline int getFP16Imm(const APInt &Imm) { | |||
668 | uint32_t Sign = Imm.lshr(15).getZExtValue() & 1; | |||
669 | int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15 | |||
670 | int64_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits | |||
671 | ||||
672 | // We can handle 4 bits of mantissa. | |||
673 | // mantissa = (16+UInt(e:f:g:h))/16. | |||
674 | if (Mantissa & 0x3f) | |||
675 | return -1; | |||
676 | Mantissa >>= 6; | |||
677 | ||||
678 | // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 | |||
679 | if (Exp < -3 || Exp > 4) | |||
680 | return -1; | |||
681 | Exp = ((Exp+3) & 0x7) ^ 4; | |||
682 | ||||
683 | return ((int)Sign << 7) | (Exp << 4) | Mantissa; | |||
684 | } | |||
685 | ||||
686 | inline int getFP16Imm(const APFloat &FPImm) { | |||
687 | return getFP16Imm(FPImm.bitcastToAPInt()); | |||
688 | } | |||
689 | ||||
690 | /// If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding | |||
691 | /// for it. Otherwise return -1 like getFP16Imm. | |||
692 | inline int getFP32FP16Imm(const APInt &Imm) { | |||
693 | if (Imm.getActiveBits() > 16) | |||
694 | return -1; | |||
695 | return ARM_AM::getFP16Imm(Imm.trunc(16)); | |||
696 | } | |||
697 | ||||
698 | inline int getFP32FP16Imm(const APFloat &FPImm) { | |||
699 | return getFP32FP16Imm(FPImm.bitcastToAPInt()); | |||
700 | } | |||
701 | ||||
702 | /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit | |||
703 | /// floating-point value. If the value cannot be represented as an 8-bit | |||
704 | /// floating-point value, then return -1. | |||
705 | inline int getFP32Imm(const APInt &Imm) { | |||
706 | uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; | |||
707 | int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 | |||
708 | int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits | |||
709 | ||||
710 | // We can handle 4 bits of mantissa. | |||
711 | // mantissa = (16+UInt(e:f:g:h))/16. | |||
712 | if (Mantissa & 0x7ffff) | |||
713 | return -1; | |||
714 | Mantissa >>= 19; | |||
715 | if ((Mantissa & 0xf) != Mantissa) | |||
716 | return -1; | |||
717 | ||||
718 | // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 | |||
719 | if (Exp < -3 || Exp > 4) | |||
720 | return -1; | |||
721 | Exp = ((Exp+3) & 0x7) ^ 4; | |||
722 | ||||
723 | return ((int)Sign << 7) | (Exp << 4) | Mantissa; | |||
724 | } | |||
725 | ||||
726 | inline int getFP32Imm(const APFloat &FPImm) { | |||
727 | return getFP32Imm(FPImm.bitcastToAPInt()); | |||
728 | } | |||
729 | ||||
730 | /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit | |||
731 | /// floating-point value. If the value cannot be represented as an 8-bit | |||
732 | /// floating-point value, then return -1. | |||
733 | inline int getFP64Imm(const APInt &Imm) { | |||
734 | uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; | |||
735 | int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 | |||
736 | uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL; | |||
737 | ||||
738 | // We can handle 4 bits of mantissa. | |||
739 | // mantissa = (16+UInt(e:f:g:h))/16. | |||
740 | if (Mantissa & 0xffffffffffffULL) | |||
741 | return -1; | |||
742 | Mantissa >>= 48; | |||
743 | if ((Mantissa & 0xf) != Mantissa) | |||
744 | return -1; | |||
745 | ||||
746 | // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 | |||
747 | if (Exp < -3 || Exp > 4) | |||
748 | return -1; | |||
749 | Exp = ((Exp+3) & 0x7) ^ 4; | |||
750 | ||||
751 | return ((int)Sign << 7) | (Exp << 4) | Mantissa; | |||
752 | } | |||
753 | ||||
754 | inline int getFP64Imm(const APFloat &FPImm) { | |||
755 | return getFP64Imm(FPImm.bitcastToAPInt()); | |||
756 | } | |||
757 | ||||
758 | } // end namespace ARM_AM | |||
759 | } // end namespace llvm | |||
760 | ||||
761 | #endif | |||
762 |
1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | // | ||||||
9 | // This file contains some functions that are useful for math stuff. | ||||||
10 | // | ||||||
11 | //===----------------------------------------------------------------------===// | ||||||
12 | |||||||
13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H | ||||||
14 | #define LLVM_SUPPORT_MATHEXTRAS_H | ||||||
15 | |||||||
16 | #include "llvm/Support/Compiler.h" | ||||||
17 | #include <cassert> | ||||||
18 | #include <climits> | ||||||
19 | #include <cmath> | ||||||
20 | #include <cstdint> | ||||||
21 | #include <cstring> | ||||||
22 | #include <limits> | ||||||
23 | #include <type_traits> | ||||||
24 | |||||||
25 | #ifdef __ANDROID_NDK__ | ||||||
26 | #include <android/api-level.h> | ||||||
27 | #endif | ||||||
28 | |||||||
29 | #ifdef _MSC_VER | ||||||
30 | // Declare these intrinsics manually rather than including intrin.h. It's very | ||||||
31 | // expensive, and MathExtras.h is popular. | ||||||
32 | // #include <intrin.h> | ||||||
33 | extern "C" { | ||||||
34 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); | ||||||
35 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
36 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); | ||||||
37 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); | ||||||
38 | } | ||||||
39 | #endif | ||||||
40 | |||||||
41 | namespace llvm { | ||||||
42 | |||||||
43 | /// The behavior an operation has on an input of 0. | ||||||
44 | enum ZeroBehavior { | ||||||
45 | /// The returned value is undefined. | ||||||
46 | ZB_Undefined, | ||||||
47 | /// The returned value is numeric_limits<T>::max() | ||||||
48 | ZB_Max, | ||||||
49 | /// The returned value is numeric_limits<T>::digits | ||||||
50 | ZB_Width | ||||||
51 | }; | ||||||
52 | |||||||
53 | /// Mathematical constants. | ||||||
54 | namespace numbers { | ||||||
55 | // TODO: Track C++20 std::numbers. | ||||||
56 | // TODO: Favor using the hexadecimal FP constants (requires C++17). | ||||||
57 | constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 | ||||||
58 | egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 | ||||||
59 | ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 | ||||||
60 | ln10 = 2.3025850929940456840, // (0x1.26bb1bbb55516P+1) https://oeis.org/A002392 | ||||||
61 | log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) | ||||||
62 | log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) | ||||||
63 | pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 | ||||||
64 | inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 | ||||||
65 | sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 | ||||||
66 | inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 | ||||||
67 | sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193 | ||||||
68 | inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) | ||||||
69 | sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 | ||||||
70 | inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) | ||||||
71 | phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 | ||||||
72 | constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 | ||||||
73 | egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 | ||||||
74 | ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 | ||||||
75 | ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 | ||||||
76 | log2ef = 1.44269504F, // (0x1.715476P+0) | ||||||
77 | log10ef = .434294482F, // (0x1.bcb7b2P-2) | ||||||
78 | pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 | ||||||
79 | inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 | ||||||
80 | sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 | ||||||
81 | inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 | ||||||
82 | sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 | ||||||
83 | inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) | ||||||
84 | sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 | ||||||
85 | inv_sqrt3f = .577350269F, // (0x1.279a74P-1) | ||||||
86 | phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 | ||||||
87 | } // namespace numbers | ||||||
88 | |||||||
89 | namespace detail { | ||||||
90 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { | ||||||
91 | static unsigned count(T Val, ZeroBehavior) { | ||||||
92 | if (!Val) | ||||||
93 | return std::numeric_limits<T>::digits; | ||||||
94 | if (Val & 0x1) | ||||||
95 | return 0; | ||||||
96 | |||||||
97 | // Bisection method. | ||||||
98 | unsigned ZeroBits = 0; | ||||||
99 | T Shift = std::numeric_limits<T>::digits >> 1; | ||||||
100 | T Mask = std::numeric_limits<T>::max() >> Shift; | ||||||
101 | while (Shift) { | ||||||
102 | if ((Val & Mask) == 0) { | ||||||
103 | Val >>= Shift; | ||||||
104 | ZeroBits |= Shift; | ||||||
105 | } | ||||||
106 | Shift >>= 1; | ||||||
107 | Mask >>= Shift; | ||||||
108 | } | ||||||
109 | return ZeroBits; | ||||||
110 | } | ||||||
111 | }; | ||||||
112 | |||||||
113 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||||
114 | template <typename T> struct TrailingZerosCounter<T, 4> { | ||||||
115 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
116 | if (ZB != ZB_Undefined && Val == 0) | ||||||
117 | return 32; | ||||||
118 | |||||||
119 | #if __has_builtin(__builtin_ctz) || defined(__GNUC__) | ||||||
120 | return __builtin_ctz(Val); | ||||||
121 | #elif defined(_MSC_VER) | ||||||
122 | unsigned long Index; | ||||||
123 | _BitScanForward(&Index, Val); | ||||||
124 | return Index; | ||||||
125 | #endif | ||||||
126 | } | ||||||
127 | }; | ||||||
128 | |||||||
129 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
130 | template <typename T> struct TrailingZerosCounter<T, 8> { | ||||||
131 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
132 | if (ZB != ZB_Undefined && Val == 0) | ||||||
133 | return 64; | ||||||
134 | |||||||
135 | #if __has_builtin(__builtin_ctzll) || defined(__GNUC__) | ||||||
136 | return __builtin_ctzll(Val); | ||||||
137 | #elif defined(_MSC_VER) | ||||||
138 | unsigned long Index; | ||||||
139 | _BitScanForward64(&Index, Val); | ||||||
140 | return Index; | ||||||
141 | #endif | ||||||
142 | } | ||||||
143 | }; | ||||||
144 | #endif | ||||||
145 | #endif | ||||||
146 | } // namespace detail | ||||||
147 | |||||||
148 | /// Count the number of 0s from the least significant bit to the most, | ||||||
149 | /// stopping at the first 1. | ||||||
150 | /// | ||||||
151 | /// Only unsigned integral types are allowed. | ||||||
152 | /// | ||||||
153 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||||
154 | /// valid arguments. | ||||||
155 | template <typename T> | ||||||
156 | unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||||
157 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
158 | !std::numeric_limits<T>::is_signed, | ||||||
159 | "Only unsigned integral types are allowed."); | ||||||
160 | return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||||
161 | } | ||||||
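 | // Usage sketch (illustrative): countTrailingZeros(0x10u) == 4, and with the | ||||||
 | // default ZB_Width behavior countTrailingZeros(0u) == 32 for a uint32_t. | ||||||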
162 | |||||||
163 | namespace detail { | ||||||
164 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { | ||||||
165 | static unsigned count(T Val, ZeroBehavior) { | ||||||
166 | if (!Val) | ||||||
167 | return std::numeric_limits<T>::digits; | ||||||
168 | |||||||
169 | // Bisection method. | ||||||
170 | unsigned ZeroBits = 0; | ||||||
171 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { | ||||||
172 | T Tmp = Val >> Shift; | ||||||
173 | if (Tmp) | ||||||
174 | Val = Tmp; | ||||||
175 | else | ||||||
176 | ZeroBits |= Shift; | ||||||
177 | } | ||||||
178 | return ZeroBits; | ||||||
179 | } | ||||||
180 | }; | ||||||
181 | |||||||
182 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||||
183 | template <typename T> struct LeadingZerosCounter<T, 4> { | ||||||
184 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
185 | if (ZB != ZB_Undefined && Val == 0) | ||||||
186 | return 32; | ||||||
187 | |||||||
188 | #if __has_builtin(__builtin_clz) || defined(__GNUC__) | ||||||
189 | return __builtin_clz(Val); | ||||||
190 | #elif defined(_MSC_VER) | ||||||
191 | unsigned long Index; | ||||||
192 | _BitScanReverse(&Index, Val); | ||||||
193 | return Index ^ 31; | ||||||
194 | #endif | ||||||
195 | } | ||||||
196 | }; | ||||||
197 | |||||||
198 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||||
199 | template <typename T> struct LeadingZerosCounter<T, 8> { | ||||||
200 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||||
201 | if (ZB != ZB_Undefined && Val == 0) | ||||||
202 | return 64; | ||||||
203 | |||||||
204 | #if __has_builtin(__builtin_clzll) || defined(__GNUC__) | ||||||
205 | return __builtin_clzll(Val); | ||||||
206 | #elif defined(_MSC_VER) | ||||||
207 | unsigned long Index; | ||||||
208 | _BitScanReverse64(&Index, Val); | ||||||
209 | return Index ^ 63; | ||||||
210 | #endif | ||||||
211 | } | ||||||
212 | }; | ||||||
213 | #endif | ||||||
214 | #endif | ||||||
215 | } // namespace detail | ||||||
216 | |||||||
217 | /// Count the number of 0s from the most significant bit to the least, | ||||||
218 | /// stopping at the first 1. | ||||||
219 | /// | ||||||
220 | /// Only unsigned integral types are allowed. | ||||||
221 | /// | ||||||
222 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||||
223 | /// valid arguments. | ||||||
224 | template <typename T> | ||||||
225 | unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||||
226 | static_assert(std::numeric_limits<T>::is_integer && | ||||||
227 | !std::numeric_limits<T>::is_signed, | ||||||
228 | "Only unsigned integral types are allowed."); | ||||||
229 | return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||||
230 | } | ||||||
231 | |||||||
232 | /// Get the index of the first set bit starting from the least | ||||||
233 | /// significant bit. | ||||||
234 | /// | ||||||
235 | /// Only unsigned integral types are allowed. | ||||||
236 | /// | ||||||
237 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
238 | /// valid arguments. | ||||||
239 | template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
240 | if (ZB == ZB_Max && Val == 0) | ||||||
241 | return std::numeric_limits<T>::max(); | ||||||
242 | |||||||
243 | return countTrailingZeros(Val, ZB_Undefined); | ||||||
244 | } | ||||||
245 | |||||||
246 | /// Create a bitmask with the N right-most bits set to 1, and all other | ||||||
247 | /// bits set to 0. Only unsigned types are allowed. | ||||||
248 | template <typename T> T maskTrailingOnes(unsigned N) { | ||||||
249 | static_assert(std::is_unsigned<T>::value, "Invalid type!"); | ||||||
250 | const unsigned Bits = CHAR_BIT * sizeof(T); | ||||||
251 | assert(N <= Bits && "Invalid bit index"); | ||||||
252 | return N == 0 ? 0 : (T(-1) >> (Bits - N)); | ||||||
253 | } | ||||||
254 | |||||||
255 | /// Create a bitmask with the N left-most bits set to 1, and all other | ||||||
256 | /// bits set to 0. Only unsigned types are allowed. | ||||||
257 | template <typename T> T maskLeadingOnes(unsigned N) { | ||||||
258 | return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
259 | } | ||||||
260 | |||||||
261 | /// Create a bitmask with the N right-most bits set to 0, and all other | ||||||
262 | /// bits set to 1. Only unsigned types are allowed. | ||||||
263 | template <typename T> T maskTrailingZeros(unsigned N) { | ||||||
264 | return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
265 | } | ||||||
266 | |||||||
267 | /// Create a bitmask with the N left-most bits set to 0, and all other | ||||||
268 | /// bits set to 1. Only unsigned types are allowed. | ||||||
269 | template <typename T> T maskLeadingZeros(unsigned N) { | ||||||
270 | return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||||
271 | } | ||||||
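 | // Examples (illustrative), for T = uint32_t and N = 8: | ||||||
 | //   maskTrailingOnes<uint32_t>(8)  == 0x000000FF | ||||||
 | //   maskLeadingOnes<uint32_t>(8)   == 0xFF000000 | ||||||
 | //   maskTrailingZeros<uint32_t>(8) == 0xFFFFFF00 | ||||||
 | //   maskLeadingZeros<uint32_t>(8)  == 0x00FFFFFF | ||||||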
272 | |||||||
273 | /// Get the index of the last set bit starting from the least | ||||||
274 | /// significant bit. | ||||||
275 | /// | ||||||
276 | /// Only unsigned integral types are allowed. | ||||||
277 | /// | ||||||
278 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||||
279 | /// valid arguments. | ||||||
280 | template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||||
281 | if (ZB == ZB_Max && Val == 0) | ||||||
282 | return std::numeric_limits<T>::max(); | ||||||
283 | |||||||
284 | // Use ^ instead of - because both gcc and llvm can remove the associated ^ | ||||||
285 | // in the __builtin_clz intrinsic on x86. | ||||||
286 | return countLeadingZeros(Val, ZB_Undefined) ^ | ||||||
287 | (std::numeric_limits<T>::digits - 1); | ||||||
288 | } | ||||||
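 | // Examples (illustrative): findFirstSet(0x18u) == 3 and findLastSet(0x18u) | ||||||
 | // == 4; on 0 both default to returning std::numeric_limits<T>::max(). | ||||||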
289 | |||||||
290 | /// Macro compressed bit reversal table for 256 bits. | ||||||
291 | /// | ||||||
292 | /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable | ||||||
293 | static const unsigned char BitReverseTable256[256] = { | ||||||
294 | #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 | ||||||
295 | #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) | ||||||
296 | #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) | ||||||
297 | R6(0), R6(2), R6(1), R6(3) | ||||||
298 | #undef R2 | ||||||
299 | #undef R4 | ||||||
300 | #undef R6 | ||||||
301 | }; | ||||||
302 | |||||||
303 | /// Reverse the bits in \p Val. | ||||||
304 | template <typename T> | ||||||
305 | T reverseBits(T Val) { | ||||||
306 | unsigned char in[sizeof(Val)]; | ||||||
307 | unsigned char out[sizeof(Val)]; | ||||||
308 | std::memcpy(in, &Val, sizeof(Val)); | ||||||
309 | for (unsigned i = 0; i < sizeof(Val); ++i) | ||||||
310 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; | ||||||
311 | std::memcpy(&Val, out, sizeof(Val)); | ||||||
312 | return Val; | ||||||
313 | } | ||||||
314 | |||||||
315 | #if __has_builtin(__builtin_bitreverse8) | ||||||
316 | template<> | ||||||
317 | inline uint8_t reverseBits<uint8_t>(uint8_t Val) { | ||||||
318 | return __builtin_bitreverse8(Val); | ||||||
319 | } | ||||||
320 | #endif | ||||||
321 | |||||||
322 | #if __has_builtin(__builtin_bitreverse16) | ||||||
323 | template<> | ||||||
324 | inline uint16_t reverseBits<uint16_t>(uint16_t Val) { | ||||||
325 | return __builtin_bitreverse16(Val); | ||||||
326 | } | ||||||
327 | #endif | ||||||
328 | |||||||
329 | #if __has_builtin(__builtin_bitreverse32) | ||||||
330 | template<> | ||||||
331 | inline uint32_t reverseBits<uint32_t>(uint32_t Val) { | ||||||
332 | return __builtin_bitreverse32(Val); | ||||||
333 | } | ||||||
334 | #endif | ||||||
335 | |||||||
336 | #if __has_builtin(__builtin_bitreverse64) | ||||||
337 | template<> | ||||||
338 | inline uint64_t reverseBits<uint64_t>(uint64_t Val) { | ||||||
339 | return __builtin_bitreverse64(Val); | ||||||
340 | } | ||||||
341 | #endif | ||||||
342 | |||||||
343 | // NOTE: The following support functions use the _32/_64 extensions instead of | ||||||
344 | // type overloading so that signed and unsigned integers can be used without | ||||||
345 | // ambiguity. | ||||||
346 | |||||||
347 | /// Return the high 32 bits of a 64 bit value. | ||||||
348 | constexpr inline uint32_t Hi_32(uint64_t Value) { | ||||||
349 | return static_cast<uint32_t>(Value >> 32); | ||||||
350 | } | ||||||
351 | |||||||
352 | /// Return the low 32 bits of a 64 bit value. | ||||||
353 | constexpr inline uint32_t Lo_32(uint64_t Value) { | ||||||
354 | return static_cast<uint32_t>(Value); | ||||||
355 | } | ||||||
356 | |||||||
357 | /// Make a 64-bit integer from a high / low pair of 32-bit integers. | ||||||
358 | constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { | ||||||
359 | return ((uint64_t)High << 32) | (uint64_t)Low; | ||||||
360 | } | ||||||
361 | |||||||
362 | /// Checks if an integer fits into the given bit width. | ||||||
363 | template <unsigned N> constexpr inline bool isInt(int64_t x) { | ||||||
364 | return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1))); | ||||||
365 | } | ||||||
366 | // Template specializations to get better code for common cases. | ||||||
367 | template <> constexpr inline bool isInt<8>(int64_t x) { | ||||||
368 | return static_cast<int8_t>(x) == x; | ||||||
369 | } | ||||||
370 | template <> constexpr inline bool isInt<16>(int64_t x) { | ||||||
371 | return static_cast<int16_t>(x) == x; | ||||||
372 | } | ||||||
373 | template <> constexpr inline bool isInt<32>(int64_t x) { | ||||||
374 | return static_cast<int32_t>(x) == x; | ||||||
375 | } | ||||||
376 | |||||||
377 | /// Checks if a signed integer is an N-bit number shifted left by S. | ||||||
378 | template <unsigned N, unsigned S> | ||||||
379 | constexpr inline bool isShiftedInt(int64_t x) { | ||||||
380 | static_assert( | ||||||
381 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number)."); | ||||||
382 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); | ||||||
383 | return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||||
384 | } | ||||||
385 | |||||||
386 | /// Checks if an unsigned integer fits into the given bit width. | ||||||
387 | /// | ||||||
388 | /// This is written as two functions rather than as simply | ||||||
389 | /// | ||||||
390 | /// return N >= 64 || X < (UINT64_C(1) << N); | ||||||
391 | /// | ||||||
392 | /// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting | ||||||
393 | /// left too many places. | ||||||
394 | template <unsigned N> | ||||||
395 | constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) { | ||||||
396 | static_assert(N > 0, "isUInt<0> doesn't make sense"); | ||||||
397 | return X < (UINT64_C(1) << (N)); | ||||||
398 | } | ||||||
399 | template <unsigned N> | ||||||
400 | constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) { | ||||||
401 | return true; | ||||||
402 | } | ||||||
403 | |||||||
404 | // Template specializations to get better code for common cases. | ||||||
405 | template <> constexpr inline bool isUInt<8>(uint64_t x) { | ||||||
406 | return static_cast<uint8_t>(x) == x; | ||||||
407 | } | ||||||
408 | template <> constexpr inline bool isUInt<16>(uint64_t x) { | ||||||
409 | return static_cast<uint16_t>(x) == x; | ||||||
410 | } | ||||||
411 | template <> constexpr inline bool isUInt<32>(uint64_t x) { | ||||||
412 | return static_cast<uint32_t>(x) == x; | ||||||
413 | } | ||||||
414 | |||||||
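// Illustration (a sketch added for exposition; the N >= 64 overload is what
// makes isUInt<64> well-formed):
static_assert(isUInt<8>(255) && !isUInt<8>(256), "isUInt<8> boundary");
static_assert(isUInt<64>(UINT64_MAX), "isUInt<64> is trivially true");
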
/// Checks if an unsigned integer is an N bit number shifted left by S.
template <unsigned N, unsigned S>
constexpr inline bool isShiftedUInt(uint64_t x) {
  static_assert(
      N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
  static_assert(N + S <= 64,
                "isShiftedUInt<N, S> with N + S > 64 is too wide.");
  // Per the two static_asserts above, S must be strictly less than 64. So
  // 1 << S is not undefined behavior.
  return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
}

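// Illustration (a sketch added for exposition): isShiftedUInt<8, 2> accepts
// multiples of 4 in [0, 1020].
static_assert(isShiftedUInt<8, 2>(1020) && !isShiftedUInt<8, 2>(1021) &&
                  !isShiftedUInt<8, 2>(1024),
              "isShiftedUInt<8, 2> example");
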
/// Gets the maximum value for a N-bit unsigned integer.
inline uint64_t maxUIntN(uint64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // uint64_t(1) << 64 is undefined behavior, so we can't do
  //   (uint64_t(1) << N) - 1
  // without checking first that N != 64. But this works and doesn't have a
  // branch.
  return UINT64_MAX >> (64 - N);
}

/// Gets the minimum value for a N-bit signed integer.
inline int64_t minIntN(int64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
}

/// Gets the maximum value for a N-bit signed integer.
inline int64_t maxIntN(int64_t N) {
  assert(N > 0 && N <= 64 && "integer width out of range");

  // This relies on two's complement wraparound when N == 64, so we convert to
  // int64_t only at the very end to avoid UB.
  return (UINT64_C(1) << (N - 1)) - 1;
}

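// These three are runtime helpers, so as an illustrative sketch rather than a
// compile-time check (the values follow directly from the definitions above):
//   maxUIntN(8) == 255, minIntN(8) == -128, maxIntN(8) == 127
//   maxUIntN(64) == UINT64_MAX, minIntN(64) == INT64_MIN,
//   maxIntN(64) == INT64_MAX
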
/// Checks if an unsigned integer fits into the given (dynamic) bit width.
inline bool isUIntN(unsigned N, uint64_t x) {
  return N >= 64 || x <= maxUIntN(N);
}

/// Checks if a signed integer fits into the given (dynamic) bit width.
inline bool isIntN(unsigned N, int64_t x) {
  return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
}

/// Return true if the argument is a non-empty sequence of ones starting at the
/// least significant bit with the remainder zero (32 bit version).
/// Ex. isMask_32(0x0000FFFFU) == true.
constexpr inline bool isMask_32(uint32_t Value) {
  return Value && ((Value + 1) & Value) == 0;
}

/// Return true if the argument is a non-empty sequence of ones starting at the
/// least significant bit with the remainder zero (64 bit version).
constexpr inline bool isMask_64(uint64_t Value) {
  return Value && ((Value + 1) & Value) == 0;
}

/// Return true if the argument contains a non-empty sequence of ones with the
/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
constexpr inline bool isShiftedMask_32(uint32_t Value) {
  return Value && isMask_32((Value - 1) | Value);
}

/// Return true if the argument contains a non-empty sequence of ones with the
/// remainder zero (64 bit version.)
constexpr inline bool isShiftedMask_64(uint64_t Value) {
  return Value && isMask_64((Value - 1) | Value);
}

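// Compile-time illustration (a sketch added for exposition): a mask is a run
// of ones starting at bit 0; a shifted mask may start at any bit.
static_assert(isMask_32(0x0000FFFFU) && !isMask_32(0x0000FF01U),
              "isMask_32 example");
static_assert(isShiftedMask_32(0x0000FF00U) && !isShiftedMask_32(0x0000FF0FU),
              "isShiftedMask_32 example");
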
/// Return true if the argument is a power of two > 0.
/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
constexpr inline bool isPowerOf2_32(uint32_t Value) {
  return Value && !(Value & (Value - 1));
}

/// Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr inline bool isPowerOf2_64(uint64_t Value) {
  return Value && !(Value & (Value - 1));
}

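// Compile-time illustration (a sketch added for exposition):
static_assert(isPowerOf2_32(0x00100000U) && !isPowerOf2_32(0x00100001U),
              "isPowerOf2_32 example");
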
/// Count the number of ones from the most significant bit to the first
/// zero bit.
///
/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of all ones. Only ZB_Width and
/// ZB_Undefined are valid arguments.
template <typename T>
unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  return countLeadingZeros<T>(~Value, ZB);
}

/// Count the number of ones from the least significant bit to the first
/// zero bit.
///
/// Ex. countTrailingOnes(0x00FF00FF) == 8.
/// Only unsigned integral types are allowed.
///
/// \param ZB the behavior on an input of all ones. Only ZB_Width and
/// ZB_Undefined are valid arguments.
template <typename T>
unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  return countTrailingZeros<T>(~Value, ZB);
}

namespace detail {
template <typename T, std::size_t SizeOfT> struct PopulationCounter {
  static unsigned count(T Value) {
    // Generic version, forward to 32 bits.
    static_assert(SizeOfT <= 4, "Not implemented!");
#if defined(__GNUC__)
    return __builtin_popcount(Value);
#else
    // SWAR popcount: sum adjacent bits into 2-bit fields, then 4-bit fields,
    // then accumulate the per-byte counts into the top byte via the multiply.
    uint32_t v = Value;
    v = v - ((v >> 1) & 0x55555555);
    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
    return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
#endif
  }
};

template <typename T> struct PopulationCounter<T, 8> {
  static unsigned count(T Value) {
#if defined(__GNUC__)
    return __builtin_popcountll(Value);
#else
    // Same SWAR scheme as above, widened to 64 bits.
    uint64_t v = Value;
    v = v - ((v >> 1) & 0x5555555555555555ULL);
    v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
    v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
    return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
#endif
  }
};
} // namespace detail

/// Count the number of set bits in a value.
/// Ex. countPopulation(0xF000F000) = 8
/// Returns 0 if the word is zero.
template <typename T>
inline unsigned countPopulation(T Value) {
  static_assert(std::numeric_limits<T>::is_integer &&
                    !std::numeric_limits<T>::is_signed,
                "Only unsigned integral types are allowed.");
  return detail::PopulationCounter<T, sizeof(T)>::count(Value);
}

/// Compile time Log2.
/// Valid only for positive powers of two.
template <size_t kValue> constexpr inline size_t CTLog2() {
  static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
                "Value is not a valid power of 2");
  return 1 + CTLog2<kValue / 2>();
}

template <> constexpr inline size_t CTLog2<1>() { return 0; }

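// Compile-time illustration (a sketch added for exposition): the recursion
// above bottoms out at the CTLog2<1>() specialization.
static_assert(CTLog2<64>() == 6 && CTLog2<1>() == 0, "CTLog2 example");
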
/// Return the log base 2 of the specified value.
inline double Log2(double Value) {
#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
  return __builtin_log(Value) / __builtin_log(2.0);
#else
  return log2(Value);
#endif
}

/// Return the floor log base 2 of the specified value, -1 if the value is zero.
/// (32 bit edition.)
/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
inline unsigned Log2_32(uint32_t Value) {
  return 31 - countLeadingZeros(Value);
}

/// Return the floor log base 2 of the specified value, -1 if the value is zero.
/// (64 bit edition.)
inline unsigned Log2_64(uint64_t Value) {
  return 63 - countLeadingZeros(Value);
}

/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
/// (32 bit edition).
/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
inline unsigned Log2_32_Ceil(uint32_t Value) {
  return 32 - countLeadingZeros(Value - 1);
}

/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
/// (64 bit edition.)
inline unsigned Log2_64_Ceil(uint64_t Value) {
  return 64 - countLeadingZeros(Value - 1);
}

/// Return the greatest common divisor of the values using Euclid's algorithm.
template <typename T>
inline T greatestCommonDivisor(T A, T B) {
  while (B) {
    T Tmp = B;
    B = A % B;
    A = Tmp;
  }
  return A;
}

inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
  return greatestCommonDivisor<uint64_t>(A, B);
}

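// Illustrative trace (a sketch added for exposition): the remainder strictly
// decreases each iteration, so the loop terminates, e.g.
//   greatestCommonDivisor(48u, 18u):
//   (48, 18) -> (18, 12) -> (12, 6) -> (6, 0) -> returns 6.
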
/// This function takes a 64-bit integer and returns the bit equivalent double.
inline double BitsToDouble(uint64_t Bits) {
  double D;
  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
  memcpy(&D, &Bits, sizeof(Bits));
  return D;
}

/// This function takes a 32-bit integer and returns the bit equivalent float.
inline float BitsToFloat(uint32_t Bits) {
  float F;
  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
  memcpy(&F, &Bits, sizeof(Bits));
  return F;
}

/// This function takes a double and returns the bit equivalent 64-bit integer.
/// Note that copying doubles around changes the bits of NaNs on some hosts,
/// notably x86, so this routine cannot be used if these bits are needed.
inline uint64_t DoubleToBits(double Double) {
  uint64_t Bits;
  static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
  memcpy(&Bits, &Double, sizeof(Double));
  return Bits;
}

/// This function takes a float and returns the bit equivalent 32-bit integer.
/// Note that copying floats around changes the bits of NaNs on some hosts,
/// notably x86, so this routine cannot be used if these bits are needed.
inline uint32_t FloatToBits(float Float) {
  uint32_t Bits;
  static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
  memcpy(&Bits, &Float, sizeof(Float));
  return Bits;
}

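// Illustrative round-trips (a sketch added for exposition; exact whenever the
// bit pattern survives the copy, i.e. everything except the NaN-payload
// caveat noted above):
//   BitsToFloat(FloatToBits(1.0f)) == 1.0f
//   DoubleToBits(1.0) == 0x3FF0000000000000ULL
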
/// A and B are either alignments or offsets. Return the minimum alignment that
/// may be assumed after adding the two together.
constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
  // The largest power of 2 that divides both A and B.
  //
  // Replace "-Value" by "1+~Value" in the following commented code to avoid
  // MSVC warning C4146:
  //   return (A | B) & -(A | B);
  return (A | B) & (1 + ~(A | B));
}

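// Compile-time illustration (a sketch added for exposition): the result is
// the largest power of 2 dividing both arguments.
static_assert(MinAlign(8, 12) == 4, "MinAlign example");
static_assert(MinAlign(64, 0) == 64, "MinAlign with a zero offset");
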
/// Returns the next power of two (in 64-bits) that is strictly greater than A.
/// Returns zero on overflow.
inline uint64_t NextPowerOf2(uint64_t A) {
  A |= (A >> 1);
  A |= (A >> 2);
  A |= (A >> 4);
  A |= (A >> 8);
  A |= (A >> 16);
  A |= (A >> 32);
  return A + 1;
}

/// Returns the power of two which is less than or equal to the given value.
/// Essentially, it is a floor operation across the domain of powers of two.
inline uint64_t PowerOf2Floor(uint64_t A) {
  if (!A) return 0;
  return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
}

/// Returns the power of two which is greater than or equal to the given value.
/// Essentially, it is a ceil operation across the domain of powers of two.
inline uint64_t PowerOf2Ceil(uint64_t A) {
  if (!A)
    return 0;
  return NextPowerOf2(A - 1);
}

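// Illustration (a sketch added for exposition): the shift-or cascade above
// smears the leading one into every lower bit, so
//   NextPowerOf2(5) == 8,   NextPowerOf2(8) == 16 (strictly greater),
//   PowerOf2Floor(5) == 4,  PowerOf2Ceil(5) == 8,  PowerOf2Ceil(8) == 8.
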
/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
///
/// If non-zero \p Skew is specified, the return value will be a minimal
/// integer that is greater than or equal to \p Value and equal to
/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
///
/// Examples:
/// \code
///   alignTo(5, 8) = 8
///   alignTo(17, 8) = 24
///   alignTo(~0LL, 8) = 0
///   alignTo(321, 255) = 510
///
///   alignTo(5, 8, 7) = 7
///   alignTo(17, 8, 1) = 17
///   alignTo(~0LL, 8, 3) = 3
///   alignTo(321, 255, 42) = 552
/// \endcode
inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0u && "Align can't be 0.");
  Skew %= Align;
  return (Value + Align - 1 - Skew) / Align * Align + Skew;
}

/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
  static_assert(Align != 0u, "Align must be non-zero");
  return (Value + Align - 1) / Align * Align;
}

/// Returns the integer ceil(Numerator / Denominator).
inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
  return alignTo(Numerator, Denominator) / Denominator;
}

/// Returns the integer nearest(Numerator / Denominator).
inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
  return (Numerator + (Denominator / 2)) / Denominator;
}

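// Illustration (a sketch added for exposition):
//   divideCeil(7, 2) == 4 and divideCeil(8, 2) == 4, while
//   divideNearest(7, 3) == 2 (7/3 rounds down) and
//   divideNearest(8, 3) == 3 (8/3 rounds up).
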
/// Returns the largest uint64_t less than or equal to \p Value that is
/// \p Skew mod \p Align. \p Align must be non-zero.
inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0u && "Align can't be 0.");
  Skew %= Align;
  return (Value - Skew) / Align * Align + Skew;
}

/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
/// Requires 0 < B <= 32.
template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 32, "Bit width out of range.");
  return int32_t(X << (32 - B)) >> (32 - B);
}

/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
/// Requires 0 < B <= 32.
inline int32_t SignExtend32(uint32_t X, unsigned B) {
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 32 && "Bit width out of range.");
  return int32_t(X << (32 - B)) >> (32 - B);
}

/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
/// Requires 0 < B <= 64.
template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
  static_assert(B > 0, "Bit width can't be 0.");
  static_assert(B <= 64, "Bit width out of range.");
  return int64_t(x << (64 - B)) >> (64 - B);
}

/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
/// Requires 0 < B <= 64.
inline int64_t SignExtend64(uint64_t X, unsigned B) {
  assert(B > 0 && "Bit width can't be 0.");
  assert(B <= 64 && "Bit width out of range.");
  return int64_t(X << (64 - B)) >> (64 - B);
}

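// Compile-time illustration (a sketch added for exposition; the templated
// forms are constexpr):
static_assert(SignExtend32<4>(0xFu) == -1, "0b1111 sign-extends to -1");
static_assert(SignExtend32<4>(0x7u) == 7, "0b0111 stays positive");
static_assert(SignExtend64<32>(0xFFFFFFFFull) == -1, "SignExtend64 example");
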
/// Subtract two unsigned integers, X and Y, of type T and return the absolute
/// value of the result.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
  return X > Y ? (X - Y) : (Y - X);
}

/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
/// maximum representable value of T on overflow. ResultOverflowed indicates if
/// the result is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
  // Hacker's Delight, p. 29
  T Z = X + Y;
  Overflowed = (Z < X || Z < Y);
  if (Overflowed)
    return std::numeric_limits<T>::max();
  else
    return Z;
}

/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
/// maximum representable value of T on overflow. ResultOverflowed indicates if
/// the result is larger than the maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;

  // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
  // because it fails for uint16_t (where multiplication can have undefined
  // behavior due to promotion to int), and requires a division in addition
  // to the multiplication.

  Overflowed = false;

  // Log2(Z) would be either Log2Z or Log2Z + 1.
  // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
  // will necessarily be less than Log2Max as desired.
  int Log2Z = Log2_64(X) + Log2_64(Y);
  const T Max = std::numeric_limits<T>::max();
  int Log2Max = Log2_64(Max);
  if (Log2Z < Log2Max) {
    return X * Y;
  }
  if (Log2Z > Log2Max) {
    Overflowed = true;
    return Max;
  }

  // We're going to use the top bit, and maybe overflow one
  // bit past it. Multiply all but the bottom bit then add
  // that on at the end.
  T Z = (X >> 1) * Y;
  if (Z & ~(Max >> 1)) {
    Overflowed = true;
    return Max;
  }
  Z <<= 1;
  if (X & 1)
    return SaturatingAdd(Z, Y, ResultOverflowed);

  return Z;
}

/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
/// the product. Clamp the result to the maximum representable value of T on
/// overflow. ResultOverflowed indicates if the result is larger than the
/// maximum representable value of type T.
template <typename T>
std::enable_if_t<std::is_unsigned<T>::value, T>
SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
  bool Dummy;
  bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;

  T Product = SaturatingMultiply(X, Y, &Overflowed);
  if (Overflowed)
    return Product;

  return SaturatingAdd(A, Product, &Overflowed);
}

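// Illustration (a sketch added for exposition, using uint8_t so the limits
// are easy to see):
//   SaturatingAdd<uint8_t>(200, 100) == 255           (clamped; flag set)
//   SaturatingMultiply<uint8_t>(16, 16) == 255        (256 overflows)
//   SaturatingMultiplyAdd<uint8_t>(10, 10, 50) == 150 (no overflow)
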
/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
extern const float huge_valf;

/// Add two signed integers, computing the two's complement truncated result,
/// returning true if overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_add_overflow)
  return __builtin_add_overflow(X, Y, &Result);
#else
  // Perform the unsigned addition.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX + UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Adding two positive numbers should result in a positive number.
  if (X > 0 && Y > 0)
    return Result <= 0;
  // Adding two negatives should result in a negative number.
  if (X < 0 && Y < 0)
    return Result >= 0;
  return false;
#endif
}

/// Subtract two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
#if __has_builtin(__builtin_sub_overflow)
  return __builtin_sub_overflow(X, Y, &Result);
#else
  // Perform the unsigned subtraction.
  using U = std::make_unsigned_t<T>;
  const U UX = static_cast<U>(X);
  const U UY = static_cast<U>(Y);
  const U UResult = UX - UY;

  // Convert to signed.
  Result = static_cast<T>(UResult);

  // Subtracting a positive number from a non-positive one should stay
  // non-positive; a positive result means we overflowed.
  if (X <= 0 && Y > 0)
    return Result >= 0;
  // Likewise, subtracting a negative number from a non-negative one should
  // stay non-negative.
  if (X >= 0 && Y < 0)
    return Result <= 0;
  return false;
#endif
}

/// Multiply two signed integers, computing the two's complement truncated
/// result, returning true if an overflow occurred.
template <typename T>
std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
  // Perform the unsigned multiplication on absolute values.
  using U = std::make_unsigned_t<T>;
  const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
  const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
  const U UResult = UX * UY;

  // Convert to signed.
  const bool IsNegative = (X < 0) ^ (Y < 0);
  Result = IsNegative ? (0 - UResult) : UResult;

  // If any of the args was 0, result is 0 and no overflow occurs.
  if (UX == 0 || UY == 0)
    return false;

  // UX and UY are in [1, 2^n], where n is the number of digits.
  // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
  // positive) divided by an argument compares to the other.
  if (IsNegative)
    return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
  else
    return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
}

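// Illustration (a sketch added for exposition, using int8_t for compact
// limits; note these helpers report overflow via a nonzero return):
//   int8_t R;
//   AddOverflow<int8_t>(100, 100, R)  -> overflow (R holds the wrapped -56)
//   SubOverflow<int8_t>(-100, 100, R) -> overflow
//   MulOverflow<int8_t>(16, 8, R)     -> overflow (128 > INT8_MAX)
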
} // End llvm namespace

#endif