File: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Warning: line 1699, column 21: The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int'
1 | //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | ||||
9 | #include "AArch64TargetTransformInfo.h" | |||
10 | #include "AArch64ExpandImm.h" | |||
11 | #include "MCTargetDesc/AArch64AddressingModes.h" | |||
12 | #include "llvm/Analysis/LoopInfo.h" | |||
13 | #include "llvm/Analysis/TargetTransformInfo.h" | |||
14 | #include "llvm/CodeGen/BasicTTIImpl.h" | |||
15 | #include "llvm/CodeGen/CostTable.h" | |||
16 | #include "llvm/CodeGen/TargetLowering.h" | |||
17 | #include "llvm/IR/Intrinsics.h" | |||
18 | #include "llvm/IR/IntrinsicInst.h" | |||
19 | #include "llvm/IR/IntrinsicsAArch64.h" | |||
20 | #include "llvm/IR/PatternMatch.h" | |||
21 | #include "llvm/Support/Debug.h" | |||
22 | #include "llvm/Transforms/InstCombine/InstCombiner.h" | |||
23 | #include <algorithm> | |||
24 | using namespace llvm; | |||
25 | using namespace llvm::PatternMatch; | |||
26 | ||||
27 | #define DEBUG_TYPE "aarch64tti"
28 | ||||
29 | static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", | |||
30 | cl::init(true), cl::Hidden); | |||
31 | ||||
32 | bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, | |||
33 | const Function *Callee) const { | |||
34 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
35 | ||||
36 | const FeatureBitset &CallerBits = | |||
37 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | |||
38 | const FeatureBitset &CalleeBits = | |||
39 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | |||
40 | ||||
41 | // Inline a callee if its target-features are a subset of the caller's
42 | // target-features.
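// e.g., Caller = {neon, sve} and Callee = {neon} satisfies
// (CallerBits & CalleeBits) == CalleeBits; a callee that additionally
// required sve2 would not. (Feature names here are illustrative.)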
43 | return (CallerBits & CalleeBits) == CalleeBits; | |||
44 | } | |||
45 | ||||
46 | /// Calculate the cost of materializing a 64-bit value. This helper | |||
47 | /// method might only calculate a fraction of a larger immediate. Therefore it | |||
48 | /// is valid to return a cost of ZERO. | |||
49 | InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { | |||
50 | // Check if the immediate can be encoded within an instruction. | |||
51 | if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) | |||
52 | return 0; | |||
53 | ||||
54 | if (Val < 0) | |||
55 | Val = ~Val; | |||
56 | ||||
57 | // Calculate how many moves we will need to materialize this constant. | |||
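// e.g., a 64-bit value with four distinct non-zero 16-bit chunks typically
// expands to one MOVZ plus three MOVKs, for a cost of 4.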
58 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; | |||
59 | AArch64_IMM::expandMOVImm(Val, 64, Insn); | |||
60 | return Insn.size(); | |||
61 | } | |||
62 | ||||
63 | /// Calculate the cost of materializing the given constant. | |||
64 | InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | |||
65 | TTI::TargetCostKind CostKind) { | |||
66 | assert(Ty->isIntegerTy());
67 | ||||
68 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
69 | if (BitSize == 0) | |||
70 | return ~0U; | |||
71 | ||||
72 | // Sign-extend all constants to a multiple of 64-bit. | |||
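// e.g., BitSize == 65 is widened to (65 + 63) & ~0x3f == 128 bits.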
73 | APInt ImmVal = Imm; | |||
74 | if (BitSize & 0x3f) | |||
75 | ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); | |||
76 | ||||
77 | // Split the constant into 64-bit chunks and calculate the cost for each | |||
78 | // chunk. | |||
79 | InstructionCost Cost = 0; | |||
80 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { | |||
81 | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); | |||
82 | int64_t Val = Tmp.getSExtValue(); | |||
83 | Cost += getIntImmCost(Val); | |||
84 | } | |||
85 | // We need at least one instruction to materialize the constant.
86 | return std::max<InstructionCost>(1, Cost); | |||
87 | } | |||
88 | ||||
89 | InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, | |||
90 | const APInt &Imm, Type *Ty, | |||
91 | TTI::TargetCostKind CostKind, | |||
92 | Instruction *Inst) { | |||
93 | assert(Ty->isIntegerTy());
94 | ||||
95 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
96 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
97 | // here, so that constant hoisting will ignore this constant. | |||
98 | if (BitSize == 0) | |||
99 | return TTI::TCC_Free; | |||
100 | ||||
101 | unsigned ImmIdx = ~0U; | |||
102 | switch (Opcode) { | |||
103 | default: | |||
104 | return TTI::TCC_Free; | |||
105 | case Instruction::GetElementPtr: | |||
106 | // Always hoist the base address of a GetElementPtr. | |||
107 | if (Idx == 0) | |||
108 | return 2 * TTI::TCC_Basic; | |||
109 | return TTI::TCC_Free; | |||
110 | case Instruction::Store: | |||
111 | ImmIdx = 0; | |||
112 | break; | |||
113 | case Instruction::Add: | |||
114 | case Instruction::Sub: | |||
115 | case Instruction::Mul: | |||
116 | case Instruction::UDiv: | |||
117 | case Instruction::SDiv: | |||
118 | case Instruction::URem: | |||
119 | case Instruction::SRem: | |||
120 | case Instruction::And: | |||
121 | case Instruction::Or: | |||
122 | case Instruction::Xor: | |||
123 | case Instruction::ICmp: | |||
124 | ImmIdx = 1; | |||
125 | break; | |||
126 | // Always return TCC_Free for the shift value of a shift instruction. | |||
127 | case Instruction::Shl: | |||
128 | case Instruction::LShr: | |||
129 | case Instruction::AShr: | |||
130 | if (Idx == 1) | |||
131 | return TTI::TCC_Free; | |||
132 | break; | |||
133 | case Instruction::Trunc: | |||
134 | case Instruction::ZExt: | |||
135 | case Instruction::SExt: | |||
136 | case Instruction::IntToPtr: | |||
137 | case Instruction::PtrToInt: | |||
138 | case Instruction::BitCast: | |||
139 | case Instruction::PHI: | |||
140 | case Instruction::Call: | |||
141 | case Instruction::Select: | |||
142 | case Instruction::Ret: | |||
143 | case Instruction::Load: | |||
144 | break; | |||
145 | } | |||
146 | ||||
147 | if (Idx == ImmIdx) { | |||
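// If materializing the immediate costs no more than one basic op per
// 64-bit chunk, report it as free so constant hoisting leaves it in place.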
148 | int NumConstants = (BitSize + 63) / 64; | |||
149 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
150 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
151 | ? static_cast<int>(TTI::TCC_Free) | |||
152 | : Cost; | |||
153 | } | |||
154 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
155 | } | |||
156 | ||||
157 | InstructionCost | |||
158 | AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, | |||
159 | const APInt &Imm, Type *Ty, | |||
160 | TTI::TargetCostKind CostKind) { | |||
161 | assert(Ty->isIntegerTy());
162 | ||||
163 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
164 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
165 | // here, so that constant hoisting will ignore this constant. | |||
166 | if (BitSize == 0) | |||
167 | return TTI::TCC_Free; | |||
168 | ||||
169 | // Most (all?) AArch64 intrinsics do not support folding immediates into the | |||
170 | // selected instruction, so we compute the materialization cost for the | |||
171 | // immediate directly. | |||
172 | if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) | |||
173 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
174 | ||||
175 | switch (IID) { | |||
176 | default: | |||
177 | return TTI::TCC_Free; | |||
178 | case Intrinsic::sadd_with_overflow: | |||
179 | case Intrinsic::uadd_with_overflow: | |||
180 | case Intrinsic::ssub_with_overflow: | |||
181 | case Intrinsic::usub_with_overflow: | |||
182 | case Intrinsic::smul_with_overflow: | |||
183 | case Intrinsic::umul_with_overflow: | |||
184 | if (Idx == 1) { | |||
185 | int NumConstants = (BitSize + 63) / 64; | |||
186 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
187 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
188 | ? static_cast<int>(TTI::TCC_Free) | |||
189 | : Cost; | |||
190 | } | |||
191 | break; | |||
192 | case Intrinsic::experimental_stackmap: | |||
193 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
194 | return TTI::TCC_Free; | |||
195 | break; | |||
196 | case Intrinsic::experimental_patchpoint_void: | |||
197 | case Intrinsic::experimental_patchpoint_i64: | |||
198 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
199 | return TTI::TCC_Free; | |||
200 | break; | |||
201 | case Intrinsic::experimental_gc_statepoint: | |||
202 | if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
203 | return TTI::TCC_Free; | |||
204 | break; | |||
205 | } | |||
206 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
207 | } | |||
208 | ||||
209 | TargetTransformInfo::PopcntSupportKind | |||
210 | AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { | |||
211 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
212 | if (TyWidth == 32 || TyWidth == 64) | |||
213 | return TTI::PSK_FastHardware; | |||
214 | // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. | |||
215 | return TTI::PSK_Software; | |||
216 | } | |||
217 | ||||
218 | InstructionCost | |||
219 | AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |||
220 | TTI::TargetCostKind CostKind) { | |||
221 | auto *RetTy = ICA.getReturnType(); | |||
222 | switch (ICA.getID()) { | |||
223 | case Intrinsic::umin: | |||
224 | case Intrinsic::umax: | |||
225 | case Intrinsic::smin: | |||
226 | case Intrinsic::smax: { | |||
227 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
228 | MVT::v8i16, MVT::v2i32, MVT::v4i32}; | |||
229 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
230 | // v2i64 types get converted to cmp+bif hence the cost of 2 | |||
231 | if (LT.second == MVT::v2i64) | |||
232 | return LT.first * 2; | |||
233 | if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
234 | return LT.first; | |||
235 | break; | |||
236 | } | |||
237 | case Intrinsic::sadd_sat: | |||
238 | case Intrinsic::ssub_sat: | |||
239 | case Intrinsic::uadd_sat: | |||
240 | case Intrinsic::usub_sat: { | |||
241 | static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
242 | MVT::v8i16, MVT::v2i32, MVT::v4i32, | |||
243 | MVT::v2i64}; | |||
244 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
245 | // This is a base cost of 1 for the vadd, plus 3 extract shifts if we | |||
246 | // need to extend the type, as it uses shr(qadd(shl, shl)). | |||
247 | unsigned Instrs = | |||
248 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; | |||
249 | if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
250 | return LT.first * Instrs; | |||
251 | break; | |||
252 | } | |||
253 | case Intrinsic::abs: { | |||
254 | static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
255 | MVT::v8i16, MVT::v2i32, MVT::v4i32, | |||
256 | MVT::v2i64}; | |||
257 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
258 | if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
259 | return LT.first; | |||
260 | break; | |||
261 | } | |||
262 | case Intrinsic::experimental_stepvector: { | |||
263 | InstructionCost Cost = 1; // Cost of the `index' instruction | |||
264 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
265 | // Legalisation of illegal vectors involves an `index' instruction plus | |||
266 | // (LT.first - 1) vector adds. | |||
267 | if (LT.first > 1) { | |||
268 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); | |||
269 | InstructionCost AddCost = | |||
270 | getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); | |||
271 | Cost += AddCost * (LT.first - 1); | |||
272 | } | |||
273 | return Cost; | |||
274 | } | |||
275 | case Intrinsic::bitreverse: { | |||
276 | static const CostTblEntry BitreverseTbl[] = { | |||
277 | {Intrinsic::bitreverse, MVT::i32, 1}, | |||
278 | {Intrinsic::bitreverse, MVT::i64, 1}, | |||
279 | {Intrinsic::bitreverse, MVT::v8i8, 1}, | |||
280 | {Intrinsic::bitreverse, MVT::v16i8, 1}, | |||
281 | {Intrinsic::bitreverse, MVT::v4i16, 2}, | |||
282 | {Intrinsic::bitreverse, MVT::v8i16, 2}, | |||
283 | {Intrinsic::bitreverse, MVT::v2i32, 2}, | |||
284 | {Intrinsic::bitreverse, MVT::v4i32, 2}, | |||
285 | {Intrinsic::bitreverse, MVT::v1i64, 2}, | |||
286 | {Intrinsic::bitreverse, MVT::v2i64, 2}, | |||
287 | }; | |||
288 | const auto LegalisationCost = TLI->getTypeLegalizationCost(DL, RetTy); | |||
289 | const auto *Entry = | |||
290 | CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); | |||
291 | // The cost model uses the legal type (i32) that i8 and i16 are promoted
292 | // to, so add +1 to match the actual lowering cost.
293 | if (Entry && (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
294 | TLI->getValueType(DL, RetTy, true) == MVT::i16))
295 | return LegalisationCost.first * Entry->Cost + 1;
296 | if (Entry)
297 | return LegalisationCost.first * Entry->Cost;
298 | break; | |||
299 | } | |||
300 | case Intrinsic::ctpop: { | |||
301 | static const CostTblEntry CtpopCostTbl[] = { | |||
302 | {ISD::CTPOP, MVT::v2i64, 4}, | |||
303 | {ISD::CTPOP, MVT::v4i32, 3}, | |||
304 | {ISD::CTPOP, MVT::v8i16, 2}, | |||
305 | {ISD::CTPOP, MVT::v16i8, 1}, | |||
306 | {ISD::CTPOP, MVT::i64, 4}, | |||
307 | {ISD::CTPOP, MVT::v2i32, 3}, | |||
308 | {ISD::CTPOP, MVT::v4i16, 2}, | |||
309 | {ISD::CTPOP, MVT::v8i8, 1}, | |||
310 | {ISD::CTPOP, MVT::i32, 5}, | |||
311 | }; | |||
312 | auto LT = TLI->getTypeLegalizationCost(DL, RetTy); | |||
313 | MVT MTy = LT.second; | |||
314 | if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { | |||
315 | // Extra cost of +1 when illegal vector types are legalized by promoting | |||
316 | // the integer type. | |||
317 | int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != | |||
318 | RetTy->getScalarSizeInBits() | |||
319 | ? 1 | |||
320 | : 0; | |||
321 | return LT.first * Entry->Cost + ExtraCost; | |||
322 | } | |||
323 | break; | |||
324 | } | |||
325 | default: | |||
326 | break; | |||
327 | } | |||
328 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
329 | } | |||
330 | ||||
331 | /// The function will remove redundant reinterprets casting in the presence | |||
332 | /// of the control flow | |||
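/// e.g., a convert.from.svbool whose operand is a phi of
/// convert.to.svbool(%a) / convert.to.svbool(%b) is rewritten as a phi of
/// %a / %b in the original predicate type.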
333 | static Optional<Instruction *> processPhiNode(InstCombiner &IC, | |||
334 | IntrinsicInst &II) { | |||
335 | SmallVector<Instruction *, 32> Worklist; | |||
336 | auto RequiredType = II.getType(); | |||
337 | ||||
338 | auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); | |||
339 | assert(PN && "Expected Phi Node!")(static_cast <bool> (PN && "Expected Phi Node!" ) ? void (0) : __assert_fail ("PN && \"Expected Phi Node!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp" , 339, __extension__ __PRETTY_FUNCTION__)); | |||
340 | ||||
341 | // Don't create a new Phi unless we can remove the old one. | |||
342 | if (!PN->hasOneUse()) | |||
343 | return None; | |||
344 | ||||
345 | for (Value *IncValPhi : PN->incoming_values()) { | |||
346 | auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); | |||
347 | if (!Reinterpret || | |||
348 | Reinterpret->getIntrinsicID() != | |||
349 | Intrinsic::aarch64_sve_convert_to_svbool || | |||
350 | RequiredType != Reinterpret->getArgOperand(0)->getType()) | |||
351 | return None; | |||
352 | } | |||
353 | ||||
354 | // Create the new Phi | |||
355 | LLVMContext &Ctx = PN->getContext(); | |||
356 | IRBuilder<> Builder(Ctx); | |||
357 | Builder.SetInsertPoint(PN); | |||
358 | PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); | |||
359 | Worklist.push_back(PN); | |||
360 | ||||
361 | for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { | |||
362 | auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); | |||
363 | NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); | |||
364 | Worklist.push_back(Reinterpret); | |||
365 | } | |||
366 | ||||
367 | // Cleanup Phi Node and reinterprets | |||
368 | return IC.replaceInstUsesWith(II, NPN); | |||
369 | } | |||
370 | ||||
371 | static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC, | |||
372 | IntrinsicInst &II) { | |||
373 | // If the reinterpret instruction operand is a PHI Node | |||
374 | if (isa<PHINode>(II.getArgOperand(0))) | |||
375 | return processPhiNode(IC, II); | |||
376 | ||||
377 | SmallVector<Instruction *, 32> CandidatesForRemoval; | |||
378 | Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; | |||
379 | ||||
380 | const auto *IVTy = cast<VectorType>(II.getType()); | |||
381 | ||||
382 | // Walk the chain of conversions. | |||
383 | while (Cursor) { | |||
384 | // If the type of the cursor has fewer lanes than the final result, zeroing | |||
385 | // must take place, which breaks the equivalence chain. | |||
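// e.g., if II produces nxv8i1 but the chain passes through nxv4i1, lanes
// 4-7 were zeroed on the widening back to svbool, so earlier values are
// not equivalent.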
386 | const auto *CursorVTy = cast<VectorType>(Cursor->getType()); | |||
387 | if (CursorVTy->getElementCount().getKnownMinValue() < | |||
388 | IVTy->getElementCount().getKnownMinValue()) | |||
389 | break; | |||
390 | ||||
391 | // If the cursor has the same type as I, it is a viable replacement. | |||
392 | if (Cursor->getType() == IVTy) | |||
393 | EarliestReplacement = Cursor; | |||
394 | ||||
395 | auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); | |||
396 | ||||
397 | // If this is not an SVE conversion intrinsic, this is the end of the chain. | |||
398 | if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == | |||
399 | Intrinsic::aarch64_sve_convert_to_svbool || | |||
400 | IntrinsicCursor->getIntrinsicID() == | |||
401 | Intrinsic::aarch64_sve_convert_from_svbool)) | |||
402 | break; | |||
403 | ||||
404 | CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); | |||
405 | Cursor = IntrinsicCursor->getOperand(0); | |||
406 | } | |||
407 | ||||
408 | // If no viable replacement in the conversion chain was found, there is | |||
409 | // nothing to do. | |||
410 | if (!EarliestReplacement) | |||
411 | return None; | |||
412 | ||||
413 | return IC.replaceInstUsesWith(II, EarliestReplacement); | |||
414 | } | |||
415 | ||||
416 | static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC, | |||
417 | IntrinsicInst &II) { | |||
418 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); | |||
419 | if (!Pg) | |||
420 | return None; | |||
421 | ||||
422 | if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
423 | return None; | |||
424 | ||||
425 | const auto PTruePattern = | |||
426 | cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); | |||
427 | if (PTruePattern != AArch64SVEPredPattern::vl1) | |||
428 | return None; | |||
429 | ||||
430 | // The intrinsic is inserting into lane zero so use an insert instead. | |||
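// i.e., sve.dup(%vec, ptrue(vl1), %scalar) --> insertelement %vec, %scalar, 0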
431 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
432 | auto *Insert = InsertElementInst::Create( | |||
433 | II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); | |||
434 | Insert->insertBefore(&II); | |||
435 | Insert->takeName(&II); | |||
436 | ||||
437 | return IC.replaceInstUsesWith(II, Insert); | |||
438 | } | |||
439 | ||||
440 | static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, | |||
441 | IntrinsicInst &II) { | |||
442 | LLVMContext &Ctx = II.getContext(); | |||
443 | IRBuilder<> Builder(Ctx); | |||
444 | Builder.SetInsertPoint(&II); | |||
445 | ||||
446 | // Check that the predicate is all active | |||
447 | auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); | |||
448 | if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
449 | return None; | |||
450 | ||||
451 | const auto PTruePattern = | |||
452 | cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); | |||
453 | if (PTruePattern != AArch64SVEPredPattern::all) | |||
454 | return None; | |||
455 | ||||
456 | // Check that we have a compare of zero.. | |||
457 | auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2)); | |||
458 | if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x) | |||
459 | return None; | |||
460 | ||||
461 | auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0)); | |||
462 | if (!DupXArg || !DupXArg->isZero()) | |||
463 | return None; | |||
464 | ||||
465 | // ..against a dupq | |||
466 | auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); | |||
467 | if (!DupQLane || | |||
468 | DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) | |||
469 | return None; | |||
470 | ||||
471 | // Where the dupq is a lane 0 replicate of a vector insert | |||
472 | if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) | |||
473 | return None; | |||
474 | ||||
475 | auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); | |||
476 | if (!VecIns || | |||
477 | VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert) | |||
478 | return None; | |||
479 | ||||
480 | // Where the vector insert is a fixed constant vector insert into undef at | |||
481 | // index zero | |||
482 | if (!isa<UndefValue>(VecIns->getArgOperand(0))) | |||
483 | return None; | |||
484 | ||||
485 | if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) | |||
486 | return None; | |||
487 | ||||
488 | auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); | |||
489 | if (!ConstVec) | |||
490 | return None; | |||
491 | ||||
492 | auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); | |||
493 | auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); | |||
494 | if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) | |||
495 | return None; | |||
496 | ||||
497 | unsigned NumElts = VecTy->getNumElements(); | |||
498 | unsigned PredicateBits = 0; | |||
499 | ||||
500 | // Expand intrinsic operands to a 16-bit byte level predicate | |||
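// e.g., with NumElts == 4, element I maps to every fourth bit, so a
// non-zero element sets bit (I * 4).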
501 | for (unsigned I = 0; I < NumElts; ++I) { | |||
502 | auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); | |||
503 | if (!Arg) | |||
504 | return None; | |||
505 | if (!Arg->isZero()) | |||
506 | PredicateBits |= 1 << (I * (16 / NumElts)); | |||
507 | } | |||
508 | ||||
509 | // If all bits are zero bail early with an empty predicate | |||
510 | if (PredicateBits == 0) { | |||
511 | auto *PFalse = Constant::getNullValue(II.getType()); | |||
512 | PFalse->takeName(&II); | |||
513 | return IC.replaceInstUsesWith(II, PFalse); | |||
514 | } | |||
515 | ||||
516 | // Calculate largest predicate type used (where byte predicate is largest) | |||
517 | unsigned Mask = 8; | |||
518 | for (unsigned I = 0; I < 16; ++I) | |||
519 | if ((PredicateBits & (1 << I)) != 0) | |||
520 | Mask |= (I % 8); | |||
521 | ||||
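// Mask & -Mask isolates the lowest set bit of Mask, i.e. the smallest byte
// stride between active predicate bits, which selects the coarsest legal
// predicate element size (1, 2, 4 or 8 bytes).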
522 | unsigned PredSize = Mask & -Mask; | |||
523 | auto *PredType = ScalableVectorType::get( | |||
524 | Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); | |||
525 | ||||
526 | // Ensure all relevant bits are set | |||
527 | for (unsigned I = 0; I < 16; I += PredSize) | |||
528 | if ((PredicateBits & (1 << I)) == 0) | |||
529 | return None; | |||
530 | ||||
531 | auto *PTruePat = | |||
532 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); | |||
533 | auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, | |||
534 | {PredType}, {PTruePat}); | |||
535 | auto *ConvertToSVBool = Builder.CreateIntrinsic( | |||
536 | Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); | |||
537 | auto *ConvertFromSVBool = | |||
538 | Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, | |||
539 | {II.getType()}, {ConvertToSVBool}); | |||
540 | ||||
541 | ConvertFromSVBool->takeName(&II); | |||
542 | return IC.replaceInstUsesWith(II, ConvertFromSVBool); | |||
543 | } | |||
544 | ||||
545 | static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, | |||
546 | IntrinsicInst &II) { | |||
547 | IRBuilder<> Builder(II.getContext()); | |||
548 | Builder.SetInsertPoint(&II); | |||
549 | Value *Pg = II.getArgOperand(0); | |||
550 | Value *Vec = II.getArgOperand(1); | |||
551 | auto IntrinsicID = II.getIntrinsicID(); | |||
552 | bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; | |||
553 | ||||
554 | // lastX(splat(X)) --> X | |||
555 | if (auto *SplatVal = getSplatValue(Vec)) | |||
556 | return IC.replaceInstUsesWith(II, SplatVal); | |||
557 | ||||
558 | // If x and/or y is a splat value then: | |||
559 | // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) | |||
560 | Value *LHS, *RHS; | |||
561 | if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { | |||
562 | if (isSplatValue(LHS) || isSplatValue(RHS)) { | |||
563 | auto *OldBinOp = cast<BinaryOperator>(Vec); | |||
564 | auto OpC = OldBinOp->getOpcode(); | |||
565 | auto *NewLHS = | |||
566 | Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); | |||
567 | auto *NewRHS = | |||
568 | Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); | |||
569 | auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( | |||
570 | OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); | |||
571 | return IC.replaceInstUsesWith(II, NewBinOp); | |||
572 | } | |||
573 | } | |||
574 | ||||
575 | auto *C = dyn_cast<Constant>(Pg); | |||
576 | if (IsAfter && C && C->isNullValue()) { | |||
577 | // The intrinsic is extracting lane 0 so use an extract instead. | |||
578 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
579 | auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); | |||
580 | Extract->insertBefore(&II); | |||
581 | Extract->takeName(&II); | |||
582 | return IC.replaceInstUsesWith(II, Extract); | |||
583 | } | |||
584 | ||||
585 | auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); | |||
586 | if (!IntrPG) | |||
587 | return None; | |||
588 | ||||
589 | if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
590 | return None; | |||
591 | ||||
592 | const auto PTruePattern = | |||
593 | cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); | |||
594 | ||||
595 | // Can the intrinsic's predicate be converted to a known constant index? | |||
596 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); | |||
597 | if (!MinNumElts) | |||
598 | return None; | |||
599 | ||||
600 | unsigned Idx = MinNumElts - 1; | |||
601 | // Increment the index if extracting the element after the last active | |||
602 | // predicate element. | |||
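// e.g., lastb with ptrue(vl4) reads lane 3, while lasta reads lane 4.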
603 | if (IsAfter) | |||
604 | ++Idx; | |||
605 | ||||
606 | // Ignore extracts whose index is larger than the known minimum vector | |||
607 | // length. NOTE: This is an artificial constraint where we prefer to | |||
608 | // maintain what the user asked for until an alternative is proven faster. | |||
609 | auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); | |||
610 | if (Idx >= PgVTy->getMinNumElements()) | |||
611 | return None; | |||
612 | ||||
613 | // The intrinsic is extracting a fixed lane so use an extract instead. | |||
614 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
615 | auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); | |||
616 | Extract->insertBefore(&II); | |||
617 | Extract->takeName(&II); | |||
618 | return IC.replaceInstUsesWith(II, Extract); | |||
619 | } | |||
620 | ||||
621 | static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, | |||
622 | IntrinsicInst &II) { | |||
623 | LLVMContext &Ctx = II.getContext(); | |||
624 | IRBuilder<> Builder(Ctx); | |||
625 | Builder.SetInsertPoint(&II); | |||
626 | // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr | |||
627 | // can work with RDFFR_PP for ptest elimination. | |||
628 | auto *AllPat = | |||
629 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); | |||
630 | auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, | |||
631 | {II.getType()}, {AllPat}); | |||
632 | auto *RDFFR = | |||
633 | Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); | |||
634 | RDFFR->takeName(&II); | |||
635 | return IC.replaceInstUsesWith(II, RDFFR); | |||
636 | } | |||
637 | ||||
638 | static Optional<Instruction *> | |||
639 | instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { | |||
640 | const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); | |||
641 | ||||
642 | if (Pattern == AArch64SVEPredPattern::all) { | |||
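// e.g., cntd(all) returns vscale * 2, the number of 64-bit elements in a
// scalable vector.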
643 | LLVMContext &Ctx = II.getContext(); | |||
644 | IRBuilder<> Builder(Ctx); | |||
645 | Builder.SetInsertPoint(&II); | |||
646 | ||||
647 | Constant *StepVal = ConstantInt::get(II.getType(), NumElts); | |||
648 | auto *VScale = Builder.CreateVScale(StepVal); | |||
649 | VScale->takeName(&II); | |||
650 | return IC.replaceInstUsesWith(II, VScale); | |||
651 | } | |||
652 | ||||
653 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); | |||
654 | ||||
655 | return MinNumElts && NumElts >= MinNumElts | |||
656 | ? Optional<Instruction *>(IC.replaceInstUsesWith( | |||
657 | II, ConstantInt::get(II.getType(), MinNumElts))) | |||
658 | : None; | |||
659 | } | |||
660 | ||||
661 | static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, | |||
662 | IntrinsicInst &II) { | |||
663 | IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); | |||
664 | IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); | |||
665 | ||||
666 | if (Op1 && Op2 && | |||
667 | Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && | |||
668 | Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && | |||
669 | Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) { | |||
670 | ||||
671 | IRBuilder<> Builder(II.getContext()); | |||
672 | Builder.SetInsertPoint(&II); | |||
673 | ||||
674 | Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)}; | |||
675 | Type *Tys[] = {Op1->getArgOperand(0)->getType()}; | |||
676 | ||||
677 | auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); | |||
678 | ||||
679 | PTest->takeName(&II); | |||
680 | return IC.replaceInstUsesWith(II, PTest); | |||
681 | } | |||
682 | ||||
683 | return None; | |||
684 | } | |||
685 | ||||
686 | static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, | |||
687 | IntrinsicInst &II) { | |||
688 | auto *OpPredicate = II.getOperand(0); | |||
689 | auto *OpMultiplicand = II.getOperand(1); | |||
690 | auto *OpMultiplier = II.getOperand(2); | |||
691 | ||||
692 | IRBuilder<> Builder(II.getContext()); | |||
693 | Builder.SetInsertPoint(&II); | |||
694 | ||||
695 | // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call | |||
696 | // with a unit splat value, false otherwise. | |||
697 | auto IsUnitDupX = [](auto *I) { | |||
698 | auto *IntrI = dyn_cast<IntrinsicInst>(I); | |||
699 | if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x) | |||
700 | return false; | |||
701 | ||||
702 | auto *SplatValue = IntrI->getOperand(0); | |||
703 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); | |||
704 | }; | |||
705 | ||||
706 | // Return true if a given instruction is an aarch64_sve_dup intrinsic call | |||
707 | // with a unit splat value, false otherwise. | |||
708 | auto IsUnitDup = [](auto *I) { | |||
709 | auto *IntrI = dyn_cast<IntrinsicInst>(I); | |||
710 | if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) | |||
711 | return false; | |||
712 | ||||
713 | auto *SplatValue = IntrI->getOperand(2); | |||
714 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); | |||
715 | }; | |||
716 | ||||
717 | // The OpMultiplier variable should always point to the dup (if any), so | |||
718 | // swap if necessary. | |||
719 | if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand)) | |||
720 | std::swap(OpMultiplier, OpMultiplicand); | |||
721 | ||||
722 | if (IsUnitDupX(OpMultiplier)) { | |||
723 | // [f]mul pg (dupx 1) %n => %n | |||
724 | OpMultiplicand->takeName(&II); | |||
725 | return IC.replaceInstUsesWith(II, OpMultiplicand); | |||
726 | } else if (IsUnitDup(OpMultiplier)) { | |||
727 | // [f]mul pg (dup pg 1) %n => %n | |||
728 | auto *DupInst = cast<IntrinsicInst>(OpMultiplier); | |||
729 | auto *DupPg = DupInst->getOperand(1); | |||
730 | // TODO: this is naive. The optimization is still valid if DupPg | |||
731 | // 'encompasses' OpPredicate, not only if they're the same predicate. | |||
732 | if (OpPredicate == DupPg) { | |||
733 | OpMultiplicand->takeName(&II); | |||
734 | return IC.replaceInstUsesWith(II, OpMultiplicand); | |||
735 | } | |||
736 | } | |||
737 | ||||
738 | return None; | |||
739 | } | |||
740 | ||||
741 | static Optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, | |||
742 | IntrinsicInst &II) { | |||
743 | IRBuilder<> Builder(II.getContext()); | |||
744 | Builder.SetInsertPoint(&II); | |||
745 | Value *UnpackArg = II.getArgOperand(0); | |||
746 | auto *RetTy = cast<ScalableVectorType>(II.getType()); | |||
747 | bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || | |||
748 | II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; | |||
749 | ||||
750 | // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) | |||
751 | // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) | |||
752 | if (auto *ScalarArg = getSplatValue(UnpackArg)) { | |||
753 | ScalarArg = | |||
754 | Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); | |||
755 | Value *NewVal = | |||
756 | Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); | |||
757 | NewVal->takeName(&II); | |||
758 | return IC.replaceInstUsesWith(II, NewVal); | |||
759 | } | |||
760 | ||||
761 | return None; | |||
762 | } | |||
763 | static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC, | |||
764 | IntrinsicInst &II) { | |||
765 | auto *OpVal = II.getOperand(0); | |||
766 | auto *OpIndices = II.getOperand(1); | |||
767 | VectorType *VTy = cast<VectorType>(II.getType()); | |||
768 | ||||
769 | // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with | |||
770 | // constant splat value < minimal element count of result. | |||
771 | auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices); | |||
772 | if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x) | |||
773 | return None; | |||
774 | ||||
775 | auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0)); | |||
776 | if (!SplatValue || | |||
777 | SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) | |||
778 | return None; | |||
779 | ||||
780 | // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to | |||
781 | // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. | |||
782 | IRBuilder<> Builder(II.getContext()); | |||
783 | Builder.SetInsertPoint(&II); | |||
784 | auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); | |||
785 | auto *VectorSplat = | |||
786 | Builder.CreateVectorSplat(VTy->getElementCount(), Extract); | |||
787 | ||||
788 | VectorSplat->takeName(&II); | |||
789 | return IC.replaceInstUsesWith(II, VectorSplat); | |||
790 | } | |||
791 | ||||
792 | Optional<Instruction *> | |||
793 | AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, | |||
794 | IntrinsicInst &II) const { | |||
795 | Intrinsic::ID IID = II.getIntrinsicID(); | |||
796 | switch (IID) { | |||
797 | default: | |||
798 | break; | |||
799 | case Intrinsic::aarch64_sve_convert_from_svbool: | |||
800 | return instCombineConvertFromSVBool(IC, II); | |||
801 | case Intrinsic::aarch64_sve_dup: | |||
802 | return instCombineSVEDup(IC, II); | |||
803 | case Intrinsic::aarch64_sve_cmpne: | |||
804 | case Intrinsic::aarch64_sve_cmpne_wide: | |||
805 | return instCombineSVECmpNE(IC, II); | |||
806 | case Intrinsic::aarch64_sve_rdffr: | |||
807 | return instCombineRDFFR(IC, II); | |||
808 | case Intrinsic::aarch64_sve_lasta: | |||
809 | case Intrinsic::aarch64_sve_lastb: | |||
810 | return instCombineSVELast(IC, II); | |||
811 | case Intrinsic::aarch64_sve_cntd: | |||
812 | return instCombineSVECntElts(IC, II, 2); | |||
813 | case Intrinsic::aarch64_sve_cntw: | |||
814 | return instCombineSVECntElts(IC, II, 4); | |||
815 | case Intrinsic::aarch64_sve_cnth: | |||
816 | return instCombineSVECntElts(IC, II, 8); | |||
817 | case Intrinsic::aarch64_sve_cntb: | |||
818 | return instCombineSVECntElts(IC, II, 16); | |||
819 | case Intrinsic::aarch64_sve_ptest_any: | |||
820 | case Intrinsic::aarch64_sve_ptest_first: | |||
821 | case Intrinsic::aarch64_sve_ptest_last: | |||
822 | return instCombineSVEPTest(IC, II); | |||
823 | case Intrinsic::aarch64_sve_mul: | |||
824 | case Intrinsic::aarch64_sve_fmul: | |||
825 | return instCombineSVEVectorMul(IC, II); | |||
826 | case Intrinsic::aarch64_sve_tbl: | |||
827 | return instCombineSVETBL(IC, II); | |||
828 | case Intrinsic::aarch64_sve_uunpkhi: | |||
829 | case Intrinsic::aarch64_sve_uunpklo: | |||
830 | case Intrinsic::aarch64_sve_sunpkhi: | |||
831 | case Intrinsic::aarch64_sve_sunpklo: | |||
832 | return instCombineSVEUnpack(IC, II); | |||
833 | } | |||
834 | ||||
835 | return None; | |||
836 | } | |||
837 | ||||
838 | bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, | |||
839 | ArrayRef<const Value *> Args) { | |||
840 | ||||
841 | // A helper that returns a vector type whose scalar type comes from the
842 | // given type and whose element count is taken from DstTy.
843 | auto toVectorTy = [&](Type *ArgTy) { | |||
844 | return VectorType::get(ArgTy->getScalarType(), | |||
845 | cast<VectorType>(DstTy)->getElementCount()); | |||
846 | }; | |||
847 | ||||
848 | // Exit early if DstTy is not a vector type whose elements are at least | |||
849 | // 16-bits wide. | |||
850 | if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) | |||
851 | return false; | |||
852 | ||||
853 | // Determine if the operation has a widening variant. We consider both the | |||
854 | // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the | |||
855 | // instructions. | |||
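// e.g., add(%a, zext <8 x i8> %b to <8 x i16>) can lower to UADDW, and
// add(zext %a, zext %b) to UADDL, eliminating the extends.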
856 | // | |||
857 | // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we | |||
858 | // verify that their extending operands are eliminated during code | |||
859 | // generation. | |||
860 | switch (Opcode) { | |||
861 | case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). | |||
862 | case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). | |||
863 | break; | |||
864 | default: | |||
865 | return false; | |||
866 | } | |||
867 | ||||
868 | // To be a widening instruction (either the "wide" or "long" versions), the | |||
869 | // second operand must be a sign- or zero extend having a single user. We | |||
870 | // only consider extends having a single user because they may otherwise not | |||
871 | // be eliminated. | |||
872 | if (Args.size() != 2 || | |||
873 | (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) || | |||
874 | !Args[1]->hasOneUse()) | |||
875 | return false; | |||
876 | auto *Extend = cast<CastInst>(Args[1]); | |||
877 | ||||
878 | // Legalize the destination type and ensure it can be used in a widening | |||
879 | // operation. | |||
880 | auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); | |||
881 | unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); | |||
882 | if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) | |||
883 | return false; | |||
884 | ||||
885 | // Legalize the source type and ensure it can be used in a widening | |||
886 | // operation. | |||
887 | auto *SrcTy = toVectorTy(Extend->getSrcTy()); | |||
888 | auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); | |||
889 | unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); | |||
890 | if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) | |||
891 | return false; | |||
892 | ||||
893 | // Get the total number of vector elements in the legalized types. | |||
894 | InstructionCost NumDstEls = | |||
895 | DstTyL.first * DstTyL.second.getVectorMinNumElements(); | |||
896 | InstructionCost NumSrcEls = | |||
897 | SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); | |||
898 | ||||
899 | // Return true if the legalized types have the same number of vector elements | |||
900 | // and the destination element type size is twice that of the source type. | |||
901 | return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; | |||
902 | } | |||
903 | ||||
904 | InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | |||
905 | Type *Src, | |||
906 | TTI::CastContextHint CCH, | |||
907 | TTI::TargetCostKind CostKind, | |||
908 | const Instruction *I) { | |||
909 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
910 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp" , 910, __extension__ __PRETTY_FUNCTION__)); | |||
911 | ||||
912 | // If the cast is observable, and it is used by a widening instruction (e.g., | |||
913 | // uaddl, saddw, etc.), it may be free. | |||
914 | if (I && I->hasOneUse()) { | |||
915 | auto *SingleUser = cast<Instruction>(*I->user_begin()); | |||
916 | SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); | |||
917 | if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { | |||
918 | // If the cast is the second operand, it is free. We will generate either | |||
919 | // a "wide" or "long" version of the widening instruction. | |||
920 | if (I == SingleUser->getOperand(1)) | |||
921 | return 0; | |||
922 | // If the cast is not the second operand, it will be free if it looks the | |||
923 | // same as the second operand. In this case, we will generate a "long" | |||
924 | // version of the widening instruction. | |||
925 | if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) | |||
926 | if (I->getOpcode() == unsigned(Cast->getOpcode()) && | |||
927 | cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) | |||
928 | return 0; | |||
929 | } | |||
930 | } | |||
931 | ||||
932 | // TODO: Allow non-throughput costs that aren't binary. | |||
933 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | |||
934 | if (CostKind != TTI::TCK_RecipThroughput) | |||
935 | return Cost == 0 ? 0 : 1; | |||
936 | return Cost; | |||
937 | }; | |||
938 | ||||
939 | EVT SrcTy = TLI->getValueType(DL, Src); | |||
940 | EVT DstTy = TLI->getValueType(DL, Dst); | |||
941 | ||||
942 | if (!SrcTy.isSimple() || !DstTy.isSimple()) | |||
943 | return AdjustCost( | |||
944 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
945 | ||||
946 | static const TypeConversionCostTblEntry | |||
947 | ConversionTbl[] = { | |||
948 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, | |||
949 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, | |||
950 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, | |||
951 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, | |||
952 | ||||
953 | // Truncations on nxvmiN | |||
954 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, | |||
955 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, | |||
956 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, | |||
957 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, | |||
958 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, | |||
959 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, | |||
960 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, | |||
961 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, | |||
962 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, | |||
963 | { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, | |||
964 | { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, | |||
965 | { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, | |||
966 | { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, | |||
967 | { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, | |||
968 | { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, | |||
969 | { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, | |||
970 | ||||
971 | // The number of shll instructions for the extension. | |||
972 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
973 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
974 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
975 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
976 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
977 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
978 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
979 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
980 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | |||
981 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | |||
982 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | |||
983 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | |||
984 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
985 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
986 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | |||
987 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | |||
988 | ||||
989 | // LowerVectorINT_TO_FP: | |||
990 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
991 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
992 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
993 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
994 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
995 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
996 | ||||
997 | // Complex: to v2f32 | |||
998 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | |||
999 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, | |||
1000 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, | |||
1001 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | |||
1002 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, | |||
1003 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, | |||
1004 | ||||
1005 | // Complex: to v4f32 | |||
1006 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, | |||
1007 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
1008 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, | |||
1009 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
1010 | ||||
1011 | // Complex: to v8f32 | |||
1012 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, | |||
1013 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
1014 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, | |||
1015 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
1016 | ||||
1017 | // Complex: to v16f32 | |||
1018 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, | |||
1019 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, | |||
1020 | ||||
1021 | // Complex: to v2f64 | |||
1022 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | |||
1023 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, | |||
1024 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
1025 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | |||
1026 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, | |||
1027 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
1028 | ||||
1029 | ||||
1030 | // LowerVectorFP_TO_INT | |||
1031 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, | |||
1032 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
1033 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
1034 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, | |||
1035 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
1036 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
1037 | ||||
1038 | // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). | |||
1039 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, | |||
1040 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, | |||
1041 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, | |||
1042 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, | |||
1043 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, | |||
1044 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, | |||
1045 | ||||
1046 | // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 | |||
1047 | { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
1048 | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, | |||
1049 | { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
1050 | { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, | |||
1051 | ||||
1052 | // Complex, from nxv2f32. | |||
1053 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, | |||
1054 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, | |||
1055 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, | |||
1056 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, | |||
1057 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, | |||
1058 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, | |||
1059 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, | |||
1060 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, | |||
1061 | ||||
1062 | // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. | |||
1063 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, | |||
1064 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, | |||
1065 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, | |||
1066 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, | |||
1067 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, | |||
1068 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, | |||
1069 | ||||
1070 | // Complex, from nxv2f64. | |||
1071 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, | |||
1072 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, | |||
1073 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, | |||
1074 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, | |||
1075 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, | |||
1076 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, | |||
1077 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, | |||
1078 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, | |||
1079 | ||||
1080 | // Complex, from nxv4f32. | |||
1081 | { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, | |||
1082 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, | |||
1083 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, | |||
1084 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, | |||
1085 | { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, | |||
1086 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, | |||
1087 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, | |||
1088 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, | |||
1089 | ||||
1090 | // Complex, from nxv8f64. Illegal -> illegal conversions not required. | |||
1091 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, | |||
1092 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, | |||
1093 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, | |||
1094 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, | |||
1095 | ||||
1096 | // Complex, from nxv4f64. Illegal -> illegal conversions not required. | |||
1097 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, | |||
1098 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, | |||
1099 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, | |||
1100 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, | |||
1101 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, | |||
1102 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, | |||
1103 | ||||
1104 | // Complex, from nxv8f32. Illegal -> illegal conversions not required. | |||
1105 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, | |||
1106 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, | |||
1107 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, | |||
1108 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, | |||
1109 | ||||
1110 | // Complex, from nxv8f16. | |||
1111 | { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, | |||
1112 | { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, | |||
1113 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, | |||
1114 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, | |||
1115 | { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, | |||
1116 | { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, | |||
1117 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, | |||
1118 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, | |||
1119 | ||||
1120 | // Complex, from nxv4f16. | |||
1121 | { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, | |||
1122 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, | |||
1123 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, | |||
1124 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, | |||
1125 | { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, | |||
1126 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, | |||
1127 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, | |||
1128 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, | |||
1129 | ||||
1130 | // Complex, from nxv2f16. | |||
1131 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, | |||
1132 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, | |||
1133 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, | |||
1134 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, | |||
1135 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, | |||
1136 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, | |||
1137 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, | |||
1138 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, | |||
1139 | ||||
1140 | // Truncate from nxvmf32 to nxvmf16. | |||
1141 | { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, | |||
1142 | { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, | |||
1143 | { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, | |||
1144 | ||||
1145 | // Truncate from nxvmf64 to nxvmf16. | |||
1146 | { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, | |||
1147 | { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, | |||
1148 | { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, | |||
1149 | ||||
1150 | // Truncate from nxvmf64 to nxvmf32. | |||
1151 | { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, | |||
1152 | { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, | |||
1153 | { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, | |||
1154 | ||||
1155 | // Extend from nxvmf16 to nxvmf32. | |||
1156 | { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, | |||
1157 | { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, | |||
1158 | { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, | |||
1159 | ||||
1160 | // Extend from nxvmf16 to nxvmf64. | |||
1161 | { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, | |||
1162 | { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, | |||
1163 | { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, | |||
1164 | ||||
1165 | // Extend from nxvmf32 to nxvmf64. | |||
1166 | { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, | |||
1167 | { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, | |||
1168 | { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, | |||
1169 | ||||
1170 | }; | |||
1171 | ||||
1172 | if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, | |||
1173 | DstTy.getSimpleVT(), | |||
1174 | SrcTy.getSimpleVT())) | |||
1175 | return AdjustCost(Entry->Cost); | |||
1176 | ||||
1177 | return AdjustCost( | |||
1178 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
1179 | } | |||
1180 | ||||
1181 | InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, | |||
1182 | Type *Dst, | |||
1183 | VectorType *VecTy, | |||
1184 | unsigned Index) { | |||
1185 | ||||
1186 | // Make sure we were given a valid extend opcode. | |||
1187 | assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && | |||
1188 | "Invalid opcode"); | |||
1189 | ||||
1190 | // We are extending an element we extract from a vector, so the source type | |||
1191 | // of the extend is the element type of the vector. | |||
1192 | auto *Src = VecTy->getElementType(); | |||
1193 | ||||
1194 | // Sign- and zero-extends are for integer types only. | |||
1195 | assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); | |||
1196 | ||||
1197 | // Get the cost for the extract. We compute the cost (if any) for the extend | |||
1198 | // below. | |||
1199 | InstructionCost Cost = | |||
1200 | getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); | |||
1201 | ||||
1202 | // Legalize the types. | |||
1203 | auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); | |||
1204 | auto DstVT = TLI->getValueType(DL, Dst); | |||
1205 | auto SrcVT = TLI->getValueType(DL, Src); | |||
1206 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
1207 | ||||
1208 | // If the resulting type is still a vector and the destination type is legal, | |||
1209 | // we may get the extension for free. If not, get the default cost for the | |||
1210 | // extend. | |||
1211 | if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) | |||
1212 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
1213 | CostKind); | |||
1214 | ||||
1215 | // The destination type should be larger than the element type. If not, get | |||
1216 | // the default cost for the extend. | |||
1217 | if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) | |||
1218 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
1219 | CostKind); | |||
1220 | ||||
1221 | switch (Opcode) { | |||
1222 | default: | |||
1223 | llvm_unreachable("Opcode should be either SExt or ZExt"); | |||
1224 | ||||
1225 | // For sign-extends, we only need a smov, which performs the extension | |||
1226 | // automatically. | |||
1227 | case Instruction::SExt: | |||
1228 | return Cost; | |||
1229 | ||||
1230 | // For zero-extends, the extend is performed automatically by a umov unless | |||
1231 | // the destination type is i64 and the element type is i8 or i16. | |||
1232 | case Instruction::ZExt: | |||
1233 | if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) | |||
1234 | return Cost; | |||
1235 | } | |||
1236 | ||||
1237 | // If we are unable to perform the extend for free, get the default cost. | |||
1238 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
1239 | CostKind); | |||
1240 | } | |||
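// Worked example (editor's note): extracting lane 1 of a <4 x i32> and
// zero-extending it to i64 hits the ZExt case above with DstVT == 64 and
// SrcVT == 32, so only the extract is charged -- a umov into a w register
// already zeroes the upper half of the x register. With an i8 or i16 element,
// the zext to i64 falls through and is priced via getCastInstrCost.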
1241 | ||||
1242 | InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, | |||
1243 | TTI::TargetCostKind CostKind, | |||
1244 | const Instruction *I) { | |||
1245 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1246 | return Opcode == Instruction::PHI ? 0 : 1; | |||
1247 | assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); | |||
1248 | // Branches are assumed to be predicted. | |||
1249 | return 0; | |||
1250 | } | |||
1251 | ||||
1252 | InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, | |||
1253 | unsigned Index) { | |||
1254 | assert(Val->isVectorTy() && "This must be a vector type"); | |||
1255 | ||||
1256 | if (Index != -1U) { | |||
1257 | // Legalize the type. | |||
1258 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); | |||
1259 | ||||
1260 | // This type is legalized to a scalar type. | |||
1261 | if (!LT.second.isVector()) | |||
1262 | return 0; | |||
1263 | ||||
1264 | // The type may be split. Normalize the index to the new type. | |||
1265 | unsigned Width = LT.second.getVectorNumElements(); | |||
1266 | Index = Index % Width; | |||
1267 | ||||
1268 | // The element at index zero is already inside the vector. | |||
1269 | if (Index == 0) | |||
1270 | return 0; | |||
1271 | } | |||
1272 | ||||
1273 | // All other insert/extracts cost this much. | |||
1274 | return ST->getVectorInsertExtractBaseCost(); | |||
1275 | } | |||
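// Worked example (editor's note): a <8 x i32> extract of element 4 legalizes
// to two v4i32 registers, so the index is normalized to 4 % 4 == 0; the
// element then already sits in lane 0 of the second register and the extract
// is free. Every other lane costs ST->getVectorInsertExtractBaseCost().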
1276 | ||||
1277 | InstructionCost AArch64TTIImpl::getArithmeticInstrCost( | |||
1278 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | |||
1279 | TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, | |||
1280 | TTI::OperandValueProperties Opd1PropInfo, | |||
1281 | TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, | |||
1282 | const Instruction *CxtI) { | |||
1283 | // TODO: Handle more cost kinds. | |||
1284 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1285 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, | |||
1286 | Opd2Info, Opd1PropInfo, | |||
1287 | Opd2PropInfo, Args, CxtI); | |||
1288 | ||||
1289 | // Legalize the type. | |||
1290 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | |||
1291 | ||||
1292 | // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), | |||
1293 | // add in the widening overhead specified by the sub-target. Since the | |||
1294 | // extends feeding widening instructions are performed automatically, they | |||
1295 | // aren't present in the generated code and have a zero cost. By adding a | |||
1296 | // widening overhead here, we attach the total cost of the combined operation | |||
1297 | // to the widening instruction. | |||
1298 | InstructionCost Cost = 0; | |||
1299 | if (isWideningInstruction(Ty, Opcode, Args)) | |||
1300 | Cost += ST->getWideningBaseCost(); | |||
1301 | ||||
1302 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1303 | ||||
1304 | switch (ISD) { | |||
1305 | default: | |||
1306 | return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, | |||
1307 | Opd2Info, | |||
1308 | Opd1PropInfo, Opd2PropInfo); | |||
1309 | case ISD::SDIV: | |||
1310 | if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && | |||
1311 | Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { | |||
1312 | // On AArch64, scalar signed division by a power-of-two constant is | |||
1313 | // normally expanded to the sequence ADD + CMP + SELECT + SRA. | |||
1314 | // The OperandValue properties may not be the same as those of the | |||
1315 | // previous operation; conservatively assume OP_None. | |||
1316 | Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, | |||
1317 | Opd1Info, Opd2Info, | |||
1318 | TargetTransformInfo::OP_None, | |||
1319 | TargetTransformInfo::OP_None); | |||
1320 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, | |||
1321 | Opd1Info, Opd2Info, | |||
1322 | TargetTransformInfo::OP_None, | |||
1323 | TargetTransformInfo::OP_None); | |||
1324 | Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind, | |||
1325 | Opd1Info, Opd2Info, | |||
1326 | TargetTransformInfo::OP_None, | |||
1327 | TargetTransformInfo::OP_None); | |||
1328 | Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, | |||
1329 | Opd1Info, Opd2Info, | |||
1330 | TargetTransformInfo::OP_None, | |||
1331 | TargetTransformInfo::OP_None); | |||
1332 | return Cost; | |||
1333 | } | |||
1334 | LLVM_FALLTHROUGH; | |||
1335 | case ISD::UDIV: | |||
1336 | if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) { | |||
1337 | auto VT = TLI->getValueType(DL, Ty); | |||
1338 | if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { | |||
1339 | // Vector signed division by a constant is expanded to the | |||
1340 | // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division | |||
1341 | // to MULHU + SUB + SRL + ADD + SRL. | |||
1342 | InstructionCost MulCost = getArithmeticInstrCost( | |||
1343 | Instruction::Mul, Ty, CostKind, Opd1Info, Opd2Info, | |||
1344 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | |||
1345 | InstructionCost AddCost = getArithmeticInstrCost( | |||
1346 | Instruction::Add, Ty, CostKind, Opd1Info, Opd2Info, | |||
1347 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | |||
1348 | InstructionCost ShrCost = getArithmeticInstrCost( | |||
1349 | Instruction::AShr, Ty, CostKind, Opd1Info, Opd2Info, | |||
1350 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | |||
1351 | return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; | |||
1352 | } | |||
1353 | } | |||
1354 | ||||
1355 | Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, | |||
1356 | Opd2Info, | |||
1357 | Opd1PropInfo, Opd2PropInfo); | |||
1358 | if (Ty->isVectorTy()) { | |||
1359 | // On AArch64, vector divisions are not supported natively and are | |||
1360 | // expanded into scalar divisions of each pair of elements. | |||
1361 | Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind, | |||
1362 | Opd1Info, Opd2Info, Opd1PropInfo, | |||
1363 | Opd2PropInfo); | |||
1364 | Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, | |||
1365 | Opd1Info, Opd2Info, Opd1PropInfo, | |||
1366 | Opd2PropInfo); | |||
1367 | // TODO: if one of the arguments is scalar, then it's not necessary to | |||
1368 | // double the cost of handling the vector elements. | |||
1369 | Cost += Cost; | |||
1370 | } | |||
1371 | return Cost; | |||
1372 | ||||
1373 | case ISD::MUL: | |||
1374 | if (LT.second != MVT::v2i64) | |||
1375 | return (Cost + 1) * LT.first; | |||
1376 | // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive | |||
1377 | // as elements are extracted from the vectors and the muls scalarized. | |||
1378 | // As getScalarizationOverhead is a bit too pessimistic, we estimate the | |||
1379 | // cost for a i64 vector directly here, which is: | |||
1380 | // - four i64 extracts, | |||
1381 | // - two i64 inserts, and | |||
1382 | // - two muls. | |||
1383 | // So, for a v2i64 with LT.first = 1 the cost is 8, and for a v4i64 with | |||
1384 | // LT.first = 2 the cost is 16. | |||
1385 | return LT.first * 8; | |||
1386 | case ISD::ADD: | |||
1387 | case ISD::XOR: | |||
1388 | case ISD::OR: | |||
1389 | case ISD::AND: | |||
1390 | // These nodes are marked as 'custom' for combining purposes only. | |||
1391 | // We know that they are legal. See LowerAdd in ISelLowering. | |||
1392 | return (Cost + 1) * LT.first; | |||
1393 | ||||
1394 | case ISD::FADD: | |||
1395 | // These nodes are marked as 'custom' just to lower them to SVE. | |||
1396 | // We know said lowering will incur no additional cost. | |||
1397 | if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty()) | |||
1398 | return (Cost + 2) * LT.first; | |||
1399 | ||||
1400 | return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, | |||
1401 | Opd2Info, | |||
1402 | Opd1PropInfo, Opd2PropInfo); | |||
1403 | } | |||
1404 | } | |||
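// The ADD + CMP + SELECT + SRA sequence priced in the SDIV case above can be
// written out directly. This is an editor's sketch for int32_t division by 4
// (the helper name is invented for illustration and is not part of this file):
static inline int32_t sdivByPow2Sketch(int32_t X) {
  int32_t Biased = X + 3;           // ADD: bias negative dividends by divisor-1
  int32_t Sel = X < 0 ? Biased : X; // CMP + SELECT (csel on AArch64)
  return Sel >> 2;                  // SRA: arithmetic shift right by log2(4)
}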
1405 | ||||
1406 | InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, | |||
1407 | ScalarEvolution *SE, | |||
1408 | const SCEV *Ptr) { | |||
1409 | // Address computations in vectorized code with non-consecutive addresses will | |||
1410 | // likely result in more instructions compared to scalar code where the | |||
1411 | // computation can more often be merged into the index mode. The resulting | |||
1412 | // extra micro-ops can significantly decrease throughput. | |||
1413 | unsigned NumVectorInstToHideOverhead = 10; | |||
1414 | int MaxMergeDistance = 64; | |||
1415 | ||||
1416 | if (Ty->isVectorTy() && SE && | |||
1417 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) | |||
1418 | return NumVectorInstToHideOverhead; | |||
1419 | ||||
1420 | // In many cases the address computation is not merged into the instruction | |||
1421 | // addressing mode. | |||
1422 | return 1; | |||
1423 | } | |||
1424 | ||||
1425 | InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, | |||
1426 | Type *CondTy, | |||
1427 | CmpInst::Predicate VecPred, | |||
1428 | TTI::TargetCostKind CostKind, | |||
1429 | const Instruction *I) { | |||
1430 | // TODO: Handle other cost kinds. | |||
1431 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1432 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, | |||
1433 | I); | |||
1434 | ||||
1435 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1436 | // We don't lower some vector selects well when they are wider than the | |||
1437 | // register width. | |||
1438 | if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { | |||
1439 | // We would need this many instructions to hide the scalarization happening. | |||
1440 | const int AmortizationCost = 20; | |||
1441 | ||||
1442 | // If VecPred is not set, check if we can get a predicate from the context | |||
1443 | // instruction, if its type matches the requested ValTy. | |||
1444 | if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { | |||
1445 | CmpInst::Predicate CurrentPred; | |||
1446 | if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), | |||
1447 | m_Value()))) | |||
1448 | VecPred = CurrentPred; | |||
1449 | } | |||
1450 | // Check if we have a compare/select chain that can be lowered using CMxx & | |||
1451 | // BFI pair. | |||
1452 | if (CmpInst::isIntPredicate(VecPred)) { | |||
1453 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
1454 | MVT::v8i16, MVT::v2i32, MVT::v4i32, | |||
1455 | MVT::v2i64}; | |||
1456 | auto LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
1457 | if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; })) | |||
1458 | return LT.first; | |||
1459 | } | |||
1460 | ||||
1461 | static const TypeConversionCostTblEntry | |||
1462 | VectorSelectTbl[] = { | |||
1463 | { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, | |||
1464 | { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, | |||
1465 | { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, | |||
1466 | { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, | |||
1467 | { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, | |||
1468 | { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } | |||
1469 | }; | |||
1470 | ||||
1471 | EVT SelCondTy = TLI->getValueType(DL, CondTy); | |||
1472 | EVT SelValTy = TLI->getValueType(DL, ValTy); | |||
1473 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { | |||
1474 | if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, | |||
1475 | SelCondTy.getSimpleVT(), | |||
1476 | SelValTy.getSimpleVT())) | |||
1477 | return Entry->Cost; | |||
1478 | } | |||
1479 | } | |||
1480 | // The base case handles scalable vectors fine for now, since it treats the | |||
1481 | // cost as 1 * legalization cost. | |||
1482 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); | |||
1483 | } | |||
1484 | ||||
1485 | AArch64TTIImpl::TTI::MemCmpExpansionOptions | |||
1486 | AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | |||
1487 | TTI::MemCmpExpansionOptions Options; | |||
1488 | if (ST->requiresStrictAlign()) { | |||
1489 | // TODO: Add cost modeling for strict align. Misaligned loads expand to | |||
1490 | // a bunch of instructions when strict align is enabled. | |||
1491 | return Options; | |||
1492 | } | |||
1493 | Options.AllowOverlappingLoads = true; | |||
1494 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | |||
1495 | Options.NumLoadsPerBlock = Options.MaxNumLoads; | |||
1496 | // TODO: Though vector loads usually perform well on AArch64, in some targets | |||
1497 | // they may wake up the FP unit, which raises the power consumption. Perhaps | |||
1498 | // they could be used with no holds barred (-O3). | |||
1499 | Options.LoadSizes = {8, 4, 2, 1}; | |||
1500 | return Options; | |||
1501 | } | |||
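// AllowOverlappingLoads above lets a compare whose length is not an exact sum
// of LoadSizes round up and overlap rather than tail off with smaller loads.
// An editor's sketch of what the expansion amounts to for a 15-byte equality
// compare (the helper is invented for illustration; assumes <cstdint> and
// <cstring> are available):
static inline bool memcmpEq15Sketch(const char *A, const char *B) {
  uint64_t A0, B0, A1, B1;
  memcpy(&A0, A, 8);     // bytes 0..7
  memcpy(&B0, B, 8);
  memcpy(&A1, A + 7, 8); // bytes 7..14, deliberately overlapping byte 7
  memcpy(&B1, B + 7, 8);
  return A0 == B0 && A1 == B1; // two 8-byte compares instead of 8+4+2+1
}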
1502 | ||||
1503 | InstructionCost | |||
1504 | AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, | |||
1505 | Align Alignment, unsigned AddressSpace, | |||
1506 | TTI::TargetCostKind CostKind) { | |||
1507 | if (useNeonVector(Src)) | |||
1508 | return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
1509 | CostKind); | |||
1510 | auto LT = TLI->getTypeLegalizationCost(DL, Src); | |||
1511 | if (!LT.first.isValid()) | |||
1512 | return InstructionCost::getInvalid(); | |||
1513 | ||||
1514 | // The code-generator is currently not able to handle scalable vectors | |||
1515 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
1516 | // it. This change will be removed when code-generation for these types is | |||
1517 | // sufficiently reliable. | |||
1518 | if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) | |||
1519 | return InstructionCost::getInvalid(); | |||
1520 | ||||
1521 | return LT.first * 2; | |||
1522 | } | |||
1523 | ||||
1524 | InstructionCost AArch64TTIImpl::getGatherScatterOpCost( | |||
1525 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, | |||
1526 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { | |||
1527 | if (useNeonVector(DataTy)) | |||
1528 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, | |||
1529 | Alignment, CostKind, I); | |||
1530 | auto *VT = cast<VectorType>(DataTy); | |||
1531 | auto LT = TLI->getTypeLegalizationCost(DL, DataTy); | |||
1532 | if (!LT.first.isValid()) | |||
1533 | return InstructionCost::getInvalid(); | |||
1534 | ||||
1535 | // The code-generator is currently not able to handle scalable vectors | |||
1536 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
1537 | // it. This change will be removed when code-generation for these types is | |||
1538 | // sufficiently reliable. | |||
1539 | if (cast<VectorType>(DataTy)->getElementCount() == | |||
1540 | ElementCount::getScalable(1)) | |||
1541 | return InstructionCost::getInvalid(); | |||
1542 | ||||
1543 | ElementCount LegalVF = LT.second.getVectorElementCount(); | |||
1544 | InstructionCost MemOpCost = | |||
1545 | getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); | |||
1546 | return LT.first * MemOpCost * getMaxNumElements(LegalVF, I->getFunction()); | |||
1547 | } | |||
1548 | ||||
1549 | bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { | |||
1550 | return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); | |||
1551 | } | |||
1552 | ||||
1553 | InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, | |||
1554 | MaybeAlign Alignment, | |||
1555 | unsigned AddressSpace, | |||
1556 | TTI::TargetCostKind CostKind, | |||
1557 | const Instruction *I) { | |||
1558 | EVT VT = TLI->getValueType(DL, Ty, true); | |||
1559 | // Type legalization can't handle structs | |||
1560 | if (VT == MVT::Other) | |||
1561 | return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, | |||
1562 | CostKind); | |||
1563 | ||||
1564 | auto LT = TLI->getTypeLegalizationCost(DL, Ty); | |||
1565 | if (!LT.first.isValid()) | |||
1566 | return InstructionCost::getInvalid(); | |||
1567 | ||||
1568 | // The code-generator is currently not able to handle scalable vectors | |||
1569 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
1570 | // it. This change will be removed when code-generation for these types is | |||
1571 | // sufficiently reliable. | |||
1572 | if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) | |||
1573 | if (VTy->getElementCount() == ElementCount::getScalable(1)) | |||
1574 | return InstructionCost::getInvalid(); | |||
1575 | ||||
1576 | // TODO: consider latency as well for TCK_SizeAndLatency. | |||
1577 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) | |||
1578 | return LT.first; | |||
1579 | ||||
1580 | if (CostKind != TTI::TCK_RecipThroughput) | |||
1581 | return 1; | |||
1582 | ||||
1583 | if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && | |||
1584 | LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { | |||
1585 | // Unaligned stores are extremely inefficient. We don't split all | |||
1586 | // unaligned 128-bit stores because of the negative impact that doing so | |||
1587 | // has shown in practice on inlined block copy code. | |||
1588 | // We make such stores expensive so that we will only vectorize if there | |||
1589 | // are 6 other instructions getting vectorized. | |||
1590 | const int AmortizationCost = 6; | |||
1591 | ||||
1592 | return LT.first * 2 * AmortizationCost; | |||
1593 | } | |||
1594 | ||||
1595 | // Check truncating stores and extending loads. | |||
1596 | if (useNeonVector(Ty) && | |||
1597 | Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { | |||
1598 | // v4i8 types are lowered to a scalar load/store and sshll/xtn. | |||
1599 | if (VT == MVT::v4i8) | |||
1600 | return 2; | |||
1601 | // Otherwise we need to scalarize. | |||
1602 | return cast<FixedVectorType>(Ty)->getNumElements() * 2; | |||
1603 | } | |||
1604 | ||||
1605 | return LT.first; | |||
1606 | } | |||
1607 | ||||
1608 | InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( | |||
1609 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, | |||
1610 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | |||
1611 | bool UseMaskForCond, bool UseMaskForGaps) { | |||
1612 | assert(Factor >= 2 && "Invalid interleave factor"); | |||
1613 | auto *VecVTy = cast<FixedVectorType>(VecTy); | |||
1614 | ||||
1615 | if (!UseMaskForCond && !UseMaskForGaps && | |||
1616 | Factor <= TLI->getMaxSupportedInterleaveFactor()) { | |||
1617 | unsigned NumElts = VecVTy->getNumElements(); | |||
1618 | auto *SubVecTy = | |||
1619 | FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); | |||
1620 | ||||
1621 | // ldN/stN only support legal vector types that are 64 or 128 bits in size. | |||
1622 | // Accesses having vector types that are a multiple of 128 bits can be | |||
1623 | // matched to more than one ldN/stN instruction. | |||
1624 | if (NumElts % Factor == 0 && | |||
1625 | TLI->isLegalInterleavedAccessType(SubVecTy, DL)) | |||
1626 | return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL); | |||
1627 | } | |||
1628 | ||||
1629 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
1630 | Alignment, AddressSpace, CostKind, | |||
1631 | UseMaskForCond, UseMaskForGaps); | |||
1632 | } | |||
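// Worked example (editor's note): an ld2 of <8 x i32> with Factor == 2 gives
// SubVecTy == <4 x i32>, a legal 128-bit type, so the cost is
// Factor * getNumInterleavedAccesses(<4 x i32>) == 2 * 1 == 2. A <16 x i32>
// access at the same factor presumably splits into two ld2 pairs and costs 4.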
1633 | ||||
1634 | InstructionCost | |||
1635 | AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { | |||
1636 | InstructionCost Cost = 0; | |||
1637 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
1638 | for (auto *I : Tys) { | |||
1639 | if (!I->isVectorTy()) | |||
1640 | continue; | |||
1641 | if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == | |||
1642 | 128) | |||
1643 | Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + | |||
1644 | getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); | |||
1645 | } | |||
1646 | return Cost; | |||
1647 | } | |||
1648 | ||||
1649 | unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { | |||
1650 | return ST->getMaxInterleaveFactor(); | |||
1651 | } | |||
1652 | ||||
1653 | // For Falkor, we want to avoid having too many strided loads in a loop since | |||
1654 | // that can exhaust the HW prefetcher resources. We adjust the unroller | |||
1655 | // MaxCount preference below to attempt to ensure unrolling doesn't create too | |||
1656 | // many strided loads. | |||
1657 | static void | |||
1658 | getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, | |||
1659 | TargetTransformInfo::UnrollingPreferences &UP) { | |||
1660 | enum { MaxStridedLoads = 7 }; | |||
1661 | auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { | |||
1662 | int StridedLoads = 0; | |||
1663 | // FIXME? We could make this more precise by looking at the CFG and | |||
1664 | // e.g. not counting loads in each side of an if-then-else diamond. | |||
1665 | for (const auto BB : L->blocks()) { | |||
1666 | for (auto &I : *BB) { | |||
1667 | LoadInst *LMemI = dyn_cast<LoadInst>(&I); | |||
1668 | if (!LMemI) | |||
1669 | continue; | |||
1670 | ||||
1671 | Value *PtrValue = LMemI->getPointerOperand(); | |||
1672 | if (L->isLoopInvariant(PtrValue)) | |||
1673 | continue; | |||
1674 | ||||
1675 | const SCEV *LSCEV = SE.getSCEV(PtrValue); | |||
1676 | const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); | |||
1677 | if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) | |||
1678 | continue; | |||
1679 | ||||
1680 | // FIXME? We could take pairing of unrolled load copies into account | |||
1681 | // by looking at the AddRec, but we would probably have to limit this | |||
1682 | // to loops with no stores or other memory optimization barriers. | |||
1683 | ++StridedLoads; | |||
1684 | // We've seen enough strided loads that seeing more won't make a | |||
1685 | // difference. | |||
1686 | if (StridedLoads > MaxStridedLoads / 2) | |||
1687 | return StridedLoads; | |||
1688 | } | |||
1689 | } | |||
1690 | return StridedLoads; | |||
1691 | }; | |||
1692 | ||||
1693 | int StridedLoads = countStridedLoads(L, SE); | |||
1694 | LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads | |||
1695 | << " strided loads\n"); | |||
1696 | // Pick the largest power of 2 unroll count that won't result in too many | |||
1697 | // strided loads. | |||
1698 | if (StridedLoads) { | |||
1699 | UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); | |||
| ||||
1700 | LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " | |||
1701 | << UP.MaxCount << '\n'); | |||
1702 | } | |||
1703 | } | |||
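// Editor's note on the shift in line 1699 above: Log2_32 is defined in
// MathExtras.h as 31 - countLeadingZeros(Value), and countLeadingZeros(0)
// returns 32, so Log2_32(0) wraps around to UINT_MAX and the shift count
// would vastly exceed the width of 'int'. On this path countStridedLoads()
// returns at most MaxStridedLoads / 2 + 1 == 4, so the quotient
// MaxStridedLoads / StridedLoads is at least 1 and the bad shift appears
// unreachable in practice; a defensive form (an editor's sketch, not the
// upstream fix) would make that invariant explicit:
//
//   if (unsigned Quot = MaxStridedLoads / StridedLoads)
//     UP.MaxCount = 1u << Log2_32(Quot);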
1704 | ||||
1705 | void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, | |||
1706 | TTI::UnrollingPreferences &UP, | |||
1707 | OptimizationRemarkEmitter *ORE) { | |||
1708 | // Enable partial unrolling and runtime unrolling. | |||
1709 | BaseT::getUnrollingPreferences(L, SE, UP, ORE); | |||
1710 | ||||
1711 | UP.UpperBound = true; | |||
1712 | ||||
1713 | // An inner loop is more likely to be hot, and its runtime check can be | |||
1714 | // hoisted out by the LICM pass, so the overhead is lower; try a larger | |||
1715 | // threshold to unroll more loops. | |||
1716 | if (L->getLoopDepth() > 1) | |||
| ||||
1717 | UP.PartialThreshold *= 2; | |||
1718 | ||||
1719 | // Disable partial & runtime unrolling on -Os. | |||
1720 | UP.PartialOptSizeThreshold = 0; | |||
1721 | ||||
1722 | if (ST->getProcFamily() == AArch64Subtarget::Falkor && | |||
1723 | EnableFalkorHWPFUnrollFix) | |||
1724 | getFalkorUnrollingPreferences(L, SE, UP); | |||
1725 | ||||
1726 | // Scan the loop: don't unroll loops with calls as this could prevent | |||
1727 | // inlining. Don't unroll vector loops either, as they don't benefit much from | |||
1728 | // unrolling. | |||
1729 | for (auto *BB : L->getBlocks()) { | |||
1730 | for (auto &I : *BB) { | |||
1731 | // Don't unroll vectorised loops. | |||
1732 | if (I.getType()->isVectorTy()) | |||
1733 | return; | |||
1734 | ||||
1735 | if (isa<CallInst>(I) || isa<InvokeInst>(I)) { | |||
1736 | if (const Function *F = cast<CallBase>(I).getCalledFunction()) { | |||
1737 | if (!isLoweredToCall(F)) | |||
1738 | continue; | |||
1739 | } | |||
1740 | return; | |||
1741 | } | |||
1742 | } | |||
1743 | } | |||
1744 | ||||
1745 | // Enable runtime unrolling for in-order models. | |||
1746 | // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so | |||
1747 | // by checking for that case we can ensure that the default behaviour is | |||
1748 | // unchanged. | |||
1749 | if (ST->getProcFamily() != AArch64Subtarget::Others && | |||
1750 | !ST->getSchedModel().isOutOfOrder()) { | |||
1751 | UP.Runtime = true; | |||
1752 | UP.Partial = true; | |||
1753 | UP.UnrollRemainder = true; | |||
1754 | UP.DefaultUnrollRuntimeCount = 4; | |||
1755 | ||||
1756 | UP.UnrollAndJam = true; | |||
1757 | UP.UnrollAndJamInnerLoopThreshold = 60; | |||
1758 | } | |||
1759 | } | |||
1760 | ||||
1761 | void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, | |||
1762 | TTI::PeelingPreferences &PP) { | |||
1763 | BaseT::getPeelingPreferences(L, SE, PP); | |||
1764 | } | |||
1765 | ||||
1766 | Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, | |||
1767 | Type *ExpectedType) { | |||
1768 | switch (Inst->getIntrinsicID()) { | |||
1769 | default: | |||
1770 | return nullptr; | |||
1771 | case Intrinsic::aarch64_neon_st2: | |||
1772 | case Intrinsic::aarch64_neon_st3: | |||
1773 | case Intrinsic::aarch64_neon_st4: { | |||
1774 | // Create a struct type | |||
1775 | StructType *ST = dyn_cast<StructType>(ExpectedType); | |||
1776 | if (!ST) | |||
1777 | return nullptr; | |||
1778 | unsigned NumElts = Inst->getNumArgOperands() - 1; | |||
1779 | if (ST->getNumElements() != NumElts) | |||
1780 | return nullptr; | |||
1781 | for (unsigned i = 0, e = NumElts; i != e; ++i) { | |||
1782 | if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) | |||
1783 | return nullptr; | |||
1784 | } | |||
1785 | Value *Res = UndefValue::get(ExpectedType); | |||
1786 | IRBuilder<> Builder(Inst); | |||
1787 | for (unsigned i = 0, e = NumElts; i != e; ++i) { | |||
1788 | Value *L = Inst->getArgOperand(i); | |||
1789 | Res = Builder.CreateInsertValue(Res, L, i); | |||
1790 | } | |||
1791 | return Res; | |||
1792 | } | |||
1793 | case Intrinsic::aarch64_neon_ld2: | |||
1794 | case Intrinsic::aarch64_neon_ld3: | |||
1795 | case Intrinsic::aarch64_neon_ld4: | |||
1796 | if (Inst->getType() == ExpectedType) | |||
1797 | return Inst; | |||
1798 | return nullptr; | |||
1799 | } | |||
1800 | } | |||
1801 | ||||
1802 | bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, | |||
1803 | MemIntrinsicInfo &Info) { | |||
1804 | switch (Inst->getIntrinsicID()) { | |||
1805 | default: | |||
1806 | break; | |||
1807 | case Intrinsic::aarch64_neon_ld2: | |||
1808 | case Intrinsic::aarch64_neon_ld3: | |||
1809 | case Intrinsic::aarch64_neon_ld4: | |||
1810 | Info.ReadMem = true; | |||
1811 | Info.WriteMem = false; | |||
1812 | Info.PtrVal = Inst->getArgOperand(0); | |||
1813 | break; | |||
1814 | case Intrinsic::aarch64_neon_st2: | |||
1815 | case Intrinsic::aarch64_neon_st3: | |||
1816 | case Intrinsic::aarch64_neon_st4: | |||
1817 | Info.ReadMem = false; | |||
1818 | Info.WriteMem = true; | |||
1819 | Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); | |||
1820 | break; | |||
1821 | } | |||
1822 | ||||
1823 | switch (Inst->getIntrinsicID()) { | |||
1824 | default: | |||
1825 | return false; | |||
1826 | case Intrinsic::aarch64_neon_ld2: | |||
1827 | case Intrinsic::aarch64_neon_st2: | |||
1828 | Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; | |||
1829 | break; | |||
1830 | case Intrinsic::aarch64_neon_ld3: | |||
1831 | case Intrinsic::aarch64_neon_st3: | |||
1832 | Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; | |||
1833 | break; | |||
1834 | case Intrinsic::aarch64_neon_ld4: | |||
1835 | case Intrinsic::aarch64_neon_st4: | |||
1836 | Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; | |||
1837 | break; | |||
1838 | } | |||
1839 | return true; | |||
1840 | } | |||
1841 | ||||
1842 | /// See if \p I should be considered for address type promotion. We check if | |||
1843 | /// \p I is a sext with the right type that is used in memory accesses. If it | |||
1844 | /// is used in a "complex" getelementptr, we allow it to be promoted without | |||
1845 | /// finding other sext instructions that sign extended the same initial value. | |||
1846 | /// A getelementptr is considered "complex" if it has more than 2 operands. | |||
1847 | bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( | |||
1848 | const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { | |||
1849 | bool Considerable = false; | |||
1850 | AllowPromotionWithoutCommonHeader = false; | |||
1851 | if (!isa<SExtInst>(&I)) | |||
1852 | return false; | |||
1853 | Type *ConsideredSExtType = | |||
1854 | Type::getInt64Ty(I.getParent()->getParent()->getContext()); | |||
1855 | if (I.getType() != ConsideredSExtType) | |||
1856 | return false; | |||
1857 | // See if the sext is the one with the right type and used in at least one | |||
1858 | // GetElementPtrInst. | |||
1859 | for (const User *U : I.users()) { | |||
1860 | if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { | |||
1861 | Considerable = true; | |||
1862 | // A getelementptr is considered as "complex" if it has more than 2 | |||
1863 | // operands. We will promote a SExt used in such complex GEP as we | |||
1864 | // expect some computation to be merged if they are done on 64 bits. | |||
1865 | if (GEPInst->getNumOperands() > 2) { | |||
1866 | AllowPromotionWithoutCommonHeader = true; | |||
1867 | break; | |||
1868 | } | |||
1869 | } | |||
1870 | } | |||
1871 | return Considerable; | |||
1872 | } | |||
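// Example (editor's note): for "getelementptr [10 x i32], ptr %p, i64 %sext,
// i64 %j" the GEP has three operands, so the sext feeding it may be promoted
// without finding a common header; a flat "getelementptr i32, ptr %p,
// i64 %sext" has only two operands and gets no such allowance.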
1873 | ||||
1874 | bool AArch64TTIImpl::isLegalToVectorizeReduction( | |||
1875 | const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { | |||
1876 | if (!VF.isScalable()) | |||
1877 | return true; | |||
1878 | ||||
1879 | Type *Ty = RdxDesc.getRecurrenceType(); | |||
1880 | if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) | |||
1881 | return false; | |||
1882 | ||||
1883 | switch (RdxDesc.getRecurrenceKind()) { | |||
1884 | case RecurKind::Add: | |||
1885 | case RecurKind::FAdd: | |||
1886 | case RecurKind::And: | |||
1887 | case RecurKind::Or: | |||
1888 | case RecurKind::Xor: | |||
1889 | case RecurKind::SMin: | |||
1890 | case RecurKind::SMax: | |||
1891 | case RecurKind::UMin: | |||
1892 | case RecurKind::UMax: | |||
1893 | case RecurKind::FMin: | |||
1894 | case RecurKind::FMax: | |||
1895 | return true; | |||
1896 | default: | |||
1897 | return false; | |||
1898 | } | |||
1899 | } | |||
1900 | ||||
1901 | InstructionCost | |||
1902 | AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, | |||
1903 | bool IsUnsigned, | |||
1904 | TTI::TargetCostKind CostKind) { | |||
1905 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | |||
1906 | ||||
1907 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) | |||
1908 | return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); | |||
1909 | ||||
1910 | assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && | |||
1911 | "Both vectors need to be equally scalable"); | |||
1912 | ||||
1913 | InstructionCost LegalizationCost = 0; | |||
1914 | if (LT.first > 1) { | |||
1915 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); | |||
1916 | unsigned MinMaxOpcode = | |||
1917 | Ty->isFPOrFPVectorTy() | |||
1918 | ? Intrinsic::maxnum | |||
1919 | : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin); | |||
1920 | IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}); | |||
1921 | LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); | |||
1922 | } | |||
1923 | ||||
1924 | return LegalizationCost + /*Cost of horizontal reduction*/ 2; | |||
1925 | } | |||
1926 | ||||
1927 | InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( | |||
1928 | unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { | |||
1929 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
1930 | InstructionCost LegalizationCost = 0; | |||
1931 | if (LT.first > 1) { | |||
1932 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); | |||
1933 | LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); | |||
1934 | LegalizationCost *= LT.first - 1; | |||
1935 | } | |||
1936 | ||||
1937 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1938 | assert(ISD && "Invalid opcode"); | |||
1939 | // Add the final reduction cost for the legal horizontal reduction | |||
1940 | switch (ISD) { | |||
1941 | case ISD::ADD: | |||
1942 | case ISD::AND: | |||
1943 | case ISD::OR: | |||
1944 | case ISD::XOR: | |||
1945 | case ISD::FADD: | |||
1946 | return LegalizationCost + 2; | |||
1947 | default: | |||
1948 | return InstructionCost::getInvalid(); | |||
1949 | } | |||
1950 | } | |||
1951 | ||||
1952 | InstructionCost | |||
1953 | AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | |||
1954 | Optional<FastMathFlags> FMF, | |||
1955 | TTI::TargetCostKind CostKind) { | |||
1956 | if (TTI::requiresOrderedReduction(FMF)) { | |||
1957 | if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { | |||
1958 | InstructionCost BaseCost = | |||
1959 | BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
1960 | // Add on extra cost to reflect the extra overhead on some CPUs. We still | |||
1961 | // end up vectorizing for more computationally intensive loops. | |||
1962 | return BaseCost + FixedVTy->getNumElements(); | |||
1963 | } | |||
1964 | ||||
1965 | if (Opcode != Instruction::FAdd) | |||
1966 | return InstructionCost::getInvalid(); | |||
1967 | ||||
1968 | auto *VTy = cast<ScalableVectorType>(ValTy); | |||
1969 | InstructionCost Cost = | |||
1970 | getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); | |||
1971 | Cost *= getMaxNumElements(VTy->getElementCount()); | |||
1972 | return Cost; | |||
1973 | } | |||
1974 | ||||
1975 | if (isa<ScalableVectorType>(ValTy)) | |||
1976 | return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); | |||
1977 | ||||
1978 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | |||
1979 | MVT MTy = LT.second; | |||
1980 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
1981 | assert(ISD && "Invalid opcode"); | |||
1982 | ||||
1983 | // Horizontal adds can use the 'addv' instruction. We model the cost of these | |||
1984 | // instructions as twice a normal vector add, plus 1 for each legalization | |||
1985 | // step (LT.first). This is the only arithmetic vector reduction operation for | |||
1986 | // which we have an instruction. | |||
1987 | // OR, XOR and AND costs should match the codegen from: | |||
1988 | // OR: llvm/test/CodeGen/AArch64/reduce-or.ll | |||
1989 | // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll | |||
1990 | // AND: llvm/test/CodeGen/AArch64/reduce-and.ll | |||
1991 | static const CostTblEntry CostTblNoPairwise[]{ | |||
1992 | {ISD::ADD, MVT::v8i8, 2}, | |||
1993 | {ISD::ADD, MVT::v16i8, 2}, | |||
1994 | {ISD::ADD, MVT::v4i16, 2}, | |||
1995 | {ISD::ADD, MVT::v8i16, 2}, | |||
1996 | {ISD::ADD, MVT::v4i32, 2}, | |||
1997 | {ISD::OR, MVT::v8i8, 15}, | |||
1998 | {ISD::OR, MVT::v16i8, 17}, | |||
1999 | {ISD::OR, MVT::v4i16, 7}, | |||
2000 | {ISD::OR, MVT::v8i16, 9}, | |||
2001 | {ISD::OR, MVT::v2i32, 3}, | |||
2002 | {ISD::OR, MVT::v4i32, 5}, | |||
2003 | {ISD::OR, MVT::v2i64, 3}, | |||
2004 | {ISD::XOR, MVT::v8i8, 15}, | |||
2005 | {ISD::XOR, MVT::v16i8, 17}, | |||
2006 | {ISD::XOR, MVT::v4i16, 7}, | |||
2007 | {ISD::XOR, MVT::v8i16, 9}, | |||
2008 | {ISD::XOR, MVT::v2i32, 3}, | |||
2009 | {ISD::XOR, MVT::v4i32, 5}, | |||
2010 | {ISD::XOR, MVT::v2i64, 3}, | |||
2011 | {ISD::AND, MVT::v8i8, 15}, | |||
2012 | {ISD::AND, MVT::v16i8, 17}, | |||
2013 | {ISD::AND, MVT::v4i16, 7}, | |||
2014 | {ISD::AND, MVT::v8i16, 9}, | |||
2015 | {ISD::AND, MVT::v2i32, 3}, | |||
2016 | {ISD::AND, MVT::v4i32, 5}, | |||
2017 | {ISD::AND, MVT::v2i64, 3}, | |||
2018 | }; | |||
2019 | switch (ISD) { | |||
2020 | default: | |||
2021 | break; | |||
2022 | case ISD::ADD: | |||
2023 | if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) | |||
2024 | return (LT.first - 1) + Entry->Cost; | |||
2025 | break; | |||
2026 | case ISD::XOR: | |||
2027 | case ISD::AND: | |||
2028 | case ISD::OR: | |||
2029 | const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); | |||
2030 | if (!Entry) | |||
2031 | break; | |||
2032 | auto *ValVTy = cast<FixedVectorType>(ValTy); | |||
2033 | if (!ValVTy->getElementType()->isIntegerTy(1) && | |||
2034 | MTy.getVectorNumElements() <= ValVTy->getNumElements() && | |||
2035 | isPowerOf2_32(ValVTy->getNumElements())) { | |||
2036 | InstructionCost ExtraCost = 0; | |||
2037 | if (LT.first != 1) { | |||
2038 | // Type needs to be split, so there is an extra cost of LT.first - 1 | |||
2039 | // arithmetic ops. | |||
2040 | auto *Ty = FixedVectorType::get(ValTy->getElementType(), | |||
2041 | MTy.getVectorNumElements()); | |||
2042 | ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); | |||
2043 | ExtraCost *= LT.first - 1; | |||
2044 | } | |||
2045 | return Entry->Cost + ExtraCost; | |||
2046 | } | |||
2047 | break; | |||
2048 | } | |||
2049 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
2050 | } | |||
2051 | ||||
2052 | InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { | |||
2053 | static const CostTblEntry ShuffleTbl[] = { | |||
2054 | { TTI::SK_Splice, MVT::nxv16i8, 1 }, | |||
2055 | { TTI::SK_Splice, MVT::nxv8i16, 1 }, | |||
2056 | { TTI::SK_Splice, MVT::nxv4i32, 1 }, | |||
2057 | { TTI::SK_Splice, MVT::nxv2i64, 1 }, | |||
2058 | { TTI::SK_Splice, MVT::nxv2f16, 1 }, | |||
2059 | { TTI::SK_Splice, MVT::nxv4f16, 1 }, | |||
2060 | { TTI::SK_Splice, MVT::nxv8f16, 1 }, | |||
2061 | { TTI::SK_Splice, MVT::nxv2bf16, 1 }, | |||
2062 | { TTI::SK_Splice, MVT::nxv4bf16, 1 }, | |||
2063 | { TTI::SK_Splice, MVT::nxv8bf16, 1 }, | |||
2064 | { TTI::SK_Splice, MVT::nxv2f32, 1 }, | |||
2065 | { TTI::SK_Splice, MVT::nxv4f32, 1 }, | |||
2066 | { TTI::SK_Splice, MVT::nxv2f64, 1 }, | |||
2067 | }; | |||
2068 | ||||
2069 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); | |||
2070 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); | |||
2071 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
2072 | EVT PromotedVT = LT.second.getScalarType() == MVT::i1 | |||
2073 | ? TLI->getPromotedVTForPredicate(EVT(LT.second)) | |||
2074 | : LT.second; | |||
2075 | Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); | |||
2076 | InstructionCost LegalizationCost = 0; | |||
2077 | if (Index < 0) { | |||
2078 | LegalizationCost = | |||
2079 | getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, | |||
2080 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + | |||
2081 | getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, | |||
2082 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
2083 | } | |||
2084 | ||||
2085 | // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp. | |||
2086 | // The cost is computed on the promoted type. | |||
2087 | if (LT.second.getScalarType() == MVT::i1) { | |||
2088 | LegalizationCost += | |||
2089 | getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, | |||
2090 | TTI::CastContextHint::None, CostKind) + | |||
2091 | getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, | |||
2092 | TTI::CastContextHint::None, CostKind); | |||
2093 | } | |||
2094 | const auto *Entry = | |||
2095 | CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); | |||
2096 | assert(Entry && "Illegal Type for Splice"); | |||
2097 | LegalizationCost += Entry->Cost; | |||
2098 | return LegalizationCost * LT.first; | |||
2099 | } | |||
2100 | ||||
2101 | InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
2102 | VectorType *Tp, | |||
2103 | ArrayRef<int> Mask, int Index, | |||
2104 | VectorType *SubTp) { | |||
2105 | Kind = improveShuffleKindFromMask(Kind, Mask); | |||
2106 | if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || | |||
2107 | Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || | |||
2108 | Kind == TTI::SK_Reverse) { | |||
2109 | static const CostTblEntry ShuffleTbl[] = { | |||
2110 | // Broadcast shuffle kinds can be performed with 'dup'. | |||
2111 | { TTI::SK_Broadcast, MVT::v8i8, 1 }, | |||
2112 | { TTI::SK_Broadcast, MVT::v16i8, 1 }, | |||
2113 | { TTI::SK_Broadcast, MVT::v4i16, 1 }, | |||
2114 | { TTI::SK_Broadcast, MVT::v8i16, 1 }, | |||
2115 | { TTI::SK_Broadcast, MVT::v2i32, 1 }, | |||
2116 | { TTI::SK_Broadcast, MVT::v4i32, 1 }, | |||
2117 | { TTI::SK_Broadcast, MVT::v2i64, 1 }, | |||
2118 | { TTI::SK_Broadcast, MVT::v2f32, 1 }, | |||
2119 | { TTI::SK_Broadcast, MVT::v4f32, 1 }, | |||
2120 | { TTI::SK_Broadcast, MVT::v2f64, 1 }, | |||
2121 | // Transpose shuffle kinds can be performed with 'trn1/trn2' and | |||
2122 | // 'zip1/zip2' instructions. | |||
2123 | { TTI::SK_Transpose, MVT::v8i8, 1 }, | |||
2124 | { TTI::SK_Transpose, MVT::v16i8, 1 }, | |||
2125 | { TTI::SK_Transpose, MVT::v4i16, 1 }, | |||
2126 | { TTI::SK_Transpose, MVT::v8i16, 1 }, | |||
2127 | { TTI::SK_Transpose, MVT::v2i32, 1 }, | |||
2128 | { TTI::SK_Transpose, MVT::v4i32, 1 }, | |||
2129 | { TTI::SK_Transpose, MVT::v2i64, 1 }, | |||
2130 | { TTI::SK_Transpose, MVT::v2f32, 1 }, | |||
2131 | { TTI::SK_Transpose, MVT::v4f32, 1 }, | |||
2132 | { TTI::SK_Transpose, MVT::v2f64, 1 }, | |||
2133 | // Select shuffle kinds. | |||
2134 | // TODO: handle vXi8/vXi16. | |||
2135 | { TTI::SK_Select, MVT::v2i32, 1 }, // mov. | |||
2136 | { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar). | |||
2137 | { TTI::SK_Select, MVT::v2i64, 1 }, // mov. | |||
2138 | { TTI::SK_Select, MVT::v2f32, 1 }, // mov. | |||
2139 | { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar). | |||
2140 | { TTI::SK_Select, MVT::v2f64, 1 }, // mov. | |||
2141 | // PermuteSingleSrc shuffle kinds. | |||
2142 | { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov. | |||
2143 | { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case. | |||
2144 | { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov. | |||
2145 | { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov. | |||
2146 | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case. | |||
2147 | { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov. | |||
2148 | { TTI::SK_PermuteSingleSrc, MVT::v4i16, 3 }, // perfectshuffle worst case. | |||
2149 | { TTI::SK_PermuteSingleSrc, MVT::v4f16, 3 }, // perfectshuffle worst case. | |||
2150 | { TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3 }, // perfectshuffle worst case. | |||
2151 | { TTI::SK_PermuteSingleSrc, MVT::v8i16, 8 }, // constpool + load + tbl | |||
2152 | { TTI::SK_PermuteSingleSrc, MVT::v8f16, 8 }, // constpool + load + tbl | |||
2153 | { TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8 }, // constpool + load + tbl | |||
2154 | { TTI::SK_PermuteSingleSrc, MVT::v8i8, 8 }, // constpool + load + tbl | |||
2155 | { TTI::SK_PermuteSingleSrc, MVT::v16i8, 8 }, // constpool + load + tbl | |||
2156 | // Reverse can be lowered with `rev`. | |||
2157 | { TTI::SK_Reverse, MVT::v2i32, 1 }, // mov. | |||
2158 | { TTI::SK_Reverse, MVT::v4i32, 2 }, // REV64; EXT | |||
2159 | { TTI::SK_Reverse, MVT::v2i64, 1 }, // mov. | |||
2160 | { TTI::SK_Reverse, MVT::v2f32, 1 }, // mov. | |||
2161 | { TTI::SK_Reverse, MVT::v4f32, 2 }, // REV64; EXT | |||
2162 | { TTI::SK_Reverse, MVT::v2f64, 1 }, // mov. | |||
2163 | // Broadcast shuffle kinds for scalable vectors | |||
2164 | { TTI::SK_Broadcast, MVT::nxv16i8, 1 }, | |||
2165 | { TTI::SK_Broadcast, MVT::nxv8i16, 1 }, | |||
2166 | { TTI::SK_Broadcast, MVT::nxv4i32, 1 }, | |||
2167 | { TTI::SK_Broadcast, MVT::nxv2i64, 1 }, | |||
2168 | { TTI::SK_Broadcast, MVT::nxv2f16, 1 }, | |||
2169 | { TTI::SK_Broadcast, MVT::nxv4f16, 1 }, | |||
2170 | { TTI::SK_Broadcast, MVT::nxv8f16, 1 }, | |||
2171 | { TTI::SK_Broadcast, MVT::nxv2bf16, 1 }, | |||
2172 | { TTI::SK_Broadcast, MVT::nxv4bf16, 1 }, | |||
2173 | { TTI::SK_Broadcast, MVT::nxv8bf16, 1 }, | |||
2174 | { TTI::SK_Broadcast, MVT::nxv2f32, 1 }, | |||
2175 | { TTI::SK_Broadcast, MVT::nxv4f32, 1 }, | |||
2176 | { TTI::SK_Broadcast, MVT::nxv2f64, 1 }, | |||
2177 | { TTI::SK_Broadcast, MVT::nxv16i1, 1 }, | |||
2178 | { TTI::SK_Broadcast, MVT::nxv8i1, 1 }, | |||
2179 | { TTI::SK_Broadcast, MVT::nxv4i1, 1 }, | |||
2180 | { TTI::SK_Broadcast, MVT::nxv2i1, 1 }, | |||
2181 | // Handle the cases for vector.reverse with scalable vectors | |||
2182 | { TTI::SK_Reverse, MVT::nxv16i8, 1 }, | |||
2183 | { TTI::SK_Reverse, MVT::nxv8i16, 1 }, | |||
2184 | { TTI::SK_Reverse, MVT::nxv4i32, 1 }, | |||
2185 | { TTI::SK_Reverse, MVT::nxv2i64, 1 }, | |||
2186 | { TTI::SK_Reverse, MVT::nxv2f16, 1 }, | |||
2187 | { TTI::SK_Reverse, MVT::nxv4f16, 1 }, | |||
2188 | { TTI::SK_Reverse, MVT::nxv8f16, 1 }, | |||
2189 | { TTI::SK_Reverse, MVT::nxv2bf16, 1 }, | |||
2190 | { TTI::SK_Reverse, MVT::nxv4bf16, 1 }, | |||
2191 | { TTI::SK_Reverse, MVT::nxv8bf16, 1 }, | |||
2192 | { TTI::SK_Reverse, MVT::nxv2f32, 1 }, | |||
2193 | { TTI::SK_Reverse, MVT::nxv4f32, 1 }, | |||
2194 | { TTI::SK_Reverse, MVT::nxv2f64, 1 }, | |||
2195 | { TTI::SK_Reverse, MVT::nxv16i1, 1 }, | |||
2196 | { TTI::SK_Reverse, MVT::nxv8i1, 1 }, | |||
2197 | { TTI::SK_Reverse, MVT::nxv4i1, 1 }, | |||
2198 | { TTI::SK_Reverse, MVT::nxv2i1, 1 }, | |||
2199 | }; | |||
2200 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); | |||
2201 | if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) | |||
2202 | return LT.first * Entry->Cost; | |||
2203 | } | |||
2204 | if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) | |||
2205 | return getSpliceCost(Tp, Index); | |||
2206 | return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); | |||
2207 | } |
1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file contains some functions that are useful for math stuff. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H |
14 | #define LLVM_SUPPORT_MATHEXTRAS_H |
15 | |
16 | #include "llvm/Support/Compiler.h" |
17 | #include <cassert> |
18 | #include <climits> |
19 | #include <cmath> |
20 | #include <cstdint> |
21 | #include <cstring> |
22 | #include <limits> |
23 | #include <type_traits> |
24 | |
25 | #ifdef __ANDROID_NDK__ |
26 | #include <android/api-level.h> |
27 | #endif |
28 | |
29 | #ifdef _MSC_VER |
30 | // Declare these intrinsics manually rather including intrin.h. It's very |
31 | // expensive, and MathExtras.h is popular. |
32 | // #include <intrin.h> |
33 | extern "C" { |
34 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); |
35 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); |
36 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); |
37 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); |
38 | } |
39 | #endif |
40 | |
41 | namespace llvm { |
42 | |
43 | /// The behavior an operation has on an input of 0. |
44 | enum ZeroBehavior { |
45 | /// The returned value is undefined. |
46 | ZB_Undefined, |
47 | /// The returned value is numeric_limits<T>::max() |
48 | ZB_Max, |
49 | /// The returned value is numeric_limits<T>::digits |
50 | ZB_Width |
51 | }; |
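// Example (editor's note): countTrailingZeros<uint32_t>(0, ZB_Width) returns
// 32 (std::numeric_limits<uint32_t>::digits), whereas passing ZB_Undefined
// states that the caller guarantees a nonzero input and the result for 0 is
// unspecified -- the distinction that matters when a caller ends up
// evaluating something like Log2_32(0).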
52 | |
53 | /// Mathematical constants. |
54 | namespace numbers { |
55 | // TODO: Track C++20 std::numbers. |
56 | // TODO: Favor using the hexadecimal FP constants (requires C++17). |
57 | constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 |
58 | egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 |
59 | ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 |
60 | ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392 |
61 | log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) |
62 | log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) |
63 | pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 |
64 | inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 |
65 | sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 |
66 | inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 |
67 | sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 | inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) |
69 | sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 |
70 | inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) |
71 | phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 |
72 | constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 |
73 | egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 |
74 | ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 |
75 | ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 |
76 | log2ef = 1.44269504F, // (0x1.715476P+0) |
77 | log10ef = .434294482F, // (0x1.bcb7b2P-2) |
78 | pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 |
79 | inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 |
80 | sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 |
81 | inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 |
82 | sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 |
83 | inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) |
84 | sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 |
85 | inv_sqrt3f = .577350269F, // (0x1.279a74P-1) |
86 | phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 |
87 | } // namespace numbers |
88 | |
89 | namespace detail { |
90 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { |
91 | static unsigned count(T Val, ZeroBehavior) { |
92 | if (!Val) |
93 | return std::numeric_limits<T>::digits; |
94 | if (Val & 0x1) |
95 | return 0; |
96 | |
97 | // Bisection method. |
98 | unsigned ZeroBits = 0; |
99 | T Shift = std::numeric_limits<T>::digits >> 1; |
100 | T Mask = std::numeric_limits<T>::max() >> Shift; |
101 | while (Shift) { |
102 | if ((Val & Mask) == 0) { |
103 | Val >>= Shift; |
104 | ZeroBits |= Shift; |
105 | } |
106 | Shift >>= 1; |
107 | Mask >>= Shift; |
108 | } |
109 | return ZeroBits; |
110 | } |
111 | }; |
112 | |
113 | #if defined(__GNUC__) || defined(_MSC_VER)
114 | template <typename T> struct TrailingZerosCounter<T, 4> { |
115 | static unsigned count(T Val, ZeroBehavior ZB) { |
116 | if (ZB != ZB_Undefined && Val == 0) |
117 | return 32; |
118 | |
119 | #if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 | return __builtin_ctz(Val); |
121 | #elif defined(_MSC_VER) |
122 | unsigned long Index; |
123 | _BitScanForward(&Index, Val); |
124 | return Index; |
125 | #endif |
126 | } |
127 | }; |
128 | |
129 | #if !defined(_MSC_VER) || defined(_M_X64) |
130 | template <typename T> struct TrailingZerosCounter<T, 8> { |
131 | static unsigned count(T Val, ZeroBehavior ZB) { |
132 | if (ZB != ZB_Undefined && Val == 0) |
133 | return 64; |
134 | |
135 | #if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 | return __builtin_ctzll(Val); |
137 | #elif defined(_MSC_VER) |
138 | unsigned long Index; |
139 | _BitScanForward64(&Index, Val); |
140 | return Index; |
141 | #endif |
142 | } |
143 | }; |
144 | #endif |
145 | #endif |
146 | } // namespace detail |
147 | |
148 | /// Count the number of 0s from the least significant bit to the most
149 | /// significant bit, stopping at the first 1.
150 | /// |
151 | /// Only unsigned integral types are allowed. |
152 | /// |
153 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are |
154 | /// valid arguments. |
155 | template <typename T> |
156 | unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { |
157 | static_assert(std::numeric_limits<T>::is_integer && |
158 | !std::numeric_limits<T>::is_signed, |
159 | "Only unsigned integral types are allowed."); |
160 | return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB); |
161 | } |
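// Worked examples (illustrative editorial sketch, not normative):
//   countTrailingZeros(0x10u)            == 4   (0b10000)
//   countTrailingZeros(0u)               == 32  (default ZB_Width)
//   countTrailingZeros(0u, ZB_Undefined) is undefined.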
162 | |
163 | namespace detail { |
164 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { |
165 | static unsigned count(T Val, ZeroBehavior) { |
166 | if (!Val) |
167 | return std::numeric_limits<T>::digits; |
168 | |
169 | // Bisection method. |
170 | unsigned ZeroBits = 0; |
171 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { |
172 | T Tmp = Val >> Shift; |
173 | if (Tmp) |
174 | Val = Tmp; |
175 | else |
176 | ZeroBits |= Shift; |
177 | } |
178 | return ZeroBits; |
179 | } |
180 | }; |
181 | |
182 | #if defined(__GNUC__) || defined(_MSC_VER)
183 | template <typename T> struct LeadingZerosCounter<T, 4> { |
184 | static unsigned count(T Val, ZeroBehavior ZB) { |
185 | if (ZB != ZB_Undefined && Val == 0) |
186 | return 32; |
187 | |
188 | #if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 | return __builtin_clz(Val); |
190 | #elif defined(_MSC_VER) |
191 | unsigned long Index; |
192 | _BitScanReverse(&Index, Val); |
193 | return Index ^ 31; |
194 | #endif |
195 | } |
196 | }; |
197 | |
198 | #if !defined(_MSC_VER) || defined(_M_X64) |
199 | template <typename T> struct LeadingZerosCounter<T, 8> { |
200 | static unsigned count(T Val, ZeroBehavior ZB) { |
201 | if (ZB != ZB_Undefined && Val == 0) |
202 | return 64; |
203 | |
204 | #if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 | return __builtin_clzll(Val); |
206 | #elif defined(_MSC_VER) |
207 | unsigned long Index; |
208 | _BitScanReverse64(&Index, Val); |
209 | return Index ^ 63; |
210 | #endif |
211 | } |
212 | }; |
213 | #endif |
214 | #endif |
215 | } // namespace detail |
216 | |
217 | /// Count the number of 0s from the most significant bit to the least
218 | /// significant bit, stopping at the first 1.
219 | /// |
220 | /// Only unsigned integral types are allowed. |
221 | /// |
222 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are |
223 | /// valid arguments. |
224 | template <typename T> |
225 | unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { |
226 | static_assert(std::numeric_limits<T>::is_integer && |
227 | !std::numeric_limits<T>::is_signed, |
228 | "Only unsigned integral types are allowed."); |
229 | return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB); |
230 | } |
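// Worked examples (illustrative sketch): countLeadingZeros(1u) == 31,
// countLeadingZeros(0x80000000u) == 0, and countLeadingZeros(0u) == 32
// under the default ZB_Width behavior.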
231 | |
232 | /// Get the index of the first set bit starting from the least |
233 | /// significant bit. |
234 | /// |
235 | /// Only unsigned integral types are allowed. |
236 | /// |
237 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are |
238 | /// valid arguments. |
239 | template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { |
240 | if (ZB == ZB_Max && Val == 0) |
241 | return std::numeric_limits<T>::max(); |
242 | |
243 | return countTrailingZeros(Val, ZB_Undefined); |
244 | } |
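// Worked example (illustrative sketch): findFirstSet(0x18u) == 3, the index
// of the lowest set bit; findFirstSet(0u) == UINT32_MAX under ZB_Max.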
245 | |
246 | /// Create a bitmask with the N right-most bits set to 1, and all other |
247 | /// bits set to 0. Only unsigned types are allowed. |
248 | template <typename T> T maskTrailingOnes(unsigned N) { |
249 | static_assert(std::is_unsigned<T>::value, "Invalid type!"); |
250 | const unsigned Bits = CHAR_BIT * sizeof(T);
251 | assert(N <= Bits && "Invalid bit index");
252 | return N == 0 ? 0 : (T(-1) >> (Bits - N)); |
253 | } |
254 | |
255 | /// Create a bitmask with the N left-most bits set to 1, and all other |
256 | /// bits set to 0. Only unsigned types are allowed. |
257 | template <typename T> T maskLeadingOnes(unsigned N) { |
258 | return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
259 | } |
260 | |
261 | /// Create a bitmask with the N right-most bits set to 0, and all other |
262 | /// bits set to 1. Only unsigned types are allowed. |
263 | template <typename T> T maskTrailingZeros(unsigned N) { |
264 | return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265 | } |
266 | |
267 | /// Create a bitmask with the N left-most bits set to 0, and all other |
268 | /// bits set to 1. Only unsigned types are allowed. |
269 | template <typename T> T maskLeadingZeros(unsigned N) { |
270 | return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271 | } |
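// Worked examples (illustrative sketch, shown for T = uint8_t):
//   maskTrailingOnes<uint8_t>(3)  == 0x07 (0b00000111)
//   maskLeadingOnes<uint8_t>(3)   == 0xE0 (0b11100000)
//   maskTrailingZeros<uint8_t>(3) == 0xF8 (0b11111000)
//   maskLeadingZeros<uint8_t>(3)  == 0x1F (0b00011111)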
272 | |
273 | /// Get the index of the last set bit starting from the least |
274 | /// significant bit. |
275 | /// |
276 | /// Only unsigned integral types are allowed. |
277 | /// |
278 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are |
279 | /// valid arguments. |
280 | template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { |
281 | if (ZB == ZB_Max && Val == 0) |
282 | return std::numeric_limits<T>::max(); |
283 | |
284 | // Use ^ instead of - because both gcc and llvm can remove the associated ^ |
285 | // in the __builtin_clz intrinsic on x86. |
286 | return countLeadingZeros(Val, ZB_Undefined) ^ |
287 | (std::numeric_limits<T>::digits - 1); |
288 | } |
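// Worked example (illustrative sketch): findLastSet(0x18u) == 4, since
// countLeadingZeros(0x18u) == 27 and 27 ^ 31 == 4.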
289 | |
290 | /// Macro compressed bit reversal table for 256 bits. |
291 | /// |
292 | /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable |
293 | static const unsigned char BitReverseTable256[256] = { |
294 | #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 |
295 | #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) |
296 | #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) |
297 | R6(0), R6(2), R6(1), R6(3) |
298 | #undef R2 |
299 | #undef R4 |
300 | #undef R6 |
301 | }; |
302 | |
303 | /// Reverse the bits in \p Val. |
304 | template <typename T> |
305 | T reverseBits(T Val) { |
306 | unsigned char in[sizeof(Val)]; |
307 | unsigned char out[sizeof(Val)]; |
308 | std::memcpy(in, &Val, sizeof(Val)); |
309 | for (unsigned i = 0; i < sizeof(Val); ++i) |
310 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; |
311 | std::memcpy(&Val, out, sizeof(Val)); |
312 | return Val; |
313 | } |
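// Worked examples (illustrative sketch): reverseBits<uint8_t>(0x01) == 0x80
// and reverseBits<uint16_t>(0x00FF) == 0xFF00; the table reverses each byte
// while the loop reverses the byte order.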
314 | |
315 | #if __has_builtin(__builtin_bitreverse8)
316 | template<> |
317 | inline uint8_t reverseBits<uint8_t>(uint8_t Val) { |
318 | return __builtin_bitreverse8(Val); |
319 | } |
320 | #endif |
321 | |
322 | #if __has_builtin(__builtin_bitreverse16)
323 | template<> |
324 | inline uint16_t reverseBits<uint16_t>(uint16_t Val) { |
325 | return __builtin_bitreverse16(Val); |
326 | } |
327 | #endif |
328 | |
329 | #if __has_builtin(__builtin_bitreverse32)
330 | template<> |
331 | inline uint32_t reverseBits<uint32_t>(uint32_t Val) { |
332 | return __builtin_bitreverse32(Val); |
333 | } |
334 | #endif |
335 | |
336 | #if __has_builtin(__builtin_bitreverse64)
337 | template<> |
338 | inline uint64_t reverseBits<uint64_t>(uint64_t Val) { |
339 | return __builtin_bitreverse64(Val); |
340 | } |
341 | #endif |
342 | |
343 | // NOTE: The following support functions use the _32/_64 extensions instead of |
344 | // type overloading so that signed and unsigned integers can be used without |
345 | // ambiguity. |
346 | |
347 | /// Return the high 32 bits of a 64 bit value. |
348 | constexpr inline uint32_t Hi_32(uint64_t Value) { |
349 | return static_cast<uint32_t>(Value >> 32); |
350 | } |
351 | |
352 | /// Return the low 32 bits of a 64 bit value. |
353 | constexpr inline uint32_t Lo_32(uint64_t Value) { |
354 | return static_cast<uint32_t>(Value); |
355 | } |
356 | |
357 | /// Make a 64-bit integer from a high / low pair of 32-bit integers. |
358 | constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { |
359 | return ((uint64_t)High << 32) | (uint64_t)Low; |
360 | } |
361 | |
362 | /// Checks if an integer fits into the given bit width. |
363 | template <unsigned N> constexpr inline bool isInt(int64_t x) { |
364 | return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365 | } |
366 | // Template specializations to get better code for common cases. |
367 | template <> constexpr inline bool isInt<8>(int64_t x) { |
368 | return static_cast<int8_t>(x) == x; |
369 | } |
370 | template <> constexpr inline bool isInt<16>(int64_t x) { |
371 | return static_cast<int16_t>(x) == x; |
372 | } |
373 | template <> constexpr inline bool isInt<32>(int64_t x) { |
374 | return static_cast<int32_t>(x) == x; |
375 | } |
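// Illustrative compile-time checks (editorial sketch; assumes the usual
// two's-complement narrowing conversion):
static_assert(isInt<8>(127) && !isInt<8>(128), "int8_t range");
static_assert(isInt<16>(-32768) && !isInt<16>(32768), "int16_t range");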
376 | |
377 | /// Checks if a signed integer is an N bit number shifted left by S. |
378 | template <unsigned N, unsigned S> |
379 | constexpr inline bool isShiftedInt(int64_t x) { |
380 | static_assert( |
381 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); |
382 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); |
383 | return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384 | } |
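// Worked example (illustrative sketch): isShiftedInt<8, 2>(x) accepts
// multiples of 4 in [-512, 508]; isShiftedInt<8, 2>(508) is true, while
// isShiftedInt<8, 2>(510) is false (not a multiple of 1 << 2).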
385 | |
386 | /// Checks if an unsigned integer fits into the given bit width. |
387 | /// |
388 | /// This is written as two functions rather than as simply |
389 | /// |
390 | /// return N >= 64 || X < (UINT64_C(1) << N); |
391 | /// |
392 | /// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting |
393 | /// left too many places. |
394 | template <unsigned N> |
395 | constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) { |
396 | static_assert(N > 0, "isUInt<0> doesn't make sense"); |
397 | return X < (UINT64_C(1) << (N));
398 | } |
399 | template <unsigned N> |
400 | constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) { |
401 | return true; |
402 | } |
403 | |
404 | // Template specializations to get better code for common cases. |
405 | template <> constexpr inline bool isUInt<8>(uint64_t x) { |
406 | return static_cast<uint8_t>(x) == x; |
407 | } |
408 | template <> constexpr inline bool isUInt<16>(uint64_t x) { |
409 | return static_cast<uint16_t>(x) == x; |
410 | } |
411 | template <> constexpr inline bool isUInt<32>(uint64_t x) { |
412 | return static_cast<uint32_t>(x) == x; |
413 | } |
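// Illustrative compile-time checks (editorial sketch):
static_assert(isUInt<8>(255) && !isUInt<8>(256), "uint8_t range");
static_assert(isUInt<64>(UINT64_MAX), "every uint64_t fits in 64 bits");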
414 | |
415 | /// Checks if an unsigned integer is an N bit number shifted left by S.
416 | template <unsigned N, unsigned S> |
417 | constexpr inline bool isShiftedUInt(uint64_t x) { |
418 | static_assert( |
419 | N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); |
420 | static_assert(N + S <= 64, |
421 | "isShiftedUInt<N, S> with N + S > 64 is too wide."); |
422 | // Per the two static_asserts above, S must be strictly less than 64. So |
423 | // 1 << S is not undefined behavior. |
424 | return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425 | } |
426 | |
427 | /// Gets the maximum value for a N-bit unsigned integer. |
428 | inline uint64_t maxUIntN(uint64_t N) { |
429 | assert(N > 0 && N <= 64 && "integer width out of range");
430 | |
431 | // uint64_t(1) << 64 is undefined behavior, so we can't do |
432 | // (uint64_t(1) << N) - 1 |
433 | // without checking first that N != 64. But this works and doesn't have a |
434 | // branch. |
435 | return UINT64_MAX >> (64 - N);
436 | } |
437 | |
438 | /// Gets the minimum value for a N-bit signed integer. |
439 | inline int64_t minIntN(int64_t N) { |
440 | assert(N > 0 && N <= 64 && "integer width out of range");
441 | |
442 | return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443 | } |
444 | |
445 | /// Gets the maximum value for a N-bit signed integer. |
446 | inline int64_t maxIntN(int64_t N) { |
447 | assert(N > 0 && N <= 64 && "integer width out of range");
448 | |
449 | // This relies on two's complement wraparound when N == 64, so we convert to |
450 | // int64_t only at the very end to avoid UB. |
451 | return (UINT64_C(1) << (N - 1)) - 1;
452 | } |
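// Worked examples (illustrative sketch): maxUIntN(8) == 255,
// minIntN(8) == -128, maxIntN(8) == 127; maxUIntN(64) == UINT64_MAX without
// evaluating the undefined shift (1 << 64) discussed above.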
453 | |
454 | /// Checks if an unsigned integer fits into the given (dynamic) bit width. |
455 | inline bool isUIntN(unsigned N, uint64_t x) { |
456 | return N >= 64 || x <= maxUIntN(N); |
457 | } |
458 | |
459 | /// Checks if a signed integer fits into the given (dynamic) bit width.
460 | inline bool isIntN(unsigned N, int64_t x) { |
461 | return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); |
462 | } |
463 | |
464 | /// Return true if the argument is a non-empty sequence of ones starting at the |
465 | /// least significant bit with the remainder zero (32 bit version). |
466 | /// Ex. isMask_32(0x0000FFFFU) == true. |
467 | constexpr inline bool isMask_32(uint32_t Value) { |
468 | return Value && ((Value + 1) & Value) == 0; |
469 | } |
470 | |
471 | /// Return true if the argument is a non-empty sequence of ones starting at the |
472 | /// least significant bit with the remainder zero (64 bit version). |
473 | constexpr inline bool isMask_64(uint64_t Value) { |
474 | return Value && ((Value + 1) & Value) == 0; |
475 | } |
476 | |
477 | /// Return true if the argument contains a non-empty sequence of ones with the |
478 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. |
479 | constexpr inline bool isShiftedMask_32(uint32_t Value) { |
480 | return Value && isMask_32((Value - 1) | Value); |
481 | } |
482 | |
483 | /// Return true if the argument contains a non-empty sequence of ones with the |
484 | /// remainder zero (64 bit version.) |
485 | constexpr inline bool isShiftedMask_64(uint64_t Value) { |
486 | return Value && isMask_64((Value - 1) | Value); |
487 | } |
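// Illustrative compile-time checks (editorial sketch):
static_assert(isMask_32(0x0000FFFF), "contiguous low ones");
static_assert(isShiftedMask_32(0x0000FF00), "contiguous ones, shifted");
static_assert(!isShiftedMask_32(0x00FF00FF), "ones are not contiguous");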
488 | |
489 | /// Return true if the argument is a power of two > 0. |
490 | /// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) |
491 | constexpr inline bool isPowerOf2_32(uint32_t Value) { |
492 | return Value && !(Value & (Value - 1)); |
493 | } |
494 | |
495 | /// Return true if the argument is a power of two > 0 (64 bit edition.) |
496 | constexpr inline bool isPowerOf2_64(uint64_t Value) { |
497 | return Value && !(Value & (Value - 1)); |
498 | } |
499 | |
500 | /// Count the number of ones from the most significant bit to the first |
501 | /// zero bit. |
502 | /// |
503 | /// Ex. countLeadingOnes(0xFF0FFF00) == 8. |
504 | /// Only unsigned integral types are allowed. |
505 | /// |
506 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and |
507 | /// ZB_Undefined are valid arguments. |
508 | template <typename T> |
509 | unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) { |
510 | static_assert(std::numeric_limits<T>::is_integer && |
511 | !std::numeric_limits<T>::is_signed, |
512 | "Only unsigned integral types are allowed."); |
513 | return countLeadingZeros<T>(~Value, ZB); |
514 | } |
515 | |
516 | /// Count the number of ones from the least significant bit to the first |
517 | /// zero bit. |
518 | /// |
519 | /// Ex. countTrailingOnes(0x00FF00FF) == 8. |
520 | /// Only unsigned integral types are allowed. |
521 | /// |
522 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and |
523 | /// ZB_Undefined are valid arguments. |
524 | template <typename T> |
525 | unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) { |
526 | static_assert(std::numeric_limits<T>::is_integer && |
527 | !std::numeric_limits<T>::is_signed, |
528 | "Only unsigned integral types are allowed."); |
529 | return countTrailingZeros<T>(~Value, ZB); |
530 | } |
531 | |
532 | namespace detail { |
533 | template <typename T, std::size_t SizeOfT> struct PopulationCounter { |
534 | static unsigned count(T Value) { |
535 | // Generic version, forward to 32 bits. |
536 | static_assert(SizeOfT <= 4, "Not implemented!"); |
537 | #if defined(__GNUC__)
538 | return __builtin_popcount(Value); |
539 | #else |
540 | uint32_t v = Value; |
541 | v = v - ((v >> 1) & 0x55555555); |
542 | v = (v & 0x33333333) + ((v >> 2) & 0x33333333); |
543 | return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; |
544 | #endif |
545 | } |
546 | }; |
547 | |
548 | template <typename T> struct PopulationCounter<T, 8> { |
549 | static unsigned count(T Value) { |
550 | #if defined(__GNUC__)
551 | return __builtin_popcountll(Value); |
552 | #else |
553 | uint64_t v = Value; |
554 | v = v - ((v >> 1) & 0x5555555555555555ULL); |
555 | v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); |
556 | v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; |
557 | return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); |
558 | #endif |
559 | } |
560 | }; |
561 | } // namespace detail |
562 | |
563 | /// Count the number of set bits in a value. |
564 | /// Ex. countPopulation(0xF000F000) = 8 |
565 | /// Returns 0 if the word is zero. |
566 | template <typename T> |
567 | inline unsigned countPopulation(T Value) { |
568 | static_assert(std::numeric_limits<T>::is_integer && |
569 | !std::numeric_limits<T>::is_signed, |
570 | "Only unsigned integral types are allowed."); |
571 | return detail::PopulationCounter<T, sizeof(T)>::count(Value); |
572 | } |
573 | |
574 | /// Compile time Log2. |
575 | /// Valid only for positive powers of two. |
576 | template <size_t kValue> constexpr inline size_t CTLog2() { |
577 | static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), |
578 | "Value is not a valid power of 2"); |
579 | return 1 + CTLog2<kValue / 2>(); |
580 | } |
581 | |
582 | template <> constexpr inline size_t CTLog2<1>() { return 0; } |
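// Illustrative compile-time check (editorial sketch):
static_assert(CTLog2<64>() == 6, "2^6 == 64");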
583 | |
584 | /// Return the log base 2 of the specified value. |
585 | inline double Log2(double Value) { |
586 | #if defined(__ANDROID_API__) && __ANDROID_API__ < 18 |
587 | return __builtin_log(Value) / __builtin_log(2.0); |
588 | #else |
589 | return log2(Value); |
590 | #endif |
591 | } |
592 | |
593 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. |
594 | /// (32 bit edition.) |
595 | /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 |
596 | inline unsigned Log2_32(uint32_t Value) { |
597 | return 31 - countLeadingZeros(Value); |
598 | } |
599 | |
600 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. |
601 | /// (64 bit edition.) |
602 | inline unsigned Log2_64(uint64_t Value) { |
603 | return 63 - countLeadingZeros(Value); |
604 | } |
605 | |
606 | /// Return the ceil log base 2 of the specified value, 32 if the value is zero. |
607 | /// (32 bit edition). |
608 | /// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 |
609 | inline unsigned Log2_32_Ceil(uint32_t Value) { |
610 | return 32 - countLeadingZeros(Value - 1); |
611 | } |
612 | |
613 | /// Return the ceil log base 2 of the specified value, 64 if the value is zero. |
614 | /// (64 bit edition.) |
615 | inline unsigned Log2_64_Ceil(uint64_t Value) { |
616 | return 64 - countLeadingZeros(Value - 1); |
617 | } |
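// Worked examples (illustrative sketch): Log2_32(32) == 5, Log2_32(33) == 5,
// Log2_32_Ceil(33) == 6; Log2_32(0) computes 31 - 32, which wraps to
// 0xFFFFFFFF in the unsigned return type (the "-1" mentioned above).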
618 | |
619 | /// Return the greatest common divisor of the values using Euclid's algorithm. |
620 | template <typename T> |
621 | inline T greatestCommonDivisor(T A, T B) { |
622 | while (B) { |
623 | T Tmp = B; |
624 | B = A % B; |
625 | A = Tmp; |
626 | } |
627 | return A; |
628 | } |
629 | |
630 | inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { |
631 | return greatestCommonDivisor<uint64_t>(A, B); |
632 | } |
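// Worked example (illustrative sketch): greatestCommonDivisor(12u, 18u) == 6.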
633 | |
634 | /// This function takes a 64-bit integer and returns the bit equivalent double. |
635 | inline double BitsToDouble(uint64_t Bits) { |
636 | double D; |
637 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); |
638 | memcpy(&D, &Bits, sizeof(Bits)); |
639 | return D; |
640 | } |
641 | |
642 | /// This function takes a 32-bit integer and returns the bit equivalent float. |
643 | inline float BitsToFloat(uint32_t Bits) { |
644 | float F; |
645 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); |
646 | memcpy(&F, &Bits, sizeof(Bits)); |
647 | return F; |
648 | } |
649 | |
650 | /// This function takes a double and returns the bit equivalent 64-bit integer. |
651 | /// Note that copying doubles around changes the bits of NaNs on some hosts, |
652 | /// notably x86, so this routine cannot be used if these bits are needed. |
653 | inline uint64_t DoubleToBits(double Double) { |
654 | uint64_t Bits; |
655 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); |
656 | memcpy(&Bits, &Double, sizeof(Double)); |
657 | return Bits; |
658 | } |
659 | |
660 | /// This function takes a float and returns the bit equivalent 32-bit integer. |
661 | /// Note that copying floats around changes the bits of NaNs on some hosts, |
662 | /// notably x86, so this routine cannot be used if these bits are needed. |
663 | inline uint32_t FloatToBits(float Float) { |
664 | uint32_t Bits; |
665 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); |
666 | memcpy(&Bits, &Float, sizeof(Float)); |
667 | return Bits; |
668 | } |
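// Worked round trip (illustrative sketch, IEEE-754 single precision):
// FloatToBits(1.0f) == 0x3F800000 and BitsToFloat(0x3F800000) == 1.0f.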
669 | |
670 | /// A and B are either alignments or offsets. Return the minimum alignment that |
671 | /// may be assumed after adding the two together. |
672 | constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { |
673 | // The largest power of 2 that divides both A and B. |
674 | // |
675 | // Replace "-Value" by "1+~Value" in the following commented code to avoid |
676 | // MSVC warning C4146 |
677 | // return (A | B) & -(A | B); |
678 | return (A | B) & (1 + ~(A | B)); |
679 | } |
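// Illustrative compile-time checks (editorial sketch): the result is the
// largest power of 2 dividing both arguments.
static_assert(MinAlign(8, 4) == 4, "gcd of two powers of two");
static_assert(MinAlign(24, 16) == 8, "lowest set bit of (24 | 16)");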
680 | |
681 | /// Returns the next power of two (in 64-bits) that is strictly greater than A. |
682 | /// Returns zero on overflow. |
683 | inline uint64_t NextPowerOf2(uint64_t A) { |
684 | A |= (A >> 1); |
685 | A |= (A >> 2); |
686 | A |= (A >> 4); |
687 | A |= (A >> 8); |
688 | A |= (A >> 16); |
689 | A |= (A >> 32); |
690 | return A + 1; |
691 | } |
692 | |
693 | /// Returns the power of two which is less than or equal to the given value. |
694 | /// Essentially, it is a floor operation across the domain of powers of two. |
695 | inline uint64_t PowerOf2Floor(uint64_t A) { |
696 | if (!A) return 0; |
697 | return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); |
698 | } |
699 | |
700 | /// Returns the power of two which is greater than or equal to the given value. |
701 | /// Essentially, it is a ceil operation across the domain of powers of two. |
702 | inline uint64_t PowerOf2Ceil(uint64_t A) { |
703 | if (!A) |
704 | return 0; |
705 | return NextPowerOf2(A - 1); |
706 | } |
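// Worked examples (illustrative sketch): NextPowerOf2(16) == 32 (strictly
// greater), PowerOf2Floor(20) == 16, PowerOf2Ceil(20) == 32, and
// PowerOf2Ceil(16) == 16.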
707 | |
708 | /// Returns the next integer (mod 2**64) that is greater than or equal to |
709 | /// \p Value and is a multiple of \p Align. \p Align must be non-zero. |
710 | /// |
711 | /// If non-zero \p Skew is specified, the return value will be a minimal |
712 | /// integer that is greater than or equal to \p Value and equal to |
713 | /// \p Align * N + \p Skew for some integer N. If \p Skew is larger than |
714 | /// \p Align, its value is adjusted to '\p Skew mod \p Align'. |
715 | /// |
716 | /// Examples: |
717 | /// \code |
718 | /// alignTo(5, 8) = 8 |
719 | /// alignTo(17, 8) = 24 |
720 | /// alignTo(~0LL, 8) = 0 |
721 | /// alignTo(321, 255) = 510 |
722 | /// |
723 | /// alignTo(5, 8, 7) = 7 |
724 | /// alignTo(17, 8, 1) = 17 |
725 | /// alignTo(~0LL, 8, 3) = 3 |
726 | /// alignTo(321, 255, 42) = 552 |
727 | /// \endcode |
728 | inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { |
729 | assert(Align != 0u && "Align can't be 0.");
730 | Skew %= Align; |
731 | return (Value + Align - 1 - Skew) / Align * Align + Skew; |
732 | } |
733 | |
734 | /// Returns the next integer (mod 2**64) that is greater than or equal to |
735 | /// \p Value and is a multiple of \c Align. \c Align must be non-zero. |
736 | template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) { |
737 | static_assert(Align != 0u, "Align must be non-zero"); |
738 | return (Value + Align - 1) / Align * Align; |
739 | } |
740 | |
741 | /// Returns the integer ceil(Numerator / Denominator). |
742 | inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { |
743 | return alignTo(Numerator, Denominator) / Denominator; |
744 | } |
745 | |
746 | /// Returns the integer nearest(Numerator / Denominator). |
747 | inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) { |
748 | return (Numerator + (Denominator / 2)) / Denominator; |
749 | } |
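// Worked examples (illustrative sketch): divideCeil(7, 2) == 4,
// divideNearest(7, 2) == 4, and divideNearest(5, 4) == 1.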
750 | |
751 | /// Returns the largest uint64_t that is less than or equal to \p Value and
752 | /// is \p Skew mod \p Align. \p Align must be non-zero.
753 | inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { |
754 | assert(Align != 0u && "Align can't be 0.");
755 | Skew %= Align; |
756 | return (Value - Skew) / Align * Align + Skew; |
757 | } |
758 | |
759 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. |
760 | /// Requires 0 < B <= 32. |
761 | template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) { |
762 | static_assert(B > 0, "Bit width can't be 0."); |
763 | static_assert(B <= 32, "Bit width out of range."); |
764 | return int32_t(X << (32 - B)) >> (32 - B); |
765 | } |
766 | |
767 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. |
768 | /// Requires 0 < B <= 32. |
769 | inline int32_t SignExtend32(uint32_t X, unsigned B) { |
770 | assert(B > 0 && "Bit width can't be 0.");
771 | assert(B <= 32 && "Bit width out of range.");
772 | return int32_t(X << (32 - B)) >> (32 - B); |
773 | } |
774 | |
775 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. |
776 | /// Requires 0 < B <= 64. |
777 | template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) { |
778 | static_assert(B > 0, "Bit width can't be 0."); |
779 | static_assert(B <= 64, "Bit width out of range."); |
780 | return int64_t(x << (64 - B)) >> (64 - B); |
781 | } |
782 | |
783 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. |
784 | /// Requires 0 < B <= 64. |
785 | inline int64_t SignExtend64(uint64_t X, unsigned B) { |
786 | assert(B > 0 && "Bit width can't be 0.");
787 | assert(B <= 64 && "Bit width out of range.");
788 | return int64_t(X << (64 - B)) >> (64 - B); |
789 | } |
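// Worked examples (illustrative sketch): SignExtend32<4>(0xF) == -1 and
// SignExtend32<4>(0x7) == 7; shifting left then arithmetic-shifting right
// replicates bit B-1 into the upper bits.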
790 | |
791 | /// Subtract two unsigned integers, X and Y, of type T and return the absolute |
792 | /// value of the result. |
793 | template <typename T> |
794 | std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) { |
795 | return X > Y ? (X - Y) : (Y - X); |
796 | } |
797 | |
798 | /// Add two unsigned integers, X and Y, of type T. Clamp the result to the |
799 | /// maximum representable value of T on overflow. ResultOverflowed indicates if |
800 | /// the result is larger than the maximum representable value of type T. |
801 | template <typename T> |
802 | std::enable_if_t<std::is_unsigned<T>::value, T> |
803 | SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) { |
804 | bool Dummy; |
805 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
806 | // Hacker's Delight, p. 29 |
807 | T Z = X + Y; |
808 | Overflowed = (Z < X || Z < Y); |
809 | if (Overflowed) |
810 | return std::numeric_limits<T>::max(); |
811 | else |
812 | return Z; |
813 | } |
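// Worked example (illustrative sketch): SaturatingAdd<uint8_t>(200, 100)
// returns 255 and, when a flag pointer is passed, sets *ResultOverflowed.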
814 | |
815 | /// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the |
816 | /// maximum representable value of T on overflow. ResultOverflowed indicates if |
817 | /// the result is larger than the maximum representable value of type T. |
818 | template <typename T> |
819 | std::enable_if_t<std::is_unsigned<T>::value, T> |
820 | SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) { |
821 | bool Dummy; |
822 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
823 | |
824 | // Hacker's Delight, p. 30 has a different algorithm, but we don't use that |
825 | // because it fails for uint16_t (where multiplication can have undefined |
826 | // behavior due to promotion to int), and requires a division in addition |
827 | // to the multiplication. |
828 | |
829 | Overflowed = false; |
830 | |
831 | // Log2(Z) would be either Log2Z or Log2Z + 1. |
832 | // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z |
833 | // will necessarily be less than Log2Max as desired. |
834 | int Log2Z = Log2_64(X) + Log2_64(Y); |
835 | const T Max = std::numeric_limits<T>::max(); |
836 | int Log2Max = Log2_64(Max); |
837 | if (Log2Z < Log2Max) { |
838 | return X * Y; |
839 | } |
840 | if (Log2Z > Log2Max) { |
841 | Overflowed = true; |
842 | return Max; |
843 | } |
844 | |
845 | // We're going to use the top bit, and maybe overflow one |
846 | // bit past it. Multiply all but the bottom bit then add |
847 | // that on at the end. |
848 | T Z = (X >> 1) * Y; |
849 | if (Z & ~(Max >> 1)) { |
850 | Overflowed = true; |
851 | return Max; |
852 | } |
853 | Z <<= 1; |
854 | if (X & 1) |
855 | return SaturatingAdd(Z, Y, ResultOverflowed); |
856 | |
857 | return Z; |
858 | } |
859 | |
860 | /// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to |
861 | /// the product. Clamp the result to the maximum representable value of T on |
862 | /// overflow. ResultOverflowed indicates if the result is larger than the |
863 | /// maximum representable value of type T. |
864 | template <typename T> |
865 | std::enable_if_t<std::is_unsigned<T>::value, T> |
866 | SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) { |
867 | bool Dummy; |
868 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
869 | |
870 | T Product = SaturatingMultiply(X, Y, &Overflowed); |
871 | if (Overflowed) |
872 | return Product; |
873 | |
874 | return SaturatingAdd(A, Product, &Overflowed); |
875 | } |
876 | |
877 | /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. |
878 | extern const float huge_valf; |
879 | |
880 | |
881 | /// Add two signed integers, computing the two's complement truncated result, |
882 | /// returning true if overflow occurred.
883 | template <typename T> |
884 | std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) { |
885 | #if __has_builtin(__builtin_add_overflow)
886 | return __builtin_add_overflow(X, Y, &Result); |
887 | #else |
888 | // Perform the unsigned addition. |
889 | using U = std::make_unsigned_t<T>; |
890 | const U UX = static_cast<U>(X); |
891 | const U UY = static_cast<U>(Y); |
892 | const U UResult = UX + UY; |
893 | |
894 | // Convert to signed. |
895 | Result = static_cast<T>(UResult); |
896 | |
897 | // Adding two positive numbers should result in a positive number. |
898 | if (X > 0 && Y > 0) |
899 | return Result <= 0; |
900 | // Adding two negatives should result in a negative number. |
901 | if (X < 0 && Y < 0) |
902 | return Result >= 0; |
903 | return false; |
904 | #endif |
905 | } |
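// Worked example (illustrative sketch, two's-complement truncation):
//   int8_t R;
//   AddOverflow<int8_t>(100, 100, R); // returns true, leaves R == -56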
906 | |
907 | /// Subtract two signed integers, computing the two's complement truncated |
908 | /// result, returning true if an overflow occurred.
909 | template <typename T> |
910 | std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) { |
911 | #if __has_builtin(__builtin_sub_overflow)
912 | return __builtin_sub_overflow(X, Y, &Result); |
913 | #else |
914 | // Perform the unsigned subtraction.
915 | using U = std::make_unsigned_t<T>; |
916 | const U UX = static_cast<U>(X); |
917 | const U UY = static_cast<U>(Y); |
918 | const U UResult = UX - UY; |
919 | |
920 | // Convert to signed. |
921 | Result = static_cast<T>(UResult); |
922 | |
923 | // Subtracting a positive number from a negative results in a negative number. |
924 | if (X <= 0 && Y > 0) |
925 | return Result >= 0; |
926 | // Subtracting a negative number from a positive results in a positive number. |
927 | if (X >= 0 && Y < 0) |
928 | return Result <= 0; |
929 | return false; |
930 | #endif |
931 | } |
932 | |
933 | /// Multiply two signed integers, computing the two's complement truncated |
934 | /// result, returning true if an overflow occurred.
935 | template <typename T> |
936 | std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) { |
937 | // Perform the unsigned multiplication on absolute values. |
938 | using U = std::make_unsigned_t<T>; |
939 | const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X); |
940 | const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y); |
941 | const U UResult = UX * UY; |
942 | |
943 | // Convert to signed. |
944 | const bool IsNegative = (X < 0) ^ (Y < 0); |
945 | Result = IsNegative ? (0 - UResult) : UResult; |
946 | |
947 | // If any of the args was 0, result is 0 and no overflow occurs. |
948 | if (UX == 0 || UY == 0) |
949 | return false; |
950 | |
951 | // UX and UY are in [1, 2^n], where n is the number of digits. |
952 | // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for |
953 | // positive) divided by an argument compares to the other. |
954 | if (IsNegative) |
955 | return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY; |
956 | else |
957 | return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY; |
958 | } |
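// Worked examples (illustrative sketch): with int8_t R,
// MulOverflow<int8_t>(16, 8, R) returns true (128 does not fit), while
// MulOverflow<int8_t>(-16, 8, R) returns false with R == -128.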
959 | |
960 | } // End llvm namespace |
961 | |
962 | #endif |