X86ISelDAGToDAG.cpp
1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to an X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
22#include "llvm/Config/llvm-config.h"
24#include "llvm/IR/Function.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
34#include <cstdint>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "x86-isel"
39#define PASS_NAME "X86 DAG->DAG Instruction Selection"
40
41STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
42
43static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
46
47static cl::opt<bool> EnablePromoteAnyextLoad(
48 "x86-promote-anyext-load", cl::init(true),
49 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
50
51extern cl::opt<bool> IndirectBranchTracking;
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
57namespace {
58 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
59 /// numbers for the leaves of the matched tree.
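 /// In other words, the leaves of a scaled-index address of the form
 /// [Base + Scale*Index + Disp] (plus an optional segment), e.g. the memory
 /// operand of "movl 16(%rdi,%rcx,4), %eax" (illustrative example).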
60 struct X86ISelAddressMode {
61 enum {
62 RegBase,
63 FrameIndexBase
64 } BaseType = RegBase;
65
66 // This is really a union, discriminated by BaseType!
67 SDValue Base_Reg;
68 int Base_FrameIndex = 0;
69
70 unsigned Scale = 1;
71 SDValue IndexReg;
72 int32_t Disp = 0;
73 SDValue Segment;
74 const GlobalValue *GV = nullptr;
75 const Constant *CP = nullptr;
76 const BlockAddress *BlockAddr = nullptr;
77 const char *ES = nullptr;
78 MCSymbol *MCSym = nullptr;
79 int JT = -1;
80 Align Alignment; // CP alignment.
81 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
82 bool NegateIndex = false;
83
84 X86ISelAddressMode() = default;
85
86 bool hasSymbolicDisplacement() const {
87 return GV != nullptr || CP != nullptr || ES != nullptr ||
88 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
89 }
90
91 bool hasBaseOrIndexReg() const {
92 return BaseType == FrameIndexBase ||
93 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
94 }
95
96 /// Return true if this addressing mode is already RIP-relative.
97 bool isRIPRelative() const {
98 if (BaseType != RegBase) return false;
99 if (RegisterSDNode *RegNode =
100 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
101 return RegNode->getReg() == X86::RIP;
102 return false;
103 }
104
105 void setBaseReg(SDValue Reg) {
106 BaseType = RegBase;
107 Base_Reg = Reg;
108 }
109
110#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
111 void dump(SelectionDAG *DAG = nullptr) {
112 dbgs() << "X86ISelAddressMode " << this << '\n';
113 dbgs() << "Base_Reg ";
114 if (Base_Reg.getNode())
115 Base_Reg.getNode()->dump(DAG);
116 else
117 dbgs() << "nul\n";
118 if (BaseType == FrameIndexBase)
119 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
120 dbgs() << " Scale " << Scale << '\n'
121 << "IndexReg ";
122 if (NegateIndex)
123 dbgs() << "negate ";
124 if (IndexReg.getNode())
125 IndexReg.getNode()->dump(DAG);
126 else
127 dbgs() << "nul\n";
128 dbgs() << " Disp " << Disp << '\n'
129 << "GV ";
130 if (GV)
131 GV->dump();
132 else
133 dbgs() << "nul";
134 dbgs() << " CP ";
135 if (CP)
136 CP->dump();
137 else
138 dbgs() << "nul";
139 dbgs() << '\n'
140 << "ES ";
141 if (ES)
142 dbgs() << ES;
143 else
144 dbgs() << "nul";
145 dbgs() << " MCSym ";
146 if (MCSym)
147 dbgs() << MCSym;
148 else
149 dbgs() << "nul";
150 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
151 }
152#endif
153 };
154}
155
156namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
160 ///
161 class X86DAGToDAGISel final : public SelectionDAGISel {
162 /// Keep a pointer to the X86Subtarget around so that we can
163 /// make the right decision when generating code for different targets.
164 const X86Subtarget *Subtarget;
165
166 /// If true, selector should try to optimize for minimum code size.
167 bool OptForMinSize;
168
169 /// Disable direct TLS access through segment registers.
170 bool IndirectTlsSegRefs;
171
172 public:
173 X86DAGToDAGISel() = delete;
174
175 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178
179 bool runOnMachineFunction(MachineFunction &MF) override {
180 // Reset the subtarget each time through.
181 Subtarget = &MF.getSubtarget<X86Subtarget>();
182 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183 "indirect-tls-seg-refs");
184
185 // OptFor[Min]Size are used in pattern predicates that isel is matching.
186 OptForMinSize = MF.getFunction().hasMinSize();
187 return SelectionDAGISel::runOnMachineFunction(MF);
188 }
189
190 void emitFunctionEntryCode() override;
191
192 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
193
194 void PreprocessISelDAG() override;
195 void PostprocessISelDAG() override;
196
197// Include the pieces autogenerated from the target description.
198#include "X86GenDAGISel.inc"
199
200 private:
201 void Select(SDNode *N) override;
202
203 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
204 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
205 bool AllowSegmentRegForX32 = false);
206 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
207 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
208 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
209 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
210 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
211 unsigned Depth);
212 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
213 unsigned Depth);
214 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215 unsigned Depth);
216 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
217 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
218 SDValue &Scale, SDValue &Index, SDValue &Disp,
219 SDValue &Segment);
220 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
221 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
222 SDValue &Index, SDValue &Disp, SDValue &Segment);
223 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
224 bool selectLEAAddr(SDValue N, SDValue &Base,
225 SDValue &Scale, SDValue &Index, SDValue &Disp,
226 SDValue &Segment);
227 bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
228 SDValue &Index, SDValue &Disp, SDValue &Segment);
229 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectRelocImm(SDValue N, SDValue &Op);
233
234 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
235 SDValue &Base, SDValue &Scale,
236 SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238
239 // Convenience method where P is also root.
240 bool tryFoldLoad(SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment) {
244 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
245 }
246
247 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
248 SDValue &Base, SDValue &Scale,
249 SDValue &Index, SDValue &Disp,
250 SDValue &Segment);
251
252 bool isProfitableToFormMaskedOp(SDNode *N) const;
253
254 /// Implement addressing mode selection for inline asm expressions.
255 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
256 InlineAsm::ConstraintCode ConstraintID,
257 std::vector<SDValue> &OutOps) override;
258
259 void emitSpecialCodeForMain();
260
261 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
262 MVT VT, SDValue &Base, SDValue &Scale,
263 SDValue &Index, SDValue &Disp,
264 SDValue &Segment) {
265 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
266 Base = CurDAG->getTargetFrameIndex(
267 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
268 else if (AM.Base_Reg.getNode())
269 Base = AM.Base_Reg;
270 else
271 Base = CurDAG->getRegister(0, VT);
272
273 Scale = getI8Imm(AM.Scale, DL);
274
275#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
276 // Negate the index if needed.
277 if (AM.NegateIndex) {
278 unsigned NegOpc;
279 switch (VT.SimpleTy) {
280 default:
281 llvm_unreachable("Unsupported VT!");
282 case MVT::i64:
283 NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
284 break;
285 case MVT::i32:
286 NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
287 break;
288 case MVT::i16:
289 NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
290 break;
291 case MVT::i8:
292 NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
293 break;
294 }
295 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
296 AM.IndexReg), 0);
297 AM.IndexReg = Neg;
298 }
299
300 if (AM.IndexReg.getNode())
301 Index = AM.IndexReg;
302 else
303 Index = CurDAG->getRegister(0, VT);
304
305 // These are 32-bit even in 64-bit mode since RIP-relative offset
306 // is 32-bit.
307 if (AM.GV)
308 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
309 MVT::i32, AM.Disp,
310 AM.SymbolFlags);
311 else if (AM.CP)
312 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
313 AM.Disp, AM.SymbolFlags);
314 else if (AM.ES) {
315 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
316 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
317 } else if (AM.MCSym) {
318 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
319 assert(AM.SymbolFlags == 0 && "oo");
320 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
321 } else if (AM.JT != -1) {
322 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
323 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
324 } else if (AM.BlockAddr)
325 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
326 AM.SymbolFlags);
327 else
328 Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);
329
330 if (AM.Segment.getNode())
331 Segment = AM.Segment;
332 else
333 Segment = CurDAG->getRegister(0, MVT::i16);
334 }
335
336 // Utility function to determine whether N is an AMX SDNode right after
337 // lowering but before ISel.
338 bool isAMXSDNode(SDNode *N) const {
339 // Check if N is AMX SDNode:
340 // 1. check result type;
341 // 2. check operand type;
342 for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
343 if (N->getValueType(Idx) == MVT::x86amx)
344 return true;
345 }
346 for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
347 SDValue Op = N->getOperand(Idx);
348 if (Op.getValueType() == MVT::x86amx)
349 return true;
350 }
351 return false;
352 }
353
354 // Utility function to determine whether we should avoid selecting
355 // immediate forms of instructions for better code size or not.
356 // At a high level, we'd like to avoid such instructions when
357 // we have similar constants used within the same basic block
358 // that can be kept in a register.
359 //
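 // For illustration only (not taken from this file): under minsize,
 //   andl $0x12345678, %eax
 //   andl $0x12345678, %ecx
 // encodes the 4-byte immediate twice, whereas hoisting it into a register
 //   movl $0x12345678, %edx
 //   andl %edx, %eax
 //   andl %edx, %ecx
 // pays for the immediate only once.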
360 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
361 uint32_t UseCount = 0;
362
363 // Do not want to hoist if we're not optimizing for size.
364 // TODO: We'd like to remove this restriction.
365 // See the comment in X86InstrInfo.td for more info.
366 if (!CurDAG->shouldOptForSize())
367 return false;
368
369 // Walk all the users of the immediate.
370 for (const SDNode *User : N->users()) {
371 if (UseCount >= 2)
372 break;
373
374 // This user is already selected. Count it as a legitimate use and
375 // move on.
376 if (User->isMachineOpcode()) {
377 UseCount++;
378 continue;
379 }
380
381 // We want to count stores of immediates as real uses.
382 if (User->getOpcode() == ISD::STORE &&
383 User->getOperand(1).getNode() == N) {
384 UseCount++;
385 continue;
386 }
387
388 // We don't currently match users that have > 2 operands (except
389 // for stores, which are handled above)
390 // Those instructions won't match in ISel, for now, and would
391 // be counted incorrectly.
392 // This may change in the future as we add additional instruction
393 // types.
394 if (User->getNumOperands() != 2)
395 continue;
396
397 // If this is a sign-extended 8-bit integer immediate used in an ALU
398 // instruction, there is probably an opcode encoding to save space.
399 auto *C = dyn_cast<ConstantSDNode>(N);
400 if (C && isInt<8>(C->getSExtValue()))
401 continue;
402
403 // Immediates that are used for offsets as part of stack
404 // manipulation should be left alone. These are typically
405 // used to indicate SP offsets for argument passing and
406 // will get pulled into stores/pushes (implicitly).
407 if (User->getOpcode() == X86ISD::ADD ||
408 User->getOpcode() == ISD::ADD ||
409 User->getOpcode() == X86ISD::SUB ||
410 User->getOpcode() == ISD::SUB) {
411
412 // Find the other operand of the add/sub.
413 SDValue OtherOp = User->getOperand(0);
414 if (OtherOp.getNode() == N)
415 OtherOp = User->getOperand(1);
416
417 // Don't count if the other operand is SP.
418 RegisterSDNode *RegNode;
419 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
420 (RegNode = dyn_cast_or_null<RegisterSDNode>(
421 OtherOp->getOperand(1).getNode())))
422 if ((RegNode->getReg() == X86::ESP) ||
423 (RegNode->getReg() == X86::RSP))
424 continue;
425 }
426
427 // ... otherwise, count this and move on.
428 UseCount++;
429 }
430
431 // If we have more than 1 use, then recommend for hoisting.
432 return (UseCount > 1);
433 }
434
435 /// Return a target constant with the specified value of type i8.
436 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
437 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
438 }
439
440 /// Return a target constant with the specified value, of type i32.
441 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
442 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
443 }
444
445 /// Return a target constant with the specified value, of type i64.
446 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
447 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
448 }
449
450 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
451 const SDLoc &DL) {
452 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
453 uint64_t Index = N->getConstantOperandVal(1);
454 MVT VecVT = N->getOperand(0).getSimpleValueType();
455 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
456 }
457
458 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
459 const SDLoc &DL) {
460 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
461 uint64_t Index = N->getConstantOperandVal(2);
462 MVT VecVT = N->getSimpleValueType(0);
463 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
464 }
465
466 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
467 const SDLoc &DL) {
468 assert(VecWidth == 128 && "Unexpected vector width");
469 uint64_t Index = N->getConstantOperandVal(2);
470 MVT VecVT = N->getSimpleValueType(0);
471 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
472 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
473 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
474 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
475 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
476 }
477
478 SDValue getSBBZero(SDNode *N) {
479 SDLoc dl(N);
480 MVT VT = N->getSimpleValueType(0);
481
482 // Create zero.
483 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
484 SDValue Zero =
485 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
486 if (VT == MVT::i64) {
487 Zero = SDValue(
488 CurDAG->getMachineNode(
489 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
490 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
491 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
492 0);
493 }
494
495 // Copy flags to the EFLAGS register and glue it to next node.
496 unsigned Opcode = N->getOpcode();
497 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
498 "Unexpected opcode for SBB materialization");
499 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
500 SDValue EFLAGS =
501 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
502 N->getOperand(FlagOpIndex), SDValue());
503
504 // Create a 64-bit instruction if the result is 64-bits otherwise use the
505 // 32-bit version.
506 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
507 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
508 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
509 return SDValue(
510 CurDAG->getMachineNode(Opc, dl, VTs,
511 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
512 0);
513 }
514
515 // Helper to detect unneeded AND instructions on shift amounts. Called
516 // from PatFrags in tablegen.
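 // For example (illustrative): in (srl X, (and Y, 31)) on i32, Width is 5
 // and the mask 31 has 5 trailing ones, so the AND is unneeded because the
 // shift only consumes the low 5 bits of the count.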
517 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
518 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
519 const APInt &Val = N->getConstantOperandAPInt(1);
520
521 if (Val.countr_one() >= Width)
522 return true;
523
524 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
525 return Mask.countr_one() >= Width;
526 }
527
528 /// Return an SDNode that returns the value of the global base register.
529 /// Output instructions required to initialize the global base register,
530 /// if necessary.
531 SDNode *getGlobalBaseReg();
532
533 /// Return a reference to the TargetMachine, casted to the target-specific
534 /// type.
535 const X86TargetMachine &getTargetMachine() const {
536 return static_cast<const X86TargetMachine &>(TM);
537 }
538
539 /// Return a reference to the TargetInstrInfo, casted to the target-specific
540 /// type.
541 const X86InstrInfo *getInstrInfo() const {
542 return Subtarget->getInstrInfo();
543 }
544
545 /// Return a condition code of the given SDNode
546 X86::CondCode getCondFromNode(SDNode *N) const;
547
548 /// Address-mode matching performs shift-of-and to and-of-shift
549 /// reassociation in order to expose more scaled addressing
550 /// opportunities.
551 bool ComplexPatternFuncMutatesDAG() const override {
552 return true;
553 }
554
555 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
556
557 // Indicates we should prefer to use a non-temporal load for this load.
558 bool useNonTemporalLoad(LoadSDNode *N) const {
559 if (!N->isNonTemporal())
560 return false;
561
562 unsigned StoreSize = N->getMemoryVT().getStoreSize();
563
564 if (N->getAlign().value() < StoreSize)
565 return false;
566
567 switch (StoreSize) {
568 default: llvm_unreachable("Unsupported store size");
569 case 4:
570 case 8:
571 return false;
572 case 16:
573 return Subtarget->hasSSE41();
574 case 32:
575 return Subtarget->hasAVX2();
576 case 64:
577 return Subtarget->hasAVX512();
578 }
579 }
580
581 bool foldLoadStoreIntoMemOperand(SDNode *Node);
582 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
583 bool matchBitExtract(SDNode *Node);
584 bool shrinkAndImmediate(SDNode *N);
585 bool isMaskZeroExtended(SDNode *N) const;
586 bool tryShiftAmountMod(SDNode *N);
587 bool tryShrinkShlLogicImm(SDNode *N);
588 bool tryVPTERNLOG(SDNode *N);
589 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
590 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
591 uint8_t Imm);
592 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
593 bool tryMatchBitSelect(SDNode *N);
594
595 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
596 const SDLoc &dl, MVT VT, SDNode *Node);
597 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
598 const SDLoc &dl, MVT VT, SDNode *Node,
599 SDValue &InGlue);
600
601 bool tryOptimizeRem8Extend(SDNode *N);
602
603 bool onlyUsesZeroFlag(SDValue Flags) const;
604 bool hasNoSignFlagUses(SDValue Flags) const;
605 bool hasNoCarryFlagUses(SDValue Flags) const;
606 };
607
608 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
609 public:
610 static char ID;
611 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
612 CodeGenOptLevel OptLevel)
613 : SelectionDAGISelLegacy(
614 ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
615 };
616}
617
618char X86DAGToDAGISelLegacy::ID = 0;
619
620INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
621
622// Returns true if this masked compare can be implemented legally with this
623// type.
624static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
625 unsigned Opcode = N->getOpcode();
626 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
627 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
628 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
629 // We can get 256-bit 8 element types here without VLX being enabled. When
630 // this happens we will use 512-bit operations and the mask will not be
631 // zero extended.
632 EVT OpVT = N->getOperand(0).getValueType();
633 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
634 // second operand.
635 if (Opcode == X86ISD::STRICT_CMPM)
636 OpVT = N->getOperand(1).getValueType();
637 if (OpVT.is256BitVector() || OpVT.is128BitVector())
638 return Subtarget->hasVLX();
639
640 return true;
641 }
642 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
643 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
644 Opcode == X86ISD::FSETCCM_SAE)
645 return true;
646
647 return false;
648}
649
650// Returns true if we can assume the writer of the mask has zero extended it
651// for us.
652bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
653 // If this is an AND, check if we have a compare on either side. As long as
654 // one side guarantees the mask is zero extended, the AND will preserve those
655 // zeros.
656 if (N->getOpcode() == ISD::AND)
657 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
658 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
659
660 return isLegalMaskCompare(N, Subtarget);
661}
662
663bool
664X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
665 if (OptLevel == CodeGenOptLevel::None)
666 return false;
667
668 if (!N.hasOneUse())
669 return false;
670
671 if (N.getOpcode() != ISD::LOAD)
672 return true;
673
674 // Don't fold non-temporal loads if we have an instruction for them.
675 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
676 return false;
677
678 // If N is a load, do additional profitability checks.
679 if (U == Root) {
680 switch (U->getOpcode()) {
681 default: break;
682 case X86ISD::ADD:
683 case X86ISD::ADC:
684 case X86ISD::SUB:
685 case X86ISD::SBB:
686 case X86ISD::AND:
687 case X86ISD::XOR:
688 case X86ISD::OR:
689 case ISD::ADD:
690 case ISD::UADDO_CARRY:
691 case ISD::AND:
692 case ISD::OR:
693 case ISD::XOR: {
694 SDValue Op1 = U->getOperand(1);
695
696 // If the other operand is an 8-bit immediate we should fold the immediate
697 // instead. This reduces code size.
698 // e.g.
699 // movl 4(%esp), %eax
700 // addl $4, %eax
701 // vs.
702 // movl $4, %eax
703 // addl 4(%esp), %eax
704 // The former is 2 bytes shorter. In the case where the increment is 1,
705 // the saving can be 4 bytes (by using incl %eax).
706 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
707 if (Imm->getAPIntValue().isSignedIntN(8))
708 return false;
709
710 // If this is a 64-bit AND with an immediate that fits in 32-bits,
711 // prefer using the smaller and over folding the load. This is needed to
712 // make sure immediates created by shrinkAndImmediate are always folded.
713 // Ideally we would narrow the load during DAG combine and get the
714 // best of both worlds.
715 if (U->getOpcode() == ISD::AND &&
716 Imm->getAPIntValue().getBitWidth() == 64 &&
717 Imm->getAPIntValue().isIntN(32))
718 return false;
719
720 // If this is really a zext_inreg that can be represented with a movzx
721 // instruction, prefer that.
722 // TODO: We could shrink the load and fold if it is non-volatile.
723 if (U->getOpcode() == ISD::AND &&
724 (Imm->getAPIntValue() == UINT8_MAX ||
725 Imm->getAPIntValue() == UINT16_MAX ||
726 Imm->getAPIntValue() == UINT32_MAX))
727 return false;
728
729 // ADD/SUB can negate the immediate and use the opposite operation
730 // to fit 128 into a sign extended 8 bit immediate.
731 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
732 (-Imm->getAPIntValue()).isSignedIntN(8))
733 return false;
734
735 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
736 (-Imm->getAPIntValue()).isSignedIntN(8) &&
737 hasNoCarryFlagUses(SDValue(U, 1)))
738 return false;
739 }
740
741 // If the other operand is a TLS address, we should fold it instead.
742 // This produces
743 // movl %gs:0, %eax
744 // leal i@NTPOFF(%eax), %eax
745 // instead of
746 // movl $i@NTPOFF, %eax
747 // addl %gs:0, %eax
748 // if the block also has an access to a second TLS address this will save
749 // a load.
750 // FIXME: This is probably also true for non-TLS addresses.
751 if (Op1.getOpcode() == X86ISD::Wrapper) {
752 SDValue Val = Op1.getOperand(0);
753 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
754 return false;
755 }
756
757 // Don't fold load if this matches the BTS/BTR/BTC patterns.
758 // BTS: (or X, (shl 1, n))
759 // BTR: (and X, (rotl -2, n))
760 // BTC: (xor X, (shl 1, n))
761 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
762 if (U->getOperand(0).getOpcode() == ISD::SHL &&
763 isOneConstant(U->getOperand(0).getOperand(0)))
764 return false;
765
766 if (U->getOperand(1).getOpcode() == ISD::SHL &&
767 isOneConstant(U->getOperand(1).getOperand(0)))
768 return false;
769 }
770 if (U->getOpcode() == ISD::AND) {
771 SDValue U0 = U->getOperand(0);
772 SDValue U1 = U->getOperand(1);
773 if (U0.getOpcode() == ISD::ROTL) {
774 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
775 if (C && C->getSExtValue() == -2)
776 return false;
777 }
778
779 if (U1.getOpcode() == ISD::ROTL) {
780 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
781 if (C && C->getSExtValue() == -2)
782 return false;
783 }
784 }
785
786 break;
787 }
788 case ISD::SHL:
789 case ISD::SRA:
790 case ISD::SRL:
791 // Don't fold a load into a shift by immediate. The BMI2 instructions
792 // support folding a load, but not an immediate. The legacy instructions
793 // support folding an immediate, but can't fold a load. Folding an
794 // immediate is preferable to folding a load.
795 if (isa<ConstantSDNode>(U->getOperand(1)))
796 return false;
797
798 break;
799 }
800 }
801
802 // Prevent folding a load if this can be implemented with an insert_subreg or
803 // a move that implicitly zeroes.
804 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
805 isNullConstant(Root->getOperand(2)) &&
806 (Root->getOperand(0).isUndef() ||
807 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
808 return false;
809
810 return true;
811}
812
813// Indicates it is profitable to form an AVX512 masked operation. Returning
814// false will favor a masked register-register move or vblendm and the
815// operation will be selected separately.
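// For example (illustrative): (vselect M, (add X, Y), X) can fold into a
// masked "vpaddd %zmm1, %zmm0, %zmm0 {%k1}", but if the add has other users
// it would be duplicated, so a separate blend is preferred instead.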
816bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
817 assert(
818 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
819 "Unexpected opcode!");
820
821 // If the operation has additional users, the operation will be duplicated.
822 // Check the use count to prevent that.
823 // FIXME: Are there cheap opcodes we might want to duplicate?
824 return N->getOperand(1).hasOneUse();
825}
826
827/// Replace the original chain operand of the call with
828/// load's chain operand and move load below the call's chain operand.
829static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
830 SDValue Call, SDValue OrigChain) {
831 SmallVector<SDValue, 8> Ops;
832 SDValue Chain = OrigChain.getOperand(0);
833 if (Chain.getNode() == Load.getNode())
834 Ops.push_back(Load.getOperand(0));
835 else {
836 assert(Chain.getOpcode() == ISD::TokenFactor &&
837 "Unexpected chain operand");
838 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
839 if (Chain.getOperand(i).getNode() == Load.getNode())
840 Ops.push_back(Load.getOperand(0));
841 else
842 Ops.push_back(Chain.getOperand(i));
843 SDValue NewChain =
844 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
845 Ops.clear();
846 Ops.push_back(NewChain);
847 }
848 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
849 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
850 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
851 Load.getOperand(1), Load.getOperand(2));
852
853 Ops.clear();
854 Ops.push_back(SDValue(Load.getNode(), 1));
855 Ops.append(Call->op_begin() + 1, Call->op_end());
856 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
857}
858
859/// Return true if call address is a load and it can be
860/// moved below CALLSEQ_START and the chains leading up to the call.
861/// Return the CALLSEQ_START by reference as a second output.
862/// In the case of a tail call, there isn't a callseq node between the call
863/// chain and the load.
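/// For illustration (not from this file's comments): this is what allows a
/// call through a function pointer loaded from memory to be selected as
/// "callq *fptr(%rip)" rather than a separate load followed by "callq *%rax".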
864static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
865 // The transformation is somewhat dangerous if the call's chain was glued to
866 // the call. After MoveBelowOrigChain the load is moved between the call and
867 // the chain, this can create a cycle if the load is not folded. So it is
868 // *really* important that we are sure the load will be folded.
869 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
870 return false;
871 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
872 if (!LD ||
873 !LD->isSimple() ||
874 LD->getAddressingMode() != ISD::UNINDEXED ||
875 LD->getExtensionType() != ISD::NON_EXTLOAD)
876 return false;
877
878 // Now let's find the callseq_start.
879 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
880 if (!Chain.hasOneUse())
881 return false;
882 Chain = Chain.getOperand(0);
883 }
884
885 if (!Chain.getNumOperands())
886 return false;
887 // Since we are not checking for AA here, conservatively abort if the chain
888 // writes to memory. It's not safe to move the callee (a load) across a store.
889 if (isa<MemSDNode>(Chain.getNode()) &&
890 cast<MemSDNode>(Chain.getNode())->writeMem())
891 return false;
892 if (Chain.getOperand(0).getNode() == Callee.getNode())
893 return true;
894 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
895 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
896 Callee.getValue(1).hasOneUse())
897 return true;
898 return false;
899}
900
901static bool isEndbrImm64(uint64_t Imm) {
902// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
903// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
904 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
905 return false;
906
907 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
908 0x65, 0x66, 0x67, 0xf0, 0xf2};
909 int i = 24; // the low 24 bits (0x0F1EFA) have already been matched
910 while (i < 64) {
911 uint8_t Byte = (Imm >> i) & 0xFF;
912 if (Byte == 0xF3)
913 return true;
914 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
915 return false;
916 i += 8;
917 }
918
919 return false;
920}
921
922static bool needBWI(MVT VT) {
923 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
924}
925
926void X86DAGToDAGISel::PreprocessISelDAG() {
927 bool MadeChange = false;
928 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
929 E = CurDAG->allnodes_end(); I != E; ) {
930 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
931
932 // This is for CET enhancement.
933 //
934 // ENDBR32 and ENDBR64 have specific opcodes:
935 // ENDBR32: F3 0F 1E FB
936 // ENDBR64: F3 0F 1E FA
937 // We want to ensure that attackers cannot find unintended ENDBR32/64
938 // opcode matches in the binary.
939 // Here is an example:
940 // If the compiler had to generate asm for the following code:
941 // a = 0xF30F1EFA
942 // it could, for example, generate:
943 // mov 0xF30F1EFA, dword ptr[a]
944 // In such a case, the binary would include a gadget that starts
945 // with a fake ENDBR64 opcode. Therefore, we split such generation
946 // into multiple operations so that it does not show up in the binary.
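 // Illustrative sketch (values for explanation only): the constant is
 // rebuilt as NOT of its complement, e.g.
 //   movl $0x0CF0E105, %eax  ; ~0xF30F1EFA
 //   notl %eax               ; yields 0xF30F1EFA without embedding it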
947 if (N->getOpcode() == ISD::Constant) {
948 MVT VT = N->getSimpleValueType(0);
949 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
950 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
951 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
952 // Check that the cf-protection-branch is enabled.
953 Metadata *CFProtectionBranch =
954 MF->getFunction().getParent()->getModuleFlag(
955 "cf-protection-branch");
956 if (CFProtectionBranch || IndirectBranchTracking) {
957 SDLoc dl(N);
958 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
959 Complement = CurDAG->getNOT(dl, Complement, VT);
960 --I;
961 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
962 ++I;
963 MadeChange = true;
964 continue;
965 }
966 }
967 }
968
969 // If this is a target specific AND node with no flag usages, turn it back
970 // into ISD::AND to enable test instruction matching.
971 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
972 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
973 N->getOperand(0), N->getOperand(1));
974 --I;
975 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
976 ++I;
977 MadeChange = true;
978 continue;
979 }
980
981 // Convert vector increment or decrement to sub/add with an all-ones
982 // constant:
983 // add X, <1, 1...> --> sub X, <-1, -1...>
984 // sub X, <1, 1...> --> add X, <-1, -1...>
985 // The all-ones vector constant can be materialized using a pcmpeq
986 // instruction that is commonly recognized as an idiom (has no register
987 // dependency), so that's better/smaller than loading a splat 1 constant.
988 //
989 // But don't do this if it would inhibit a potentially profitable load
990 // folding opportunity for the other operand. That only occurs with the
991 // intersection of:
992 // (1) The other operand (op0) is load foldable.
993 // (2) The op is an add (otherwise, we are *creating* an add and can still
994 // load fold the other op).
995 // (3) The target has AVX (otherwise, we have a destructive add and can't
996 // load fold the other op without killing the constant op).
997 // (4) The constant 1 vector has multiple uses (so it is profitable to load
998 // into a register anyway).
999 auto mayPreventLoadFold = [&]() {
1000 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
1001 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1002 !N->getOperand(1).hasOneUse();
1003 };
1004 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1005 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
1006 APInt SplatVal;
1007 if (!ISD::isFreezeUndef(
1008 peekThroughBitcasts(N->getOperand(0)).getNode()) &&
1009 X86::isConstantSplat(N->getOperand(1), SplatVal) &&
1010 SplatVal.isOne()) {
1011 SDLoc DL(N);
1012
1013 MVT VT = N->getSimpleValueType(0);
1014 unsigned NumElts = VT.getSizeInBits() / 32;
1015 SDValue AllOnes =
1016 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
1017 AllOnes = CurDAG->getBitcast(VT, AllOnes);
1018
1019 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1020 SDValue Res =
1021 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
1022 --I;
1023 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1024 ++I;
1025 MadeChange = true;
1026 continue;
1027 }
1028 }
1029
1030 switch (N->getOpcode()) {
1031 case X86ISD::VBROADCAST: {
1032 MVT VT = N->getSimpleValueType(0);
1033 // Emulate v32i16/v64i8 broadcast without BWI.
1034 if (!Subtarget->hasBWI() && needBWI(VT)) {
1035 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1036 SDLoc dl(N);
1037 SDValue NarrowBCast =
1038 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1039 SDValue Res =
1040 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1041 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1042 unsigned Index = NarrowVT.getVectorMinNumElements();
1043 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1044 CurDAG->getIntPtrConstant(Index, dl));
1045
1046 --I;
1047 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1048 ++I;
1049 MadeChange = true;
1050 continue;
1051 }
1052
1053 break;
1054 }
1055 case X86ISD::VBROADCAST_LOAD: {
1056 MVT VT = N->getSimpleValueType(0);
1057 // Emulate v32i16/v64i8 broadcast without BWI.
1058 if (!Subtarget->hasBWI() && needBWI(VT)) {
1059 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1060 auto *MemNode = cast<MemSDNode>(N);
1061 SDLoc dl(N);
1062 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1063 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1064 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1065 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1066 MemNode->getMemOperand());
1067 SDValue Res =
1068 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1069 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1070 unsigned Index = NarrowVT.getVectorMinNumElements();
1071 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1072 CurDAG->getIntPtrConstant(Index, dl));
1073
1074 --I;
1075 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1076 CurDAG->ReplaceAllUsesWith(N, To);
1077 ++I;
1078 MadeChange = true;
1079 continue;
1080 }
1081
1082 break;
1083 }
1084 case ISD::LOAD: {
1085 // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1086 // load, then just extract the lower subvector and avoid the second load.
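 // Illustrative example: a v4f32 load of a pointer that is also loaded as
 // v16f32 can be replaced by extract_subvector 0 of the wider load, leaving
 // a single memory access.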
1087 auto *Ld = cast<LoadSDNode>(N);
1088 MVT VT = N->getSimpleValueType(0);
1089 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1090 !(VT.is128BitVector() || VT.is256BitVector()))
1091 break;
1092
1093 MVT MaxVT = VT;
1094 SDNode *MaxLd = nullptr;
1095 SDValue Ptr = Ld->getBasePtr();
1096 SDValue Chain = Ld->getChain();
1097 for (SDNode *User : Ptr->users()) {
1098 auto *UserLd = dyn_cast<LoadSDNode>(User);
1099 MVT UserVT = User->getSimpleValueType(0);
1100 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1101 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1102 !User->hasAnyUseOfValue(1) &&
1103 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1104 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1105 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1106 MaxLd = User;
1107 MaxVT = UserVT;
1108 }
1109 }
1110 if (MaxLd) {
1111 SDLoc dl(N);
1112 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1113 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1114 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1115 SDValue(MaxLd, 0),
1116 CurDAG->getIntPtrConstant(0, dl));
1117 SDValue Res = CurDAG->getBitcast(VT, Extract);
1118
1119 --I;
1120 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1121 CurDAG->ReplaceAllUsesWith(N, To);
1122 ++I;
1123 MadeChange = true;
1124 continue;
1125 }
1126 break;
1127 }
1128 case ISD::VSELECT: {
1129 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1130 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1131 if (EleVT == MVT::i1)
1132 break;
1133
1134 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1135 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1136 "We can't replace VSELECT with BLENDV in vXi16!");
1137 SDValue R;
1138 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1139 EleVT.getSizeInBits()) {
1140 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1141 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1142 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1143 } else {
1144 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1145 N->getOperand(0), N->getOperand(1),
1146 N->getOperand(2));
1147 }
1148 --I;
1149 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1150 ++I;
1151 MadeChange = true;
1152 continue;
1153 }
1154 case ISD::FP_ROUND:
1155 case ISD::STRICT_FP_ROUND:
1156 case ISD::FP_TO_SINT:
1157 case ISD::FP_TO_UINT:
1158 case ISD::STRICT_FP_TO_SINT:
1159 case ISD::STRICT_FP_TO_UINT: {
1160 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1161 // don't need 2 sets of patterns.
1162 if (!N->getSimpleValueType(0).isVector())
1163 break;
1164
1165 unsigned NewOpc;
1166 switch (N->getOpcode()) {
1167 default: llvm_unreachable("Unexpected opcode!");
1168 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1169 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1170 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1171 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1172 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1173 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1174 }
1175 SDValue Res;
1176 if (N->isStrictFPOpcode())
1177 Res =
1178 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1179 {N->getOperand(0), N->getOperand(1)});
1180 else
1181 Res =
1182 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1183 N->getOperand(0));
1184 --I;
1185 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1186 ++I;
1187 MadeChange = true;
1188 continue;
1189 }
1190 case ISD::SHL:
1191 case ISD::SRA:
1192 case ISD::SRL: {
1193 // Replace vector shifts with their X86 specific equivalent so we don't
1194 // need 2 sets of patterns.
1195 if (!N->getValueType(0).isVector())
1196 break;
1197
1198 unsigned NewOpc;
1199 switch (N->getOpcode()) {
1200 default: llvm_unreachable("Unexpected opcode!");
1201 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1202 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1203 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1204 }
1205 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1206 N->getOperand(0), N->getOperand(1));
1207 --I;
1208 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1209 ++I;
1210 MadeChange = true;
1211 continue;
1212 }
1213 case ISD::ANY_EXTEND:
1214 case ISD::ANY_EXTEND_VECTOR_INREG: {
1215 // Replace vector any extend with the zero extend equivalents so we don't
1216 // need 2 sets of patterns. Ignore vXi1 extensions.
1217 if (!N->getValueType(0).isVector())
1218 break;
1219
1220 unsigned NewOpc;
1221 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1222 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1223 "Unexpected opcode for mask vector!");
1224 NewOpc = ISD::SIGN_EXTEND;
1225 } else {
1226 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1227 ? ISD::ZERO_EXTEND
1228 : ISD::ZERO_EXTEND_VECTOR_INREG;
1229 }
1230
1231 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1232 N->getOperand(0));
1233 --I;
1234 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1235 ++I;
1236 MadeChange = true;
1237 continue;
1238 }
1239 case ISD::FCEIL:
1240 case ISD::STRICT_FCEIL:
1241 case ISD::FFLOOR:
1242 case ISD::STRICT_FFLOOR:
1243 case ISD::FTRUNC:
1244 case ISD::STRICT_FTRUNC:
1245 case ISD::FROUNDEVEN:
1246 case ISD::STRICT_FROUNDEVEN:
1247 case ISD::FNEARBYINT:
1248 case ISD::STRICT_FNEARBYINT:
1249 case ISD::FRINT:
1250 case ISD::STRICT_FRINT: {
1251 // Replace fp rounding with their X86 specific equivalent so we don't
1252 // need 2 sets of patterns.
1253 unsigned Imm;
1254 switch (N->getOpcode()) {
1255 default: llvm_unreachable("Unexpected opcode!");
1256 case ISD::STRICT_FCEIL:
1257 case ISD::FCEIL: Imm = 0xA; break;
1258 case ISD::STRICT_FFLOOR:
1259 case ISD::FFLOOR: Imm = 0x9; break;
1260 case ISD::STRICT_FTRUNC:
1261 case ISD::FTRUNC: Imm = 0xB; break;
1262 case ISD::STRICT_FROUNDEVEN:
1263 case ISD::FROUNDEVEN: Imm = 0x8; break;
1264 case ISD::STRICT_FNEARBYINT:
1265 case ISD::FNEARBYINT: Imm = 0xC; break;
1266 case ISD::STRICT_FRINT:
1267 case ISD::FRINT: Imm = 0x4; break;
1268 }
1269 SDLoc dl(N);
1270 bool IsStrict = N->isStrictFPOpcode();
1271 SDValue Res;
1272 if (IsStrict)
1273 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1274 {N->getValueType(0), MVT::Other},
1275 {N->getOperand(0), N->getOperand(1),
1276 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1277 else
1278 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1279 N->getOperand(0),
1280 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1281 --I;
1282 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1283 ++I;
1284 MadeChange = true;
1285 continue;
1286 }
1287 case X86ISD::FANDN:
1288 case X86ISD::FAND:
1289 case X86ISD::FOR:
1290 case X86ISD::FXOR: {
1291 // Widen scalar fp logic ops to vector to reduce isel patterns.
1292 // FIXME: Can we do this during lowering/combine.
1293 MVT VT = N->getSimpleValueType(0);
1294 if (VT.isVector() || VT == MVT::f128)
1295 break;
1296
1297 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1298 : VT == MVT::f32 ? MVT::v4f32
1299 : MVT::v8f16;
1300
1301 SDLoc dl(N);
1302 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1303 N->getOperand(0));
1304 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1305 N->getOperand(1));
1306
1307 SDValue Res;
1308 if (Subtarget->hasSSE2()) {
1309 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1310 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1311 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1312 unsigned Opc;
1313 switch (N->getOpcode()) {
1314 default: llvm_unreachable("Unexpected opcode!");
1315 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1316 case X86ISD::FAND: Opc = ISD::AND; break;
1317 case X86ISD::FOR: Opc = ISD::OR; break;
1318 case X86ISD::FXOR: Opc = ISD::XOR; break;
1319 }
1320 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1321 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1322 } else {
1323 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1324 }
1325 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1326 CurDAG->getIntPtrConstant(0, dl));
1327 --I;
1328 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1329 ++I;
1330 MadeChange = true;
1331 continue;
1332 }
1333 }
1334
1335 if (OptLevel != CodeGenOptLevel::None &&
1336 // Only do this when the target can fold the load into the call or
1337 // jmp.
1338 !Subtarget->useIndirectThunkCalls() &&
1339 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1340 (N->getOpcode() == X86ISD::TC_RETURN &&
1341 (Subtarget->is64Bit() ||
1342 !getTargetMachine().isPositionIndependent())))) {
1343 /// Also try moving call address load from outside callseq_start to just
1344 /// before the call to allow it to be folded.
1345 ///
1346 /// [Load chain]
1347 /// ^
1348 /// |
1349 /// [Load]
1350 /// ^ ^
1351 /// | |
1352 /// / \--
1353 /// / |
1354 ///[CALLSEQ_START] |
1355 /// ^ |
1356 /// | |
1357 /// [LOAD/C2Reg] |
1358 /// | |
1359 /// \ /
1360 /// \ /
1361 /// [CALL]
1362 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1363 SDValue Chain = N->getOperand(0);
1364 SDValue Load = N->getOperand(1);
1365 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1366 continue;
1367 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1368 ++NumLoadMoved;
1369 MadeChange = true;
1370 continue;
1371 }
1372
1373 // Lower fpround and fpextend nodes that target the FP stack to be a store and
1374 // load to the stack. This is a gross hack. We would like to simply mark
1375 // these as being illegal, but when we do that, legalize produces these when
1376 // it expands calls, then expands these in the same legalize pass. We would
1377 // like dag combine to be able to hack on these between the call expansion
1378 // and the node legalization. As such this pass basically does "really
1379 // late" legalization of these inline with the X86 isel pass.
1380 // FIXME: This should only happen when not compiled with -O0.
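 // Illustrative example: an x87 f80 -> f32 FP_ROUND is emitted here as a
 // 4-byte truncating store to a stack temporary followed by an f32 reload,
 // matching the store/load sequence built below.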
1381 switch (N->getOpcode()) {
1382 default: continue;
1383 case ISD::FP_ROUND:
1384 case ISD::FP_EXTEND:
1385 {
1386 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1387 MVT DstVT = N->getSimpleValueType(0);
1388
1389 // If any of the sources are vectors, no fp stack involved.
1390 if (SrcVT.isVector() || DstVT.isVector())
1391 continue;
1392
1393 // If the source and destination are SSE registers, then this is a legal
1394 // conversion that should not be lowered.
1395 const X86TargetLowering *X86Lowering =
1396 static_cast<const X86TargetLowering *>(TLI);
1397 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1398 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1399 if (SrcIsSSE && DstIsSSE)
1400 continue;
1401
1402 if (!SrcIsSSE && !DstIsSSE) {
1403 // If this is an FPStack extension, it is a noop.
1404 if (N->getOpcode() == ISD::FP_EXTEND)
1405 continue;
1406 // If this is a value-preserving FPStack truncation, it is a noop.
1407 if (N->getConstantOperandVal(1))
1408 continue;
1409 }
1410
1411 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1412 // FPStack has extload and truncstore. SSE can fold direct loads into other
1413 // operations. Based on this, decide what we want to do.
1414 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1415 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1416 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1417 MachinePointerInfo MPI =
1418 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1419 SDLoc dl(N);
1420
1421 // FIXME: optimize the case where the src/dest is a load or store?
1422
1423 SDValue Store = CurDAG->getTruncStore(
1424 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1425 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1426 MemTmp, MPI, MemVT);
1427
1428 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1429 // extload we created. This will cause general havoc on the DAG because
1430 // anything below the conversion could be folded into other existing nodes.
1431 // To avoid invalidating 'I', back it up to the convert node.
1432 --I;
1433 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1434 break;
1435 }
1436
1437 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1438 // dealing with the chain differently, as there is already a preexisting chain.
1439 case ISD::STRICT_FP_ROUND:
1440 case ISD::STRICT_FP_EXTEND:
1441 {
1442 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1443 MVT DstVT = N->getSimpleValueType(0);
1444
1445 // If any of the sources are vectors, no fp stack involved.
1446 if (SrcVT.isVector() || DstVT.isVector())
1447 continue;
1448
1449 // If the source and destination are SSE registers, then this is a legal
1450 // conversion that should not be lowered.
1451 const X86TargetLowering *X86Lowering =
1452 static_cast<const X86TargetLowering *>(TLI);
1453 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1454 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1455 if (SrcIsSSE && DstIsSSE)
1456 continue;
1457
1458 if (!SrcIsSSE && !DstIsSSE) {
1459 // If this is an FPStack extension, it is a noop.
1460 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1461 continue;
1462 // If this is a value-preserving FPStack truncation, it is a noop.
1463 if (N->getConstantOperandVal(2))
1464 continue;
1465 }
1466
1467 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1468 // FPStack has extload and truncstore. SSE can fold direct loads into other
1469 // operations. Based on this, decide what we want to do.
1470 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1471 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1472 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1473 MachinePointerInfo MPI =
1474 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1475 SDLoc dl(N);
1476
1477 // FIXME: optimize the case where the src/dest is a load or store?
1478
1479 // Since the operation is StrictFP, use the preexisting chain.
1480 SDValue Store, Result;
1481 if (!SrcIsSSE) {
1482 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1483 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1484 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1485 MPI, /*Align*/ std::nullopt,
1486 MachineMemOperand::MOStore);
1487 if (N->getFlags().hasNoFPExcept()) {
1488 SDNodeFlags Flags = Store->getFlags();
1489 Flags.setNoFPExcept(true);
1490 Store->setFlags(Flags);
1491 }
1492 } else {
1493 assert(SrcVT == MemVT && "Unexpected VT!");
1494 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1495 MPI);
1496 }
1497
1498 if (!DstIsSSE) {
1499 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1500 SDValue Ops[] = {Store, MemTmp};
1501 Result = CurDAG->getMemIntrinsicNode(
1502 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1503 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1504 if (N->getFlags().hasNoFPExcept()) {
1505 SDNodeFlags Flags = Result->getFlags();
1506 Flags.setNoFPExcept(true);
1507 Result->setFlags(Flags);
1508 }
1509 } else {
1510 assert(DstVT == MemVT && "Unexpected VT!");
1511 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1512 }
1513
1514 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1515 // extload we created. This will cause general havoc on the DAG because
1516 // anything below the conversion could be folded into other existing nodes.
1517 // To avoid invalidating 'I', back it up to the convert node.
1518 --I;
1519 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1520 break;
1521 }
1522 }
1523
1524
1525 // Now that we did that, the node is dead. Increment the iterator to the
1526 // next node to process, then delete N.
1527 ++I;
1528 MadeChange = true;
1529 }
1530
1531 // Remove any dead nodes that may have been left behind.
1532 if (MadeChange)
1533 CurDAG->RemoveDeadNodes();
1534}
1535
1536// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
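// Illustrative sketch: an 8-bit divrem leaves its results in %al/%ah, which
// are read via MOVZX32rr8_NOREX/MOVSX32rr8_NOREX; a later movzbl/movsbl of
// that already-extended value is redundant, so we reuse the first extend
// (only the 8->64 sign-extend case still needs a MOVSX64rr32).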
1537bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1538 unsigned Opc = N->getMachineOpcode();
1539 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1540 Opc != X86::MOVSX64rr8)
1541 return false;
1542
1543 SDValue N0 = N->getOperand(0);
1544
1545 // We need to be extracting the lower bit of an extend.
1546 if (!N0.isMachineOpcode() ||
1547 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1548 N0.getConstantOperandVal(1) != X86::sub_8bit)
1549 return false;
1550
1551 // We're looking for either a movsx or movzx to match the original opcode.
1552 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1553 : X86::MOVSX32rr8_NOREX;
1554 SDValue N00 = N0.getOperand(0);
1555 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1556 return false;
1557
1558 if (Opc == X86::MOVSX64rr8) {
1559 // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1560 // to 64.
1561 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1562 MVT::i64, N00);
1563 ReplaceUses(N, Extend);
1564 } else {
1565 // Ok we can drop this extend and just use the original extend.
1566 ReplaceUses(N, N00.getNode());
1567 }
1568
1569 return true;
1570}
1571
1572void X86DAGToDAGISel::PostprocessISelDAG() {
1573 // Skip peepholes at -O0.
1574 if (TM.getOptLevel() == CodeGenOptLevel::None)
1575 return;
1576
1577 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1578
1579 bool MadeChange = false;
1580 while (Position != CurDAG->allnodes_begin()) {
1581 SDNode *N = &*--Position;
1582 // Skip dead nodes and any non-machine opcodes.
1583 if (N->use_empty() || !N->isMachineOpcode())
1584 continue;
1585
1586 if (tryOptimizeRem8Extend(N)) {
1587 MadeChange = true;
1588 continue;
1589 }
1590
1591 unsigned Opc = N->getMachineOpcode();
1592 switch (Opc) {
1593 default:
1594 continue;
1595 // ANDrr/rm + TESTrr+ -> TESTrr/TESTmr
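 // For example (illustrative):
 //   andl %esi, %edi
 //   testl %edi, %edi
 // becomes a single "testl %esi, %edi" when the AND result has no other
 // uses, since TEST computes the same flags without writing a register.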
1596 case X86::TEST8rr:
1597 case X86::TEST16rr:
1598 case X86::TEST32rr:
1599 case X86::TEST64rr:
1600 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1601 case X86::CTEST8rr:
1602 case X86::CTEST16rr:
1603 case X86::CTEST32rr:
1604 case X86::CTEST64rr: {
1605 auto &Op0 = N->getOperand(0);
1606 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1607 !Op0.isMachineOpcode())
1608 continue;
1609 SDValue And = N->getOperand(0);
1610#define CASE_ND(OP) \
1611 case X86::OP: \
1612 case X86::OP##_ND:
1613 switch (And.getMachineOpcode()) {
1614 default:
1615 continue;
1616 CASE_ND(AND8rr)
1617 CASE_ND(AND16rr)
1618 CASE_ND(AND32rr)
1619 CASE_ND(AND64rr) {
1620 if (And->hasAnyUseOfValue(1))
1621 continue;
1622 SmallVector<SDValue> Ops(N->op_values());
1623 Ops[0] = And.getOperand(0);
1624 Ops[1] = And.getOperand(1);
1625 MachineSDNode *Test =
1626 CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1627 ReplaceUses(N, Test);
1628 MadeChange = true;
1629 continue;
1630 }
1631 CASE_ND(AND8rm)
1632 CASE_ND(AND16rm)
1633 CASE_ND(AND32rm)
1634 CASE_ND(AND64rm) {
1635 if (And->hasAnyUseOfValue(1))
1636 continue;
1637 unsigned NewOpc;
1638 bool IsCTESTCC = X86::isCTESTCC(Opc);
1639#define FROM_TO(A, B) \
1640 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1641 break;
1642 switch (And.getMachineOpcode()) {
1643 FROM_TO(AND8rm, TEST8mr);
1644 FROM_TO(AND16rm, TEST16mr);
1645 FROM_TO(AND32rm, TEST32mr);
1646 FROM_TO(AND64rm, TEST64mr);
1647 }
1648#undef FROM_TO
1649#undef CASE_ND
1650 // Need to swap the memory and register operand.
1651 SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1652 And.getOperand(3), And.getOperand(4),
1653 And.getOperand(5), And.getOperand(0)};
1654 // CC, Cflags.
1655 if (IsCTESTCC) {
1656 Ops.push_back(N->getOperand(2));
1657 Ops.push_back(N->getOperand(3));
1658 }
1659 // Chain of memory load
1660 Ops.push_back(And.getOperand(6));
1661 // Glue
1662 if (IsCTESTCC)
1663 Ops.push_back(N->getOperand(4));
1664
1665 MachineSDNode *Test = CurDAG->getMachineNode(
1666 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1667 CurDAG->setNodeMemRefs(
1668 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1669 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1670 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1671 MadeChange = true;
1672 continue;
1673 }
1674 }
1675 }
1676 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1677 // used. We're doing this late so we can prefer to fold the AND into masked
1678 // comparisons. Doing that can be better for the live range of the mask
1679 // register.
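// KORTEST k, k sets ZF exactly when k is zero; if k is the result of a KAND,
// that matches the ZF produced by KTEST on the KAND's operands, so the
// separate KAND can be dropped.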
1680 case X86::KORTESTBkk:
1681 case X86::KORTESTWkk:
1682 case X86::KORTESTDkk:
1683 case X86::KORTESTQkk: {
1684 SDValue Op0 = N->getOperand(0);
1685 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1686 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1687 continue;
1688#define CASE(A) \
1689 case X86::A: \
1690 break;
1691 switch (Op0.getMachineOpcode()) {
1692 default:
1693 continue;
1694 CASE(KANDBkk)
1695 CASE(KANDWkk)
1696 CASE(KANDDkk)
1697 CASE(KANDQkk)
1698 }
1699 unsigned NewOpc;
1700#define FROM_TO(A, B) \
1701 case X86::A: \
1702 NewOpc = X86::B; \
1703 break;
1704 switch (Opc) {
1705 FROM_TO(KORTESTBkk, KTESTBkk)
1706 FROM_TO(KORTESTWkk, KTESTWkk)
1707 FROM_TO(KORTESTDkk, KTESTDkk)
1708 FROM_TO(KORTESTQkk, KTESTQkk)
1709 }
1710 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1711 // KAND instructions and KTEST use the same ISA feature.
1712 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1713 continue;
1714#undef FROM_TO
1715 MachineSDNode *KTest = CurDAG->getMachineNode(
1716 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1717 ReplaceUses(N, KTest);
1718 MadeChange = true;
1719 continue;
1720 }
1721 // Attempt to remove vector moves that were inserted to zero upper bits.
1722 case TargetOpcode::SUBREG_TO_REG: {
1723 unsigned SubRegIdx = N->getConstantOperandVal(2);
1724 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1725 continue;
1726
1727 SDValue Move = N->getOperand(1);
1728 if (!Move.isMachineOpcode())
1729 continue;
1730
1731 // Make sure it's one of the move opcodes we recognize.
1732 switch (Move.getMachineOpcode()) {
1733 default:
1734 continue;
1735 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1736 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1737 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1738 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1739 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1740 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1741 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1742 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1743 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1744 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1745 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1746 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1747 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1748 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1749 }
1750#undef CASE
1751
1752 SDValue In = Move.getOperand(0);
1753 if (!In.isMachineOpcode() ||
1754 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1755 continue;
1756
1757 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1758 // the SHA instructions, which use a legacy encoding.
1759 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1760 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1761 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1762 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1763 continue;
1764
1765 // The producing instruction is another vector instruction, so we can drop
1766 // the move.
1767 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1768 MadeChange = true;
1769 }
1770 }
1771 }
1772
1773 if (MadeChange)
1774 CurDAG->RemoveDeadNodes();
1775}
1776
1777
1778/// Emit any code that needs to be executed only in the main function.
1779void X86DAGToDAGISel::emitSpecialCodeForMain() {
1780 if (Subtarget->isTargetCygMing()) {
1781 TargetLowering::ArgListTy Args;
1782 auto &DL = CurDAG->getDataLayout();
1783
1784 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1785 CLI.setChain(CurDAG->getRoot())
1786 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1787 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1788 std::move(Args));
1789 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1790 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1791 CurDAG->setRoot(Result.second);
1792 }
1793}
1794
1795void X86DAGToDAGISel::emitFunctionEntryCode() {
1796 // If this is main, emit special code for main.
1797 const Function &F = MF->getFunction();
1798 if (F.hasExternalLinkage() && F.getName() == "main")
1799 emitSpecialCodeForMain();
1800}
1801
1802static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
1803 // We can run into an issue where a frame index or a register base
1804 // includes a displacement that, when added to the explicit displacement,
1805 // will overflow the displacement field. Assuming that the
1806 // displacement fits into a 31-bit integer (which is only slightly more
1807 // aggressive than the current fundamental assumption that it fits into
1808 // a 32-bit integer), a 31-bit disp should always be safe.
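// i.e. accept anything in [-2^30, 2^30), leaving a bit of headroom below the
// 32-bit signed displacement limit for whatever the base or frame object
// contributes.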
1809 return isInt<31>(Val);
1810}
1811
1812bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1813 X86ISelAddressMode &AM) {
1814 // We may have already matched a displacement and the caller just added the
1815 // symbolic displacement. So we still need to do the checks even if Offset
1816 // is zero.
1817
1818 int64_t Val = AM.Disp + Offset;
1819
1820 // Cannot combine ExternalSymbol displacements with integer offsets.
1821 if (Val != 0 && (AM.ES || AM.MCSym))
1822 return true;
1823
1824 CodeModel::Model M = TM.getCodeModel();
1825 if (Subtarget->is64Bit()) {
1826 if (Val != 0 &&
1827 !X86::isOffsetSuitableForCodeModel(Val, M,
1828 AM.hasSymbolicDisplacement()))
1829 return true;
1830 // In addition to the checks required for a register base, check that
1831 // we do not try to use an unsafe Disp with a frame index.
1832 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1833 !isDispSafeForFrameIndexOrRegBase(Val))
1834 return true;
1835 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1836 // 64 bits. Instructions with 32-bit register addresses perform this zero
1837 // extension for us and we can safely ignore the high bits of Offset.
1838 // Instructions with only a 32-bit immediate address do not, though: they
1839 // sign extend instead. This means only the low 2GB of the address space
1840 // is directly addressable; we need indirect addressing for the high 2GB of
1841 // address space.
1842 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1843 // implicit zero extension of instructions would cover up any problem.
1844 // However, we have asserts elsewhere that get triggered if we do, so keep
1845 // the checks for now.
1846 // TODO: We would actually be able to accept these, as well as the same
1847 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1848 // to get an address size override to be emitted. However, this
1849 // pseudo-register is not part of any register class and therefore causes
1850 // MIR verification to fail.
1851 if (Subtarget->isTarget64BitILP32() &&
1852 !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) &&
1853 !AM.hasBaseOrIndexReg())
1854 return true;
1855 } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
1856 // For 32-bit X86, make sure the displacement still isn't close to the
1857 // expressible limit.
1858 return true;
1859 AM.Disp = Val;
1860 return false;
1861}
1862
1863bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1864 bool AllowSegmentRegForX32) {
1865 SDValue Address = N->getOperand(1);
1866
1867 // load gs:0 -> GS segment register.
1868 // load fs:0 -> FS segment register.
1869 //
1870 // This optimization is generally valid because the GNU TLS model defines that
1871 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1872 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1873 // zero-extended to 64 bits and then added to the base address, which gives
1874 // unwanted results when the register holds a negative value.
1875 // For more information see http://people.redhat.com/drepper/tls.pdf
1876 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1877 !IndirectTlsSegRefs &&
1878 (Subtarget->isTargetGlibc() || Subtarget->isTargetMusl() ||
1879 Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) {
1880 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1881 return true;
1882 switch (N->getPointerInfo().getAddrSpace()) {
1883 case X86AS::GS:
1884 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1885 return false;
1886 case X86AS::FS:
1887 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1888 return false;
1889 // Address space X86AS::SS is not handled here, because it is not used to
1890 // address TLS areas.
1891 }
1892 }
1893
1894 return true;
1895}
1896
1897/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1898/// mode. These wrap things that will resolve down into a symbol reference.
1899/// If no match is possible, this returns true, otherwise it returns false.
1900bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1901 // If the addressing mode already has a symbol as the displacement, we can
1902 // never match another symbol.
1903 if (AM.hasSymbolicDisplacement())
1904 return true;
1905
1906 bool IsRIPRelTLS = false;
1907 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1908 if (IsRIPRel) {
1909 SDValue Val = N.getOperand(0);
1910 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1911 IsRIPRelTLS = true;
1912 }
1913
1914 // We can't use an addressing mode in the 64-bit large code model.
1915 // Global TLS addressing is an exception. In the medium code model,
1916 // we can use such a mode when RIP wrappers are present.
1917 // That signifies access to globals that are known to be "near",
1918 // such as the GOT itself.
1919 CodeModel::Model M = TM.getCodeModel();
1920 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1921 return true;
1922
1923 // Base and index reg must be 0 in order to use %rip as base.
1924 if (IsRIPRel && AM.hasBaseOrIndexReg())
1925 return true;
1926
1927 // Make a local copy in case we can't do this fold.
1928 X86ISelAddressMode Backup = AM;
1929
1930 int64_t Offset = 0;
1931 SDValue N0 = N.getOperand(0);
1932 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1933 AM.GV = G->getGlobal();
1934 AM.SymbolFlags = G->getTargetFlags();
1935 Offset = G->getOffset();
1936 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1937 AM.CP = CP->getConstVal();
1938 AM.Alignment = CP->getAlign();
1939 AM.SymbolFlags = CP->getTargetFlags();
1940 Offset = CP->getOffset();
1941 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1942 AM.ES = S->getSymbol();
1943 AM.SymbolFlags = S->getTargetFlags();
1944 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1945 AM.MCSym = S->getMCSymbol();
1946 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1947 AM.JT = J->getIndex();
1948 AM.SymbolFlags = J->getTargetFlags();
1949 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1950 AM.BlockAddr = BA->getBlockAddress();
1951 AM.SymbolFlags = BA->getTargetFlags();
1952 Offset = BA->getOffset();
1953 } else
1954 llvm_unreachable("Unhandled symbol reference node.");
1955
1956 // Can't use an addressing mode with large globals.
1957 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1958 TM.isLargeGlobalValue(AM.GV)) {
1959 AM = Backup;
1960 return true;
1961 }
1962
1963 if (foldOffsetIntoAddress(Offset, AM)) {
1964 AM = Backup;
1965 return true;
1966 }
1967
1968 if (IsRIPRel)
1969 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1970
1971 // Commit the changes now that we know this fold is safe.
1972 return false;
1973}
1974
1975/// Add the specified node to the specified addressing mode, returning true if
1976/// it cannot be done. This just pattern matches for the addressing mode.
1977bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1978 if (matchAddressRecursively(N, AM, 0))
1979 return true;
1980
1981 // Post-processing: Make a second attempt to fold a load, if we now know
1982 // that there will not be any other register. This is only performed for
1983 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1984 // any foldable load the first time.
1985 if (Subtarget->isTarget64BitILP32() &&
1986 AM.BaseType == X86ISelAddressMode::RegBase &&
1987 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1988 SDValue Save_Base_Reg = AM.Base_Reg;
1989 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1990 AM.Base_Reg = SDValue();
1991 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1992 AM.Base_Reg = Save_Base_Reg;
1993 }
1994 }
1995
1996 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1997 // a smaller encoding and avoids a scaled-index.
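// For example, "leal (,%ecx,2), %eax" becomes "leal (%ecx,%ecx), %eax"; both
// compute 2*%ecx, but the index-only form has to encode a 32-bit zero
// displacement.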
1998 if (AM.Scale == 2 &&
1999 AM.BaseType == X86ISelAddressMode::RegBase &&
2000 AM.Base_Reg.getNode() == nullptr) {
2001 AM.Base_Reg = AM.IndexReg;
2002 AM.Scale = 1;
2003 }
2004
2005 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2006 // because it has a smaller encoding.
2007 if (TM.getCodeModel() != CodeModel::Large &&
2008 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
2009 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2010 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2011 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2012 // However, when GV is a local function symbol and in the same section as
2013 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2014 // referencing GV+Disp generates a relocation referencing the section symbol
2015 // with an even smaller offset, which might underflow. We should bail out if
2016 // the negative offset is too close to INT32_MIN. Actually, we are more
2017 // conservative here, using a smaller magic number also used by
2018 // isOffsetSuitableForCodeModel.
2019 if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
2020 return true;
2021
2022 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
2023 }
2024
2025 return false;
2026}
2027
2028bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2029 unsigned Depth) {
2030 // Add an artificial use to this node so that we can keep track of
2031 // it if it gets CSE'd with a different node.
2032 HandleSDNode Handle(N);
2033
2034 X86ISelAddressMode Backup = AM;
2035 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
2036 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
2037 return false;
2038 AM = Backup;
2039
2040 // Try again after commuting the operands.
2041 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
2042 Depth + 1) &&
2043 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
2044 return false;
2045 AM = Backup;
2046
2047 // If we couldn't fold both operands into the address at the same time,
2048 // see if we can just put each operand into a register and fold at least
2049 // the add.
2050 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2051 !AM.Base_Reg.getNode() &&
2052 !AM.IndexReg.getNode()) {
2053 N = Handle.getValue();
2054 AM.Base_Reg = N.getOperand(0);
2055 AM.IndexReg = N.getOperand(1);
2056 AM.Scale = 1;
2057 return false;
2058 }
2059 N = Handle.getValue();
2060 return true;
2061}
2062
2063// Insert a node into the DAG at least before the Pos node's position. This
2064// will reposition the node as needed, and will assign it a node ID that is <=
2065// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2066// IDs! The selection DAG must no longer depend on their uniqueness when this
2067// is used.
2068static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2069 if (N->getNodeId() == -1 ||
2070 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
2071 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2072 DAG.RepositionNode(Pos->getIterator(), N.getNode());
2073 // Mark Node as invalid for pruning as after this it may be a successor to a
2074 // selected node but otherwise be in the same position of Pos.
2075 // Conservatively mark it with the same -abs(Id) to assure node id
2076 // invariant is preserved.
2077 N->setNodeId(Pos->getNodeId());
2078 SelectionDAGISel::InvalidateNodeId(N.getNode());
2079 }
2080}
2081
2082// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2083// safe. This allows us to convert the shift and AND into an h-register
2084// extract and a scaled index. Returns false if the simplification is
2085// performed.
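// For example, with C1 == 2: "(X >> 6) & 0x3fc" becomes
// "((X >> 8) & 0xff) << 2"; both place bits 15:8 of X at bits 9:2 of the
// result, but the second form is a byte extract whose trailing shift can be
// matched as an index scale of 4.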
2086static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2087 uint64_t Mask,
2088 SDValue Shift, SDValue X,
2089 X86ISelAddressMode &AM) {
2090 if (Shift.getOpcode() != ISD::SRL ||
2091 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2092 !Shift.hasOneUse())
2093 return true;
2094
2095 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2096 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2097 Mask != (0xffu << ScaleLog))
2098 return true;
2099
2100 MVT XVT = X.getSimpleValueType();
2101 MVT VT = N.getSimpleValueType();
2102 SDLoc DL(N);
2103 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2104 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2105 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2106 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2107 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2108 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2109 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2110
2111 // Insert the new nodes into the topological ordering. We must do this in
2112 // a valid topological ordering as nothing is going to go back and re-sort
2113 // these nodes. We continually insert before 'N' in sequence as this is
2114 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2115 // hierarchy left to express.
2116 insertDAGNode(DAG, N, Eight);
2117 insertDAGNode(DAG, N, NewMask);
2118 insertDAGNode(DAG, N, Srl);
2119 insertDAGNode(DAG, N, And);
2120 insertDAGNode(DAG, N, Ext);
2121 insertDAGNode(DAG, N, ShlCount);
2122 insertDAGNode(DAG, N, Shl);
2123 DAG.ReplaceAllUsesWith(N, Shl);
2124 DAG.RemoveDeadNode(N.getNode());
2125 AM.IndexReg = Ext;
2126 AM.Scale = (1 << ScaleLog);
2127 return false;
2128}
2129
2130// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2131// allows us to fold the shift into this addressing mode. Returns false if the
2132// transform succeeded.
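// For example, "(X << 2) & 0x3c" becomes "(X & 0xf) << 2"; the SHL by 2 is
// then absorbed as an index scale of 4 and only the AND with the pre-shifted
// mask remains.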
2133static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2134 X86ISelAddressMode &AM) {
2135 SDValue Shift = N.getOperand(0);
2136
2137 // Use a signed mask so that shifting right will insert sign bits. These
2138 // bits will be removed when we shift the result left so it doesn't matter
2139 // what we use. This might allow a smaller immediate encoding.
2140 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2141
2142 // If we have an any_extend feeding the AND, look through it to see if there
2143 // is a shift behind it. But only if the AND doesn't use the extended bits.
2144 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2145 bool FoundAnyExtend = false;
2146 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2147 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2148 isUInt<32>(Mask)) {
2149 FoundAnyExtend = true;
2150 Shift = Shift.getOperand(0);
2151 }
2152
2153 if (Shift.getOpcode() != ISD::SHL ||
2154 !isa<ConstantSDNode>(Shift.getOperand(1)))
2155 return true;
2156
2157 SDValue X = Shift.getOperand(0);
2158
2159 // Not likely to be profitable if either the AND or SHIFT node has more
2160 // than one use (unless all uses are for address computation). Besides, the
2161 // isel mechanism requires their node ids to be reused.
2162 if (!N.hasOneUse() || !Shift.hasOneUse())
2163 return true;
2164
2165 // Verify that the shift amount is something we can fold.
2166 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2167 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2168 return true;
2169
2170 MVT VT = N.getSimpleValueType();
2171 SDLoc DL(N);
2172 if (FoundAnyExtend) {
2173 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2174 insertDAGNode(DAG, N, NewX);
2175 X = NewX;
2176 }
2177
2178 SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
2179 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2180 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2181
2182 // Insert the new nodes into the topological ordering. We must do this in
2183 // a valid topological ordering as nothing is going to go back and re-sort
2184 // these nodes. We continually insert before 'N' in sequence as this is
2185 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2186 // hierarchy left to express.
2187 insertDAGNode(DAG, N, NewMask);
2188 insertDAGNode(DAG, N, NewAnd);
2189 insertDAGNode(DAG, N, NewShift);
2190 DAG.ReplaceAllUsesWith(N, NewShift);
2191 DAG.RemoveDeadNode(N.getNode());
2192
2193 AM.Scale = 1 << ShiftAmt;
2194 AM.IndexReg = NewAnd;
2195 return false;
2196}
2197
2198// Implement some heroics to detect shifts of masked values where the mask can
2199// be replaced by extending the shift and undoing that in the addressing mode
2200// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2201// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2202// the addressing mode. This results in code such as:
2203//
2204// int f(short *y, int *lookup_table) {
2205// ...
2206// return *y + lookup_table[*y >> 11];
2207// }
2208//
2209// Turning into:
2210// movzwl (%rdi), %eax
2211// movl %eax, %ecx
2212// shrl $11, %ecx
2213// addl (%rsi,%rcx,4), %eax
2214//
2215// Instead of:
2216// movzwl (%rdi), %eax
2217// movl %eax, %ecx
2218// shrl $9, %ecx
2219// andl $124, %rcx
2220// addl (%rsi,%rcx), %eax
2221//
2222// Note that this function assumes the mask is provided as a mask *after* the
2223// value is shifted. The input chain may or may not match that, but computing
2224// such a mask is trivial.
2225static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2226 uint64_t Mask,
2227 SDValue Shift, SDValue X,
2228 X86ISelAddressMode &AM) {
2229 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2230 !isa<ConstantSDNode>(Shift.getOperand(1)))
2231 return true;
2232
2233 // We need to ensure that mask is a continuous run of bits.
2234 unsigned MaskIdx, MaskLen;
2235 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2236 return true;
2237 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2238
2239 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2240
2241 // The amount of shift we're trying to fit into the addressing mode is taken
2242 // from the shifted mask index (number of trailing zeros of the mask).
2243 unsigned AMShiftAmt = MaskIdx;
2244
2245 // There is nothing we can do here unless the mask is removing some bits.
2246 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2247 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2248
2249 // Scale the leading zero count down based on the actual size of the value.
2250 // Also scale it down based on the size of the shift.
2251 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2252 if (MaskLZ < ScaleDown)
2253 return true;
2254 MaskLZ -= ScaleDown;
2255
2256 // The final check is to ensure that any masked out high bits of X are
2257 // already known to be zero. Otherwise, the mask has a semantic impact
2258 // other than masking out a couple of low bits. Unfortunately, because of
2259 // the mask, zero extensions will be removed from operands in some cases.
2260 // This code works extra hard to look through extensions because we can
2261 // replace them with zero extensions cheaply if necessary.
2262 bool ReplacingAnyExtend = false;
2263 if (X.getOpcode() == ISD::ANY_EXTEND) {
2264 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2265 X.getOperand(0).getSimpleValueType().getSizeInBits();
2266 // Assume that we'll replace the any-extend with a zero-extend, and
2267 // narrow the search to the extended value.
2268 X = X.getOperand(0);
2269 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2270 ReplacingAnyExtend = true;
2271 }
2272 APInt MaskedHighBits =
2273 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2274 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2275 return true;
2276
2277 // We've identified a pattern that can be transformed into a single shift
2278 // and an addressing mode. Make it so.
2279 MVT VT = N.getSimpleValueType();
2280 if (ReplacingAnyExtend) {
2281 assert(X.getValueType() != VT);
2282 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2283 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2284 insertDAGNode(DAG, N, NewX);
2285 X = NewX;
2286 }
2287
2288 MVT XVT = X.getSimpleValueType();
2289 SDLoc DL(N);
2290 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2291 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2292 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2293 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2294 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2295
2296 // Insert the new nodes into the topological ordering. We must do this in
2297 // a valid topological ordering as nothing is going to go back and re-sort
2298 // these nodes. We continually insert before 'N' in sequence as this is
2299 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2300 // hierarchy left to express.
2301 insertDAGNode(DAG, N, NewSRLAmt);
2302 insertDAGNode(DAG, N, NewSRL);
2303 insertDAGNode(DAG, N, NewExt);
2304 insertDAGNode(DAG, N, NewSHLAmt);
2305 insertDAGNode(DAG, N, NewSHL);
2306 DAG.ReplaceAllUsesWith(N, NewSHL);
2307 DAG.RemoveDeadNode(N.getNode());
2308
2309 AM.Scale = 1 << AMShiftAmt;
2310 AM.IndexReg = NewExt;
2311 return false;
2312}
2313
2314// Transform "(X >> SHIFT) & (MASK << C1)" to
2315// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2316// matched to a BEXTR later. Returns false if the simplification is performed.
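// For example, "(X >> 4) & (0x7 << 1)" becomes "((X >> 5) & 0x7) << 1"; the
// inner shift-and-mask is the shape matchBEXTRFromAndImm looks for and the
// trailing << 1 is matched as an index scale of 2.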
2317static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2318 uint64_t Mask,
2319 SDValue Shift, SDValue X,
2320 X86ISelAddressMode &AM,
2321 const X86Subtarget &Subtarget) {
2322 if (Shift.getOpcode() != ISD::SRL ||
2323 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2324 !Shift.hasOneUse() || !N.hasOneUse())
2325 return true;
2326
2327 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2328 if (!Subtarget.hasTBM() &&
2329 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2330 return true;
2331
2332 // We need to ensure that mask is a continuous run of bits.
2333 unsigned MaskIdx, MaskLen;
2334 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2335 return true;
2336
2337 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2338
2339 // The amount of shift we're trying to fit into the addressing mode is taken
2340 // from the shifted mask index (number of trailing zeros of the mask).
2341 unsigned AMShiftAmt = MaskIdx;
2342
2343 // There is nothing we can do here unless the mask is removing some bits.
2344 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2345 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2346
2347 MVT XVT = X.getSimpleValueType();
2348 MVT VT = N.getSimpleValueType();
2349 SDLoc DL(N);
2350 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2351 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2352 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2353 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2354 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2355 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2356 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2357
2358 // Insert the new nodes into the topological ordering. We must do this in
2359 // a valid topological ordering as nothing is going to go back and re-sort
2360 // these nodes. We continually insert before 'N' in sequence as this is
2361 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2362 // hierarchy left to express.
2363 insertDAGNode(DAG, N, NewSRLAmt);
2364 insertDAGNode(DAG, N, NewSRL);
2365 insertDAGNode(DAG, N, NewMask);
2366 insertDAGNode(DAG, N, NewAnd);
2367 insertDAGNode(DAG, N, NewExt);
2368 insertDAGNode(DAG, N, NewSHLAmt);
2369 insertDAGNode(DAG, N, NewSHL);
2370 DAG.ReplaceAllUsesWith(N, NewSHL);
2371 DAG.RemoveDeadNode(N.getNode());
2372
2373 AM.Scale = 1 << AMShiftAmt;
2374 AM.IndexReg = NewExt;
2375 return false;
2376}
2377
2378// Attempt to peek further into a scaled index register, collecting additional
2379// extensions / offsets / etc. Returns \p N if we can't peek any further.
2380SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2381 X86ISelAddressMode &AM,
2382 unsigned Depth) {
2383 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2384 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2385 "Illegal index scale");
2386
2387 // Limit recursion.
2388 if (Depth >= SelectionDAG::MaxRecursionDepth)
2389 return N;
2390
2391 EVT VT = N.getValueType();
2392 unsigned Opc = N.getOpcode();
2393
2394 // index: add(x,c) -> index: x, disp + c
2395 if (CurDAG->isBaseWithConstantOffset(N)) {
2396 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2397 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2398 if (!foldOffsetIntoAddress(Offset, AM))
2399 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2400 }
2401
2402 // index: add(x,x) -> index: x, scale * 2
2403 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2404 if (AM.Scale <= 4) {
2405 AM.Scale *= 2;
2406 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2407 }
2408 }
2409
2410 // index: shl(x,i) -> index: x, scale * (1 << i)
2411 if (Opc == X86ISD::VSHLI) {
2412 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2413 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2414 if ((AM.Scale * ScaleAmt) <= 8) {
2415 AM.Scale *= ScaleAmt;
2416 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2417 }
2418 }
2419
2420 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2421 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2422 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2423 SDValue Src = N.getOperand(0);
2424 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2425 Src.hasOneUse()) {
2426 if (CurDAG->isBaseWithConstantOffset(Src)) {
2427 SDValue AddSrc = Src.getOperand(0);
2428 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2429 int64_t Offset = AddVal->getSExtValue();
2430 if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
2431 SDLoc DL(N);
2432 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2433 SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
2434 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2435 insertDAGNode(*CurDAG, N, ExtSrc);
2436 insertDAGNode(*CurDAG, N, ExtVal);
2437 insertDAGNode(*CurDAG, N, ExtAdd);
2438 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2439 CurDAG->RemoveDeadNode(N.getNode());
2440 return ExtSrc;
2441 }
2442 }
2443 }
2444 }
2445
2446 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2447 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2448 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt zext?
2449 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2450 SDValue Src = N.getOperand(0);
2451 unsigned SrcOpc = Src.getOpcode();
2452 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2453 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2454 Src.hasOneUse()) {
2455 if (CurDAG->isBaseWithConstantOffset(Src)) {
2456 SDValue AddSrc = Src.getOperand(0);
2457 uint64_t Offset = Src.getConstantOperandVal(1);
2458 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2459 SDLoc DL(N);
2460 SDValue Res;
2461 // If we're also scaling, see if we can use that as well.
2462 if (AddSrc.getOpcode() == ISD::SHL &&
2463 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2464 SDValue ShVal = AddSrc.getOperand(0);
2465 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2466 APInt HiBits =
2467 APInt::getHighBitsSet(ShVal.getValueSizeInBits(), ShAmt);
2468 uint64_t ScaleAmt = 1ULL << ShAmt;
2469 if ((AM.Scale * ScaleAmt) <= 8 &&
2470 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2471 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2472 AM.Scale *= ScaleAmt;
2473 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2474 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2475 AddSrc.getOperand(1));
2476 insertDAGNode(*CurDAG, N, ExtShVal);
2477 insertDAGNode(*CurDAG, N, ExtShift);
2478 AddSrc = ExtShift;
2479 Res = ExtShVal;
2480 }
2481 }
2482 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2483 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2484 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2485 insertDAGNode(*CurDAG, N, ExtSrc);
2486 insertDAGNode(*CurDAG, N, ExtVal);
2487 insertDAGNode(*CurDAG, N, ExtAdd);
2488 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2489 CurDAG->RemoveDeadNode(N.getNode());
2490 return Res ? Res : ExtSrc;
2491 }
2492 }
2493 }
2494 }
2495
2496 // TODO: Handle extensions, shifted masks etc.
2497 return N;
2498}
2499
2500bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2501 unsigned Depth) {
2502 LLVM_DEBUG({
2503 dbgs() << "MatchAddress: ";
2504 AM.dump(CurDAG);
2505 });
2506 // Limit recursion.
2507 if (Depth >= SelectionDAG::MaxRecursionDepth)
2508 return matchAddressBase(N, AM);
2509
2510 // If this is already a %rip relative address, we can only merge immediates
2511 // into it. Instead of handling this in every case, we handle it here.
2512 // RIP relative addressing: %rip + 32-bit displacement!
2513 if (AM.isRIPRelative()) {
2514 // FIXME: JumpTable and ExternalSymbol address currently don't like
2515 // displacements. It isn't very important, but this should be fixed for
2516 // consistency.
2517 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2518 return true;
2519
2520 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2521 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2522 return false;
2523 return true;
2524 }
2525
2526 switch (N.getOpcode()) {
2527 default: break;
2528 case ISD::LOCAL_RECOVER: {
2529 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2530 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2531 // Use the symbol and don't prefix it.
2532 AM.MCSym = ESNode->getMCSymbol();
2533 return false;
2534 }
2535 break;
2536 }
2537 case ISD::Constant: {
2538 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2539 if (!foldOffsetIntoAddress(Val, AM))
2540 return false;
2541 break;
2542 }
2543
2544 case X86ISD::Wrapper:
2545 case X86ISD::WrapperRIP:
2546 if (!matchWrapper(N, AM))
2547 return false;
2548 break;
2549
2550 case ISD::LOAD:
2551 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2552 return false;
2553 break;
2554
2555 case ISD::FrameIndex:
2556 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2557 AM.Base_Reg.getNode() == nullptr &&
2558 (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) {
2559 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2560 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2561 return false;
2562 }
2563 break;
2564
2565 case ISD::SHL:
2566 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2567 break;
2568
2569 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2570 unsigned Val = CN->getZExtValue();
2571 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2572 // that the base operand remains free for further matching. If
2573 // the base doesn't end up getting used, a post-processing step
2574 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2575 if (Val == 1 || Val == 2 || Val == 3) {
2576 SDValue ShVal = N.getOperand(0);
2577 AM.Scale = 1 << Val;
2578 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2579 return false;
2580 }
2581 }
2582 break;
2583
2584 case ISD::SRL: {
2585 // Scale must not be used already.
2586 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2587
2588 // We only handle up to 64-bit values here as those are what matter for
2589 // addressing mode optimizations.
2590 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2591 "Unexpected value size!");
2592
2593 SDValue And = N.getOperand(0);
2594 if (And.getOpcode() != ISD::AND) break;
2595 SDValue X = And.getOperand(0);
2596
2597 // The mask used for the transform is expected to be post-shift, but we
2598 // found the shift first so just apply the shift to the mask before passing
2599 // it down.
2600 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2601 !isa<ConstantSDNode>(And.getOperand(1)))
2602 break;
2603 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2604
2605 // Try to fold the mask and shift into the scale, and return false if we
2606 // succeed.
2607 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2608 return false;
2609 break;
2610 }
2611
2612 case ISD::SMUL_LOHI:
2613 case ISD::UMUL_LOHI:
2614 // A mul_lohi where we need the low part can be folded as a plain multiply.
2615 if (N.getResNo() != 0) break;
2616 [[fallthrough]];
2617 case ISD::MUL:
2618 case X86ISD::MUL_IMM:
2619 // X*[3,5,9] -> X+X*[2,4,8]
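// e.g. X*5 is matched with Base = Index = X and Scale = 4, which LEA
// computes as "lea (%reg,%reg,4)".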
2620 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2621 AM.Base_Reg.getNode() == nullptr &&
2622 AM.IndexReg.getNode() == nullptr) {
2623 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2624 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2625 CN->getZExtValue() == 9) {
2626 AM.Scale = unsigned(CN->getZExtValue())-1;
2627
2628 SDValue MulVal = N.getOperand(0);
2629 SDValue Reg;
2630
2631 // Okay, we know that we have a scale by now. However, if the scaled
2632 // value is an add of something and a constant, we can fold the
2633 // constant into the disp field here.
2634 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2635 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2636 Reg = MulVal.getOperand(0);
2637 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2638 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2639 if (foldOffsetIntoAddress(Disp, AM))
2640 Reg = N.getOperand(0);
2641 } else {
2642 Reg = N.getOperand(0);
2643 }
2644
2645 AM.IndexReg = AM.Base_Reg = Reg;
2646 return false;
2647 }
2648 }
2649 break;
2650
2651 case ISD::SUB: {
2652 // Given A-B, if A can be completely folded into the address (leaving the
2653 // index field unused), use -B as the index.
2654 // This is a win if A has multiple parts that can be folded into
2655 // the address. Also, this saves a mov if the base register has
2656 // other uses, since it avoids a two-address sub instruction, however
2657 // it costs an additional mov if the index register has other uses.
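// In other words, A-B is rewritten as A + (-B): A is folded into the base and
// displacement fields, -B becomes the index (AM.NegateIndex), and the NEG is
// only materialized later if the LEA turns out to be used.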
2658
2659 // Add an artificial use to this node so that we can keep track of
2660 // it if it gets CSE'd with a different node.
2661 HandleSDNode Handle(N);
2662
2663 // Test if the LHS of the sub can be folded.
2664 X86ISelAddressMode Backup = AM;
2665 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2666 N = Handle.getValue();
2667 AM = Backup;
2668 break;
2669 }
2670 N = Handle.getValue();
2671 // Test if the index field is free for use.
2672 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2673 AM = Backup;
2674 break;
2675 }
2676
2677 int Cost = 0;
2678 SDValue RHS = N.getOperand(1);
2679 // If the RHS involves a register with multiple uses, this
2680 // transformation incurs an extra mov, due to the neg instruction
2681 // clobbering its operand.
2682 if (!RHS.getNode()->hasOneUse() ||
2683 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2684 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2685 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2686 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2687 RHS.getOperand(0).getValueType() == MVT::i32))
2688 ++Cost;
2689 // If the base is a register with multiple uses, this
2690 // transformation may save a mov.
2691 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2692 !AM.Base_Reg.getNode()->hasOneUse()) ||
2693 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2694 --Cost;
2695 // If the folded LHS was interesting, this transformation saves
2696 // address arithmetic.
2697 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2698 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2699 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2700 --Cost;
2701 // If it doesn't look like it may be an overall win, don't do it.
2702 if (Cost >= 0) {
2703 AM = Backup;
2704 break;
2705 }
2706
2707 // Ok, the transformation is legal and appears profitable. Go for it.
2708 // Negation will be emitted later to avoid creating dangling nodes if this
2709 // was an unprofitable LEA.
2710 AM.IndexReg = RHS;
2711 AM.NegateIndex = true;
2712 AM.Scale = 1;
2713 return false;
2714 }
2715
2716 case ISD::OR:
2717 case ISD::XOR:
2718 // See if we can treat the OR/XOR node as an ADD node.
2719 if (!CurDAG->isADDLike(N))
2720 break;
2721 [[fallthrough]];
2722 case ISD::ADD:
2723 if (!matchAdd(N, AM, Depth))
2724 return false;
2725 break;
2726
2727 case ISD::AND: {
2728 // Perform some heroic transforms on an and of a constant-count shift
2729 // with a constant to enable use of the scaled offset field.
2730
2731 // Scale must not be used already.
2732 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2733
2734 // We only handle up to 64-bit values here as those are what matter for
2735 // addressing mode optimizations.
2736 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2737 "Unexpected value size!");
2738
2739 if (!isa<ConstantSDNode>(N.getOperand(1)))
2740 break;
2741
2742 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2743 SDValue Shift = N.getOperand(0);
2744 SDValue X = Shift.getOperand(0);
2745
2746 uint64_t Mask = N.getConstantOperandVal(1);
2747
2748 // Try to fold the mask and shift into an extract and scale.
2749 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2750 return false;
2751
2752 // Try to fold the mask and shift directly into the scale.
2753 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2754 return false;
2755
2756 // Try to fold the mask and shift into BEXTR and scale.
2757 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2758 return false;
2759 }
2760
2761 // Try to swap the mask and shift to place shifts which can be done as
2762 // a scale on the outside of the mask.
2763 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2764 return false;
2765
2766 break;
2767 }
2768 case ISD::ZERO_EXTEND: {
2769 // Try to widen a zexted shift left to the same size as its use, so we can
2770 // match the shift as a scale factor.
2771 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2772 break;
2773
2774 SDValue Src = N.getOperand(0);
2775
2776 // See if we can match a zext(addlike(x,c)).
2777 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2778 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2779 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2780 if (Index != N) {
2781 AM.IndexReg = Index;
2782 return false;
2783 }
2784
2785 // Peek through mask: zext(and(shl(x,c1),c2))
2786 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2787 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2788 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2789 Mask = MaskC->getAPIntValue();
2790 Src = Src.getOperand(0);
2791 }
2792
2793 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2794 // Give up if the shift is not a valid scale factor [1,2,3].
2795 SDValue ShlSrc = Src.getOperand(0);
2796 SDValue ShlAmt = Src.getOperand(1);
2797 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2798 if (!ShAmtC)
2799 break;
2800 unsigned ShAmtV = ShAmtC->getZExtValue();
2801 if (ShAmtV > 3)
2802 break;
2803
2804 // The narrow shift must only shift out zero bits (it must be 'nuw').
2805 // That makes it safe to widen to the destination type.
2806 APInt HighZeros =
2807 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2808 if (!Src->getFlags().hasNoUnsignedWrap() &&
2809 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2810 break;
2811
2812 // zext (shl nuw i8 %x, C1) to i32
2813 // --> shl (zext i8 %x to i32), (zext C1)
2814 // zext (and (shl nuw i8 %x, C1), C2) to i32
2815 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2816 MVT SrcVT = ShlSrc.getSimpleValueType();
2817 MVT VT = N.getSimpleValueType();
2818 SDLoc DL(N);
2819
2820 SDValue Res = ShlSrc;
2821 if (!Mask.isAllOnes()) {
2822 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2823 insertDAGNode(*CurDAG, N, Res);
2824 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2825 insertDAGNode(*CurDAG, N, Res);
2826 }
2827 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2828 insertDAGNode(*CurDAG, N, Zext);
2829 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2830 insertDAGNode(*CurDAG, N, NewShl);
2831 CurDAG->ReplaceAllUsesWith(N, NewShl);
2832 CurDAG->RemoveDeadNode(N.getNode());
2833
2834 // Convert the shift to scale factor.
2835 AM.Scale = 1 << ShAmtV;
2836 // If matchIndexRecursively is not called here, Zext may be replaced by
2837 // other nodes but later still be used to call a builder method, leaving a
2838 // stale reference; matching here returns the up-to-date index instead.
2839 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2840 return false;
2841 }
2842
2843 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2844 // Try to fold the mask and shift into an extract and scale.
2845 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2846 Src.getOperand(0), AM))
2847 return false;
2848
2849 // Try to fold the mask and shift directly into the scale.
2850 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2851 Src.getOperand(0), AM))
2852 return false;
2853
2854 // Try to fold the mask and shift into BEXTR and scale.
2855 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2856 Src.getOperand(0), AM, *Subtarget))
2857 return false;
2858 }
2859
2860 break;
2861 }
2862 }
2863
2864 return matchAddressBase(N, AM);
2865}
2866
2867/// Helper for MatchAddress. Add the specified node to the
2868/// specified addressing mode without any further recursion.
2869bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2870 // Is the base register already occupied?
2871 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2872 // If so, check to see if the scale index register is set.
2873 if (!AM.IndexReg.getNode()) {
2874 AM.IndexReg = N;
2875 AM.Scale = 1;
2876 return false;
2877 }
2878
2879 // Otherwise, we cannot select it.
2880 return true;
2881 }
2882
2883 // Default, generate it as a register.
2884 AM.BaseType = X86ISelAddressMode::RegBase;
2885 AM.Base_Reg = N;
2886 return false;
2887}
2888
2889bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2890 X86ISelAddressMode &AM,
2891 unsigned Depth) {
2892 LLVM_DEBUG({
2893 dbgs() << "MatchVectorAddress: ";
2894 AM.dump(CurDAG);
2895 });
2896 // Limit recursion.
2897 if (Depth >= SelectionDAG::MaxRecursionDepth)
2898 return matchAddressBase(N, AM);
2899
2900 // TODO: Support other operations.
2901 switch (N.getOpcode()) {
2902 case ISD::Constant: {
2903 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2904 if (!foldOffsetIntoAddress(Val, AM))
2905 return false;
2906 break;
2907 }
2908 case X86ISD::Wrapper:
2909 if (!matchWrapper(N, AM))
2910 return false;
2911 break;
2912 case ISD::ADD: {
2913 // Add an artificial use to this node so that we can keep track of
2914 // it if it gets CSE'd with a different node.
2915 HandleSDNode Handle(N);
2916
2917 X86ISelAddressMode Backup = AM;
2918 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2919 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2920 Depth + 1))
2921 return false;
2922 AM = Backup;
2923
2924 // Try again after commuting the operands.
2925 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2926 Depth + 1) &&
2927 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2928 Depth + 1))
2929 return false;
2930 AM = Backup;
2931
2932 N = Handle.getValue();
2933 break;
2934 }
2935 }
2936
2937 return matchAddressBase(N, AM);
2938}
2939
2940/// Helper for selectVectorAddr. Handles things that can be folded into a
2941/// gather/scatter address. The index register and scale should have already
2942/// been handled.
2943bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2944 return matchVectorAddressRecursively(N, AM, 0);
2945}
2946
2947bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2948 SDValue IndexOp, SDValue ScaleOp,
2949 SDValue &Base, SDValue &Scale,
2950 SDValue &Index, SDValue &Disp,
2951 SDValue &Segment) {
2952 X86ISelAddressMode AM;
2953 AM.Scale = ScaleOp->getAsZExtVal();
2954
2955 // Attempt to match index patterns, as long as we're not relying on implicit
2956 // sign-extension, which is performed BEFORE scale.
2957 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2958 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2959 else
2960 AM.IndexReg = IndexOp;
2961
2962 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2963 if (AddrSpace == X86AS::GS)
2964 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2965 if (AddrSpace == X86AS::FS)
2966 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2967 if (AddrSpace == X86AS::SS)
2968 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2969
2970 SDLoc DL(BasePtr);
2971 MVT VT = BasePtr.getSimpleValueType();
2972
2973 // Try to match into the base and displacement fields.
2974 if (matchVectorAddress(BasePtr, AM))
2975 return false;
2976
2977 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2978 return true;
2979}
2980
2981/// Returns true if it is able to pattern match an addressing mode.
2982/// It returns the operands which make up the maximal addressing mode it can
2983/// match by reference.
2984///
2985/// Parent is the parent node of the addr operand that is being matched. It
2986/// is always a load, store, atomic node, or null. It is only null when
2987/// checking memory operands for inline asm nodes.
2988bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2989 SDValue &Scale, SDValue &Index,
2990 SDValue &Disp, SDValue &Segment) {
2991 X86ISelAddressMode AM;
2992
2993 if (Parent &&
2994 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2995 // that are not a MemSDNode, and thus don't have proper addrspace info.
2996 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2997 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2998 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2999 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
3000 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3001 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3002 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3003 unsigned AddrSpace =
3004 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
3005 if (AddrSpace == X86AS::GS)
3006 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3007 if (AddrSpace == X86AS::FS)
3008 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3009 if (AddrSpace == X86AS::SS)
3010 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3011 }
3012
3013 // Save the DL and VT before calling matchAddress, it can invalidate N.
3014 SDLoc DL(N);
3015 MVT VT = N.getSimpleValueType();
3016
3017 if (matchAddress(N, AM))
3018 return false;
3019
3020 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3021 return true;
3022}
3023
3024bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3025 // Cannot use 32 bit constants to reference objects in kernel/large code
3026 // model.
3027 if (TM.getCodeModel() == CodeModel::Kernel ||
3028 TM.getCodeModel() == CodeModel::Large)
3029 return false;
3030
3031 // In static codegen with small code model, we can get the address of a label
3032 // into a register with 'movl'
3033 if (N->getOpcode() != X86ISD::Wrapper)
3034 return false;
3035
3036 N = N.getOperand(0);
3037
3038 // At least GNU as does not accept 'movl' for TPOFF relocations.
3039 // FIXME: We could use 'movl' when we know we are targeting MC.
3040 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3041 return false;
3042
3043 Imm = N;
3044 // Small/medium code model can reference non-TargetGlobalAddress objects with
3045 // 32 bit constants.
3046 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3047 return TM.getCodeModel() == CodeModel::Small ||
3048 TM.getCodeModel() == CodeModel::Medium;
3049 }
3050
3051 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3052 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3053 return CR->getUnsignedMax().ult(1ull << 32);
3054
3055 return !TM.isLargeGlobalValue(GV);
3056}
3057
3058bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
3059 SDValue &Index, SDValue &Disp,
3060 SDValue &Segment) {
3061 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3062 SDLoc DL(N);
3063
3064 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3065 return false;
3066
3067 EVT BaseType = Base.getValueType();
3068 unsigned SubReg;
3069 if (BaseType == MVT::i8)
3070 SubReg = X86::sub_8bit;
3071 else if (BaseType == MVT::i16)
3072 SubReg = X86::sub_16bit;
3073 else
3074 SubReg = X86::sub_32bit;
3075
3076 RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
3077 if (RN && RN->getReg() == 0)
3078 Base = CurDAG->getRegister(0, MVT::i64);
3079 else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
3080 BaseType == MVT::i32) &&
3081 !isa<FrameIndexSDNode>(Base)) {
3082 // Base could already be %rip, particularly in the x32 ABI.
3083 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3084 MVT::i64), 0);
3085 Base = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Base);
3086 }
3087
3088 [[maybe_unused]] EVT IndexType = Index.getValueType();
3089 RN = dyn_cast<RegisterSDNode>(Index);
3090 if (RN && RN->getReg() == 0)
3091 Index = CurDAG->getRegister(0, MVT::i64);
3092 else {
3093 assert((IndexType == BaseType) &&
3094 "Expect to be extending 8/16/32-bit registers for use in LEA");
3095 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3096 MVT::i64), 0);
3097 Index = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Index);
3098 }
3099
3100 return true;
3101}
3102
3103/// Calls SelectAddr and determines if the maximal addressing
3104/// mode it matches can be cost effectively emitted as an LEA instruction.
3105bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3106 SDValue &Base, SDValue &Scale,
3107 SDValue &Index, SDValue &Disp,
3108 SDValue &Segment) {
3109 X86ISelAddressMode AM;
3110
3111 // Save the DL and VT before calling matchAddress, it can invalidate N.
3112 SDLoc DL(N);
3113 MVT VT = N.getSimpleValueType();
3114
3115 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3116 // segments.
3117 SDValue Copy = AM.Segment;
3118 SDValue T = CurDAG->getRegister(0, MVT::i32);
3119 AM.Segment = T;
3120 if (matchAddress(N, AM))
3121 return false;
3122 assert (T == AM.Segment);
3123 AM.Segment = Copy;
3124
3125 unsigned Complexity = 0;
3126 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3127 Complexity = 1;
3128 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3129 Complexity = 4;
3130
3131 if (AM.IndexReg.getNode())
3132 Complexity++;
3133
3134 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3135 // a simple shift.
3136 if (AM.Scale > 1)
3137 Complexity++;
3138
3139 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3140 // to a LEA. This is determined with some experimentation but is by no means
3141 // optimal (especially for code size consideration). LEA is nice because of
3142 // its three-address nature. Tweak the cost function again when we can run
3143 // convertToThreeAddress() at register allocation time.
3144 if (AM.hasSymbolicDisplacement()) {
3145 // For X86-64, always use LEA to materialize RIP-relative addresses.
3146 if (Subtarget->is64Bit())
3147 Complexity = 4;
3148 else
3149 Complexity += 2;
3150 }
3151
3152 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3153 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3154 // duplicating flag-producing instructions later in the pipeline.
3155 if (N.getOpcode() == ISD::ADD) {
3156 auto isMathWithFlags = [](SDValue V) {
3157 switch (V.getOpcode()) {
3158 case X86ISD::ADD:
3159 case X86ISD::SUB:
3160 case X86ISD::ADC:
3161 case X86ISD::SBB:
3162 case X86ISD::SMUL:
3163 case X86ISD::UMUL:
3164 /* TODO: These opcodes can be added safely, but we may want to justify
3165 their inclusion for different reasons (better for reg-alloc).
3166 case X86ISD::OR:
3167 case X86ISD::XOR:
3168 case X86ISD::AND:
3169 */
3170 // Value 1 is the flag output of the node - verify it's not dead.
3171 return !SDValue(V.getNode(), 1).use_empty();
3172 default:
3173 return false;
3174 }
3175 };
3176 // TODO: We might want to factor in whether there's a load folding
3177 // opportunity for the math op that disappears with LEA.
3178 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3179 Complexity++;
3180 }
3181
3182 if (AM.Disp)
3183 Complexity++;
3184
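 // At this point Complexity approximates how many address components (base,
 // index, scale/symbol, displacement) the LEA would fold together; with two
 // or fewer components a plain ADD or shift is at least as cheap.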
3185 // If it isn't worth using an LEA, reject it.
3186 if (Complexity <= 2)
3187 return false;
3188
3189 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3190 return true;
3191}
3192
3193/// This is only run on TargetGlobalTLSAddress or TargetExternalSymbol nodes.
3194bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3195 SDValue &Scale, SDValue &Index,
3196 SDValue &Disp, SDValue &Segment) {
3197 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3198 N.getOpcode() == ISD::TargetExternalSymbol);
3199
3200 X86ISelAddressMode AM;
3201 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3202 AM.GV = GA->getGlobal();
3203 AM.Disp += GA->getOffset();
3204 AM.SymbolFlags = GA->getTargetFlags();
3205 } else {
3206 auto *SA = cast<ExternalSymbolSDNode>(N);
3207 AM.ES = SA->getSymbol();
3208 AM.SymbolFlags = SA->getTargetFlags();
3209 }
3210
3211 if (Subtarget->is32Bit()) {
3212 AM.Scale = 1;
3213 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3214 }
3215
3216 MVT VT = N.getSimpleValueType();
3217 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3218 return true;
3219}
3220
3221bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3222 // Keep track of the original value type and whether this value was
3223 // truncated. If we see a truncation from pointer type to VT that truncates
3224 // bits that are known to be zero, we can use a narrow reference.
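 // For example, an absolute symbol whose range is known to be below 2^16 can
 // still be referenced through a 16-bit immediate even though the address was
 // truncated to i16.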
3225 EVT VT = N.getValueType();
3226 bool WasTruncated = false;
3227 if (N.getOpcode() == ISD::TRUNCATE) {
3228 WasTruncated = true;
3229 N = N.getOperand(0);
3230 }
3231
3232 if (N.getOpcode() != X86ISD::Wrapper)
3233 return false;
3234
3235 // We can only use non-GlobalValues as immediates if they were not truncated,
3236 // as we do not have any range information. If we have a GlobalValue and the
3237 // address was not truncated, we can select it as an operand directly.
3238 unsigned Opc = N.getOperand(0)->getOpcode();
3239 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3240 Op = N.getOperand(0);
3241 // We can only select the operand directly if we didn't have to look past a
3242 // truncate.
3243 return !WasTruncated;
3244 }
3245
3246 // Check that the global's range fits into VT.
3247 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3248 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3249 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3250 return false;
3251
3252 // Okay, we can use a narrow reference.
3253 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3254 GA->getOffset(), GA->getTargetFlags());
3255 return true;
3256}
3257
3258bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3259 SDValue &Base, SDValue &Scale,
3260 SDValue &Index, SDValue &Disp,
3261 SDValue &Segment) {
3262 assert(Root && P && "Unknown root/parent nodes");
3263 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3264 !IsProfitableToFold(N, P, Root) ||
3265 !IsLegalToFold(N, P, Root, OptLevel))
3266 return false;
3267
3268 return selectAddr(N.getNode(),
3269 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3270}
3271
3272bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3273 SDValue &Base, SDValue &Scale,
3274 SDValue &Index, SDValue &Disp,
3275 SDValue &Segment) {
3276 assert(Root && P && "Unknown root/parent nodes");
3277 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3278 !IsProfitableToFold(N, P, Root) ||
3279 !IsLegalToFold(N, P, Root, OptLevel))
3280 return false;
3281
3282 return selectAddr(N.getNode(),
3283 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3284}
3285
3286/// Return an SDNode that returns the value of the global base register.
3287/// Output instructions required to initialize the global base register,
3288/// if necessary.
3289SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3290 Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3291 auto &DL = MF->getDataLayout();
3292 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3293}
3294
3295bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3296 if (N->getOpcode() == ISD::TRUNCATE)
3297 N = N->getOperand(0).getNode();
3298 if (N->getOpcode() != X86ISD::Wrapper)
3299 return false;
3300
3301 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3302 if (!GA)
3303 return false;
3304
3305 auto *GV = GA->getGlobal();
3306 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3307 if (CR)
3308 return CR->getSignedMin().sge(-1ull << Width) &&
3309 CR->getSignedMax().slt(1ull << Width);
3310 // In the kernel code model, globals are in the negative 2GB of the address
3311 // space, so globals can be a sign extended 32-bit immediate.
3312 // In other code models, small globals are in the low 2GB of the address
3313 // space, so sign extending them is equivalent to zero extending them.
3314 return TM.getCodeModel() != CodeModel::Large && Width == 32 &&
3315 !TM.isLargeGlobalValue(GV);
3316}
3317
3318X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3319 assert(N->isMachineOpcode() && "Unexpected node");
3320 unsigned Opc = N->getMachineOpcode();
3321 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3322 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3323 if (CondNo < 0)
3324 return X86::COND_INVALID;
3325
3326 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3327}
3328
3329/// Return true if the given X86ISD::CMP node has no users that use a flag
3330/// other than ZF.
3331bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3332 // Examine each user of the node.
3333 for (SDUse &Use : Flags->uses()) {
3334 // Only check things that use the flags.
3335 if (Use.getResNo() != Flags.getResNo())
3336 continue;
3337 SDNode *User = Use.getUser();
3338 // Only examine CopyToReg uses that copy to EFLAGS.
3339 if (User->getOpcode() != ISD::CopyToReg ||
3340 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3341 return false;
3342 // Examine each user of the CopyToReg use.
3343 for (SDUse &FlagUse : User->uses()) {
3344 // Only examine the Flag result.
3345 if (FlagUse.getResNo() != 1)
3346 continue;
3347 // Anything unusual: assume conservatively.
3348 if (!FlagUse.getUser()->isMachineOpcode())
3349 return false;
3350 // Examine the condition code of the user.
3351 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3352
3353 switch (CC) {
3354 // Comparisons which only use the zero flag.
3355 case X86::COND_E: case X86::COND_NE:
3356 continue;
3357 // Anything else: assume conservatively.
3358 default:
3359 return false;
3360 }
3361 }
3362 }
3363 return true;
3364}
3365
3366/// Return true if the given X86ISD::CMP node has no uses which require the SF
3367/// flag to be accurate.
3368bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3369 // Examine each user of the node.
3370 for (SDUse &Use : Flags->uses()) {
3371 // Only check things that use the flags.
3372 if (Use.getResNo() != Flags.getResNo())
3373 continue;
3374 SDNode *User = Use.getUser();
3375 // Only examine CopyToReg uses that copy to EFLAGS.
3376 if (User->getOpcode() != ISD::CopyToReg ||
3377 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3378 return false;
3379 // Examine each user of the CopyToReg use.
3380 for (SDUse &FlagUse : User->uses()) {
3381 // Only examine the Flag result.
3382 if (FlagUse.getResNo() != 1)
3383 continue;
3384 // Anything unusual: assume conservatively.
3385 if (!FlagUse.getUser()->isMachineOpcode())
3386 return false;
3387 // Examine the condition code of the user.
3388 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3389
3390 switch (CC) {
3391 // Comparisons which don't examine the SF flag.
3392 case X86::COND_A: case X86::COND_AE:
3393 case X86::COND_B: case X86::COND_BE:
3394 case X86::COND_E: case X86::COND_NE:
3395 case X86::COND_O: case X86::COND_NO:
3396 case X86::COND_P: case X86::COND_NP:
3397 continue;
3398 // Anything else: assume conservatively.
3399 default:
3400 return false;
3401 }
3402 }
3403 }
3404 return true;
3405}
3406
3407static bool mayUseCarryFlag(X86::CondCode CC) {
3408 switch (CC) {
3409 // Comparisons which don't examine the CF flag.
3410 case X86::COND_O: case X86::COND_NO:
3411 case X86::COND_E: case X86::COND_NE:
3412 case X86::COND_S: case X86::COND_NS:
3413 case X86::COND_P: case X86::COND_NP:
3414 case X86::COND_L: case X86::COND_GE:
3415 case X86::COND_G: case X86::COND_LE:
3416 return false;
3417 // Anything else: assume conservatively.
3418 default:
3419 return true;
3420 }
3421}
3422
3423/// Return true if the given flag-setting node has no uses which require the
3424/// CF flag to be accurate.
3425 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3426 // Examine each user of the node.
3427 for (SDUse &Use : Flags->uses()) {
3428 // Only check things that use the flags.
3429 if (Use.getResNo() != Flags.getResNo())
3430 continue;
3431
3432 SDNode *User = Use.getUser();
3433 unsigned UserOpc = User->getOpcode();
3434
3435 if (UserOpc == ISD::CopyToReg) {
3436 // Only examine CopyToReg uses that copy to EFLAGS.
3437 if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3438 return false;
3439 // Examine each user of the CopyToReg use.
3440 for (SDUse &FlagUse : User->uses()) {
3441 // Only examine the Flag result.
3442 if (FlagUse.getResNo() != 1)
3443 continue;
3444 // Anything unusual: assume conservatively.
3445 if (!FlagUse.getUser()->isMachineOpcode())
3446 return false;
3447 // Examine the condition code of the user.
3448 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3449
3450 if (mayUseCarryFlag(CC))
3451 return false;
3452 }
3453
3454 // This CopyToReg is ok. Move on to the next user.
3455 continue;
3456 }
3457
3458 // This might be an unselected node. So look for the pre-isel opcodes that
3459 // use flags.
3460 unsigned CCOpNo;
3461 switch (UserOpc) {
3462 default:
3463 // Something unusual. Be conservative.
3464 return false;
3465 case X86ISD::SETCC: CCOpNo = 0; break;
3466 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3467 case X86ISD::CMOV: CCOpNo = 2; break;
3468 case X86ISD::BRCOND: CCOpNo = 2; break;
3469 }
3470
3471 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
3472 if (mayUseCarryFlag(CC))
3473 return false;
3474 }
3475 return true;
3476}
3477
3478/// Check whether or not the chain ending in StoreNode is suitable for doing
3479/// the {load; op; store} to modify transformation.
3480static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3481 SDValue StoredVal, SelectionDAG *CurDAG,
3482 unsigned LoadOpNo,
3483 LoadSDNode *&LoadNode,
3484 SDValue &InputChain) {
3485 // Is the stored value result 0 of the operation?
3486 if (StoredVal.getResNo() != 0) return false;
3487
3488 // Are there other uses of the operation other than the store?
3489 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3490
3491 // Is the store non-extending and non-indexed?
3492 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3493 return false;
3494
3495 SDValue Load = StoredVal->getOperand(LoadOpNo);
3496 // Is the stored value a non-extending and non-indexed load?
3497 if (!ISD::isNormalLoad(Load.getNode())) return false;
3498
3499 // Return LoadNode by reference.
3500 LoadNode = cast<LoadSDNode>(Load);
3501
3502 // Is store the only read of the loaded value?
3503 if (!Load.hasOneUse())
3504 return false;
3505
3506 // Is the address of the store the same as the load?
3507 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3508 LoadNode->getOffset() != StoreNode->getOffset())
3509 return false;
3510
3511 bool FoundLoad = false;
3512 SmallVector<SDValue, 4> ChainOps;
3513 SmallVector<const SDNode *, 4> LoopWorklist;
3514 SmallPtrSet<const SDNode *, 16> Visited;
3515 const unsigned int Max = 1024;
3516
3517 // Visualization of Load-Op-Store fusion:
3518 // -------------------------
3519 // Legend:
3520 // *-lines = Chain operand dependencies.
3521 // |-lines = Normal operand dependencies.
3522 // Dependencies flow down and right. n-suffix references multiple nodes.
3523 //
3524 // C Xn C
3525 // * * *
3526 // * * *
3527 // Xn A-LD Yn TF Yn
3528 // * * \ | * |
3529 // * * \ | * |
3530 // * * \ | => A--LD_OP_ST
3531 // * * \| \
3532 // TF OP \
3533 // * | \ Zn
3534 // * | \
3535 // A-ST Zn
3536 //
3537
3538 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3539 // #2: Yn -> LD
3540 // #3: ST -> Zn
3541
3542 // Ensure the transform is safe by checking for the dual
3543 // dependencies to make sure we do not induce a loop.
3544
3545 // As LD is a predecessor to both OP and ST we can do this by checking:
3546 // a). if LD is a predecessor to a member of Xn or Yn.
3547 // b). if a Zn is a predecessor to ST.
3548
3549 // However, (b) can only occur through being a chain predecessor to
3550 // ST, which is the same as Zn being a member or predecessor of Xn,
3551 // which is a subset of LD being a predecessor of Xn. So it's
3552 // subsumed by check (a).
3553
3554 SDValue Chain = StoreNode->getChain();
3555
3556 // Gather X elements in ChainOps.
3557 if (Chain == Load.getValue(1)) {
3558 FoundLoad = true;
3559 ChainOps.push_back(Load.getOperand(0));
3560 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3561 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3562 SDValue Op = Chain.getOperand(i);
3563 if (Op == Load.getValue(1)) {
3564 FoundLoad = true;
3565 // Drop Load, but keep its chain. No cycle check necessary.
3566 ChainOps.push_back(Load.getOperand(0));
3567 continue;
3568 }
3569 LoopWorklist.push_back(Op.getNode());
3570 ChainOps.push_back(Op);
3571 }
3572 }
3573
3574 if (!FoundLoad)
3575 return false;
3576
3577 // Worklist is currently Xn. Add Yn to worklist.
3578 for (SDValue Op : StoredVal->ops())
3579 if (Op.getNode() != LoadNode)
3580 LoopWorklist.push_back(Op.getNode());
3581
3582 // Check (a) if Load is a predecessor to Xn + Yn
3583 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3584 true))
3585 return false;
3586
3587 InputChain =
3588 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3589 return true;
3590}
3591
3592// Change a chain of {load; op; store} of the same value into a simple op
3593// through memory of that value, if the uses of the modified value and its
3594// address are suitable.
3595//
3596// The tablegen memory operand pattern is currently not able to match the
3597// case where the EFLAGS produced by the original operation are used.
3598//
3599// To move this to tablegen, we'll need to improve tablegen to allow flags to
3600// be transferred from a node in the pattern to the result node, probably with
3601// a new keyword. For example, we have this
3602// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3603// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3604// but maybe need something like this
3605// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3606// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3607// (transferrable EFLAGS)]>;
3608//
3609// Until then, we manually fold these and instruction select the operation
3610// here.
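// For example, (store (add (load [M]), 42), [M]) becomes a single
// add-to-memory instruction, with the load and store chains rewired onto the
// new node.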
3611bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3612 auto *StoreNode = cast<StoreSDNode>(Node);
3613 SDValue StoredVal = StoreNode->getOperand(1);
3614 unsigned Opc = StoredVal->getOpcode();
3615
3616 // Before we try to select anything, make sure this is a memory operand size
3617 // and an opcode we can handle. Note that this must match the code below that
3618 // actually lowers the opcodes.
3619 EVT MemVT = StoreNode->getMemoryVT();
3620 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3621 MemVT != MVT::i8)
3622 return false;
3623
3624 bool IsCommutable = false;
3625 bool IsNegate = false;
3626 switch (Opc) {
3627 default:
3628 return false;
3629 case X86ISD::SUB:
3630 IsNegate = isNullConstant(StoredVal.getOperand(0));
3631 break;
3632 case X86ISD::SBB:
3633 break;
3634 case X86ISD::ADD:
3635 case X86ISD::ADC:
3636 case X86ISD::AND:
3637 case X86ISD::OR:
3638 case X86ISD::XOR:
3639 IsCommutable = true;
3640 break;
3641 }
3642
3643 unsigned LoadOpNo = IsNegate ? 1 : 0;
3644 LoadSDNode *LoadNode = nullptr;
3645 SDValue InputChain;
3646 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3647 LoadNode, InputChain)) {
3648 if (!IsCommutable)
3649 return false;
3650
3651 // This operation is commutable, try the other operand.
3652 LoadOpNo = 1;
3653 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3654 LoadNode, InputChain))
3655 return false;
3656 }
3657
3658 SDValue Base, Scale, Index, Disp, Segment;
3659 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3660 Segment))
3661 return false;
3662
3663 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3664 unsigned Opc8) {
3665 switch (MemVT.getSimpleVT().SimpleTy) {
3666 case MVT::i64:
3667 return Opc64;
3668 case MVT::i32:
3669 return Opc32;
3670 case MVT::i16:
3671 return Opc16;
3672 case MVT::i8:
3673 return Opc8;
3674 default:
3675 llvm_unreachable("Invalid size!");
3676 }
3677 };
3678
3679 MachineSDNode *Result;
3680 switch (Opc) {
3681 case X86ISD::SUB:
3682 // Handle negate.
3683 if (IsNegate) {
3684 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3685 X86::NEG8m);
3686 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3687 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3688 MVT::Other, Ops);
3689 break;
3690 }
3691 [[fallthrough]];
3692 case X86ISD::ADD:
3693 // Try to match inc/dec.
3694 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3695 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3696 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3697 // ADD/SUB by 1/-1 can use INC/DEC when the carry flag isn't used.
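 // Concretely: ADD +1 and SUB -1 select INC, while ADD -1 and SUB +1 select
 // DEC.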
3698 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3699 unsigned NewOpc =
3700 ((Opc == X86ISD::ADD) == IsOne)
3701 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3702 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3703 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3704 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3705 MVT::Other, Ops);
3706 break;
3707 }
3708 }
3709 [[fallthrough]];
3710 case X86ISD::ADC:
3711 case X86ISD::SBB:
3712 case X86ISD::AND:
3713 case X86ISD::OR:
3714 case X86ISD::XOR: {
3715 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3716 switch (Opc) {
3717 case X86ISD::ADD:
3718 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3719 X86::ADD8mr);
3720 case X86ISD::ADC:
3721 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3722 X86::ADC8mr);
3723 case X86ISD::SUB:
3724 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3725 X86::SUB8mr);
3726 case X86ISD::SBB:
3727 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3728 X86::SBB8mr);
3729 case X86ISD::AND:
3730 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3731 X86::AND8mr);
3732 case X86ISD::OR:
3733 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3734 case X86ISD::XOR:
3735 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3736 X86::XOR8mr);
3737 default:
3738 llvm_unreachable("Invalid opcode!");
3739 }
3740 };
3741 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3742 switch (Opc) {
3743 case X86ISD::ADD:
3744 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3745 X86::ADD8mi);
3746 case X86ISD::ADC:
3747 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3748 X86::ADC8mi);
3749 case X86ISD::SUB:
3750 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3751 X86::SUB8mi);
3752 case X86ISD::SBB:
3753 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3754 X86::SBB8mi);
3755 case X86ISD::AND:
3756 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3757 X86::AND8mi);
3758 case X86ISD::OR:
3759 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3760 X86::OR8mi);
3761 case X86ISD::XOR:
3762 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3763 X86::XOR8mi);
3764 default:
3765 llvm_unreachable("Invalid opcode!");
3766 }
3767 };
3768
3769 unsigned NewOpc = SelectRegOpcode(Opc);
3770 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3771
3772 // See if the operand is a constant that we can fold into an immediate
3773 // operand.
3774 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3775 int64_t OperandV = OperandC->getSExtValue();
3776
3777 // Check if we can shrink the operand enough to fit in an immediate (or
3778 // fit into a smaller immediate) by negating it and switching the
3779 // operation.
3780 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3781 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3782 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3783 isInt<32>(-OperandV))) &&
3784 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3785 OperandV = -OperandV;
3786 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3787 }
3788
3789 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3790 Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
3791 NewOpc = SelectImmOpcode(Opc);
3792 }
3793 }
3794
3795 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3796 SDValue CopyTo =
3797 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3798 StoredVal.getOperand(2), SDValue());
3799
3800 const SDValue Ops[] = {Base, Scale, Index, Disp,
3801 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3802 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3803 Ops);
3804 } else {
3805 const SDValue Ops[] = {Base, Scale, Index, Disp,
3806 Segment, Operand, InputChain};
3807 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3808 Ops);
3809 }
3810 break;
3811 }
3812 default:
3813 llvm_unreachable("Invalid opcode!");
3814 }
3815
3816 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3817 LoadNode->getMemOperand()};
3818 CurDAG->setNodeMemRefs(Result, MemOps);
3819
3820 // Update Load Chain uses as well.
3821 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3822 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3823 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3824 CurDAG->RemoveDeadNode(Node);
3825 return true;
3826}
3827
3828// See if this is an X & Mask that we can match to BEXTR/BZHI.
3829// Where Mask is one of the following patterns:
3830// a) x & (1 << nbits) - 1
3831// b) x & ~(-1 << nbits)
3832// c) x & (-1 >> (32 - y))
3833// d) x << (32 - y) >> (32 - y)
3834// e) (1 << nbits) - 1
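// Patterns (a)-(d) keep only the low 'nbits' bits of x, and (e) is that
// low-bit mask itself; e.g. with nbits == 5 on i32,
// (1 << 5) - 1 == ~(-1 << 5) == (-1u >> (32 - 5)) == 0x1f, and a logical
// (x << 27) >> 27 likewise clears everything above bit 4. BZHI computes
// exactly this with the bit count in a register.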
3835bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3836 assert(
3837 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3838 Node->getOpcode() == ISD::SRL) &&
3839 "Should be either an and-mask, or right-shift after clearing high bits.");
3840
3841 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3842 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3843 return false;
3844
3845 MVT NVT = Node->getSimpleValueType(0);
3846
3847 // Only supported for 32 and 64 bits.
3848 if (NVT != MVT::i32 && NVT != MVT::i64)
3849 return false;
3850
3851 SDValue NBits;
3852 bool NegateNBits;
3853
3854 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3855 // Else, if we only have BMI1's BEXTR, we require one-use.
3856 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3857 auto checkUses = [AllowExtraUsesByDefault](
3858 SDValue Op, unsigned NUses,
3859 std::optional<bool> AllowExtraUses) {
3860 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3861 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3862 };
3863 auto checkOneUse = [checkUses](SDValue Op,
3864 std::optional<bool> AllowExtraUses =
3865 std::nullopt) {
3866 return checkUses(Op, 1, AllowExtraUses);
3867 };
3868 auto checkTwoUse = [checkUses](SDValue Op,
3869 std::optional<bool> AllowExtraUses =
3870 std::nullopt) {
3871 return checkUses(Op, 2, AllowExtraUses);
3872 };
3873
3874 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3875 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3876 assert(V.getSimpleValueType() == MVT::i32 &&
3877 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3878 "Expected i64 -> i32 truncation");
3879 V = V.getOperand(0);
3880 }
3881 return V;
3882 };
3883
3884 // a) x & ((1 << nbits) + (-1))
3885 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3886 &NegateNBits](SDValue Mask) -> bool {
3887 // Match `add`. Must only have one use!
3888 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3889 return false;
3890 // We should be adding an all-ones constant (i.e. subtracting one).
3891 if (!isAllOnesConstant(Mask->getOperand(1)))
3892 return false;
3893 // Match `1 << nbits`. Might be truncated. Must only have one use!
3894 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3895 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3896 return false;
3897 if (!isOneConstant(M0->getOperand(0)))
3898 return false;
3899 NBits = M0->getOperand(1);
3900 NegateNBits = false;
3901 return true;
3902 };
3903
3904 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3905 V = peekThroughOneUseTruncation(V);
3906 return CurDAG->MaskedValueIsAllOnes(
3907 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3908 NVT.getSizeInBits()));
3909 };
3910
3911 // b) x & ~(-1 << nbits)
3912 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3913 &NBits, &NegateNBits](SDValue Mask) -> bool {
3914 // Match `~()`. Must only have one use!
3915 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3916 return false;
3917 // The -1 only has to be all-ones for the final Node's NVT.
3918 if (!isAllOnes(Mask->getOperand(1)))
3919 return false;
3920 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3921 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3922 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3923 return false;
3924 // The -1 only has to be all-ones for the final Node's NVT.
3925 if (!isAllOnes(M0->getOperand(0)))
3926 return false;
3927 NBits = M0->getOperand(1);
3928 NegateNBits = false;
3929 return true;
3930 };
3931
3932 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3933 // or leave the shift amount as-is, but then we'll have to negate it.
3934 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3935 unsigned Bitwidth) {
3936 NBits = ShiftAmt;
3937 NegateNBits = true;
3938 // Skip over a truncate of the shift amount, if any.
3939 if (NBits.getOpcode() == ISD::TRUNCATE)
3940 NBits = NBits.getOperand(0);
3941 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3942 // If it doesn't match, that's fine, we'll just negate it ourselves.
3943 if (NBits.getOpcode() != ISD::SUB)
3944 return;
3945 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3946 if (!V0 || V0->getZExtValue() != Bitwidth)
3947 return;
3948 NBits = NBits.getOperand(1);
3949 NegateNBits = false;
3950 };
3951
3952 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3953 // or
3954 // c) x & (-1 >> (32 - y))
3955 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3956 canonicalizeShiftAmt](SDValue Mask) -> bool {
3957 // The mask itself may be truncated.
3958 Mask = peekThroughOneUseTruncation(Mask);
3959 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3960 // Match `l>>`. Must only have one use!
3961 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3962 return false;
3963 // We should be shifting truly all-ones constant.
3964 if (!isAllOnesConstant(Mask.getOperand(0)))
3965 return false;
3966 SDValue M1 = Mask.getOperand(1);
3967 // The shift amount should not be used externally.
3968 if (!checkOneUse(M1))
3969 return false;
3970 canonicalizeShiftAmt(M1, Bitwidth);
3971 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3972 // is no extra use of the mask. Clearly, there was one since we are here.
3973 // But at the same time, if we need to negate the shift amount,
3974 // then we don't want the mask to stick around, else it's unprofitable.
3975 return !NegateNBits;
3976 };
3977
3978 SDValue X;
3979
3980 // d) x << z >> z but then we'll have to subtract z from bitwidth
3981 // or
3982 // d) x << (32 - y) >> (32 - y)
3983 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3984 AllowExtraUsesByDefault, &NegateNBits,
3985 &X](SDNode *Node) -> bool {
3986 if (Node->getOpcode() != ISD::SRL)
3987 return false;
3988 SDValue N0 = Node->getOperand(0);
3989 if (N0->getOpcode() != ISD::SHL)
3990 return false;
3991 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3992 SDValue N1 = Node->getOperand(1);
3993 SDValue N01 = N0->getOperand(1);
3994 // Both of the shifts must be by the exact same value.
3995 if (N1 != N01)
3996 return false;
3997 canonicalizeShiftAmt(N1, Bitwidth);
3998 // There should not be any external uses of the inner shift / shift amount.
3999 // Note that while we are generally okay with external uses given BMI2,
4000 // iff we need to negate the shift amount, we are not okay with extra uses.
4001 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
4002 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
4003 return false;
4004 X = N0->getOperand(0);
4005 return true;
4006 };
4007
4008 auto matchLowBitMask = [matchPatternA, matchPatternB,
4009 matchPatternC](SDValue Mask) -> bool {
4010 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4011 };
4012
4013 if (Node->getOpcode() == ISD::AND) {
4014 X = Node->getOperand(0);
4015 SDValue Mask = Node->getOperand(1);
4016
4017 if (matchLowBitMask(Mask)) {
4018 // Great.
4019 } else {
4020 std::swap(X, Mask);
4021 if (!matchLowBitMask(Mask))
4022 return false;
4023 }
4024 } else if (matchLowBitMask(SDValue(Node, 0))) {
4025 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
4026 } else if (!matchPatternD(Node))
4027 return false;
4028
4029 // If we need to negate the shift amount, require BMI2 BZHI support.
4030 // It's just too unprofitable for BMI1 BEXTR.
4031 if (NegateNBits && !Subtarget->hasBMI2())
4032 return false;
4033
4034 SDLoc DL(Node);
4035
4036 if (NBits.getSimpleValueType() != MVT::i8) {
4037 // Truncate the shift amount.
4038 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
4039 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4040 }
4041
4042 // Turn (i32)(x & imm8) into (i32)x & imm32.
4043 ConstantSDNode *Imm = nullptr;
4044 if (NBits->getOpcode() == ISD::AND)
4045 if ((Imm = dyn_cast<ConstantSDNode>(NBits->getOperand(1))))
4046 NBits = NBits->getOperand(0);
4047
4048 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4049 // All the other bits are undefined, we do not care about them.
4050 SDValue ImplDef = SDValue(
4051 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
4052 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
4053
4054 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
4055 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
4056 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
4057 MVT::i32, ImplDef, NBits, SRIdxVal),
4058 0);
4059 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4060
4061 if (Imm) {
4062 NBits =
4063 CurDAG->getNode(ISD::AND, DL, MVT::i32, NBits,
4064 CurDAG->getConstant(Imm->getZExtValue(), DL, MVT::i32));
4065 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4066 }
4067
4068 // We might have matched the number of high bits to be cleared,
4069 // but we want the number of low bits to be kept, so negate it now.
4070 if (NegateNBits) {
4071 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4072 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4073
4074 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4075 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4076 }
4077
4078 if (Subtarget->hasBMI2()) {
4079 // Great, just emit the BZHI.
4080 if (NVT != MVT::i32) {
4081 // But have to place the bit count into the wide-enough register first.
4082 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4083 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4084 }
4085
4086 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4087 ReplaceNode(Node, Extract.getNode());
4088 SelectCode(Extract.getNode());
4089 return true;
4090 }
4091
4092 // Else, if we do *NOT* have BMI2, let's find out if the 'X' is *logically*
4093 // shifted (potentially with a one-use trunc in between), whether the
4094 // truncation was the only use of the shift, and if so look past the
4095 // one-use truncation.
4096 {
4097 SDValue RealX = peekThroughOneUseTruncation(X);
4098 // FIXME: only if the shift is one-use?
4099 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4100 X = RealX;
4101 }
4102
4103 MVT XVT = X.getSimpleValueType();
4104
4105 // Else, emitting BEXTR requires one more step.
4106 // The 'control' of BEXTR has the pattern of:
4107 // [15...8 bit][ 7...0 bit] location
4108 // [ bit count][ shift] name
4109 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
4110
4111 // Shift NBits left by 8 bits, thus producing 'control'.
4112 // This leaves the low 8 bits zero.
4113 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4114 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4115 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4116 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4117
4118 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4119 // FIXME: only if the shift is one-use?
4120 if (X.getOpcode() == ISD::SRL) {
4121 SDValue ShiftAmt = X.getOperand(1);
4122 X = X.getOperand(0);
4123
4124 assert(ShiftAmt.getValueType() == MVT::i8 &&
4125 "Expected shift amount to be i8");
4126
4127 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4128 // We could zext to i16 in some form, but we intentionally don't do that.
4129 SDValue OrigShiftAmt = ShiftAmt;
4130 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4131 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4132
4133 // And now 'or' these low 8 bits of shift amount into the 'control'.
4134 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4135 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4136 }
4137
4138 // But have to place the 'control' into the wide-enough register first.
4139 if (XVT != MVT::i32) {
4140 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4141 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4142 }
4143
4144 // And finally, form the BEXTR itself.
4145 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4146
4147 // The 'X' was originally truncated. Do that now.
4148 if (XVT != NVT) {
4149 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4150 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4151 }
4152
4153 ReplaceNode(Node, Extract.getNode());
4154 SelectCode(Extract.getNode());
4155
4156 return true;
4157}
4158
4159// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
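// For example, (x >> 4) & 0xfff extracts 12 bits starting at bit 4, i.e.
// BEXTR with control 4 | (12 << 8) == 0xc04.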
4160MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4161 MVT NVT = Node->getSimpleValueType(0);
4162 SDLoc dl(Node);
4163
4164 SDValue N0 = Node->getOperand(0);
4165 SDValue N1 = Node->getOperand(1);
4166
4167 // If we have TBM we can use an immediate for the control. If we have BMI
4168 // we should only do this if the BEXTR instruction is implemented well.
4169 // Otherwise moving the control into a register makes this more costly.
4170 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4171 // hoisting the move immediate would make it worthwhile with a less optimal
4172 // BEXTR?
4173 bool PreferBEXTR =
4174 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4175 if (!PreferBEXTR && !Subtarget->hasBMI2())
4176 return nullptr;
4177
4178 // Must have a shift right.
4179 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4180 return nullptr;
4181
4182 // Shift can't have additional users.
4183 if (!N0->hasOneUse())
4184 return nullptr;
4185
4186 // Only supported for 32 and 64 bits.
4187 if (NVT != MVT::i32 && NVT != MVT::i64)
4188 return nullptr;
4189
4190 // Shift amount and RHS of and must be constant.
4191 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4192 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4193 if (!MaskCst || !ShiftCst)
4194 return nullptr;
4195
4196 // And RHS must be a mask.
4197 uint64_t Mask = MaskCst->getZExtValue();
4198 if (!isMask_64(Mask))
4199 return nullptr;
4200
4201 uint64_t Shift = ShiftCst->getZExtValue();
4202 uint64_t MaskSize = llvm::popcount(Mask);
4203
4204 // Don't interfere with something that can be handled by extracting AH.
4205 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4206 if (Shift == 8 && MaskSize == 8)
4207 return nullptr;
4208
4209 // Make sure we are only using bits that were in the original value, not
4210 // shifted in.
4211 if (Shift + MaskSize > NVT.getSizeInBits())
4212 return nullptr;
4213
4214 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4215 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4216 // does not fit into 32 bits. Load folding is not a sufficient reason.
4217 if (!PreferBEXTR && MaskSize <= 32)
4218 return nullptr;
4219
4220 SDValue Control;
4221 unsigned ROpc, MOpc;
4222
4223#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
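// The _EVEX forms are needed to encode the APX extended GPRs (R16-R31), so
// they are used whenever the subtarget has EGPR support.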
4224 if (!PreferBEXTR) {
4225 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4226 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4227 // Let's perform the mask first, and apply shift later. Note that we need to
4228 // widen the mask to account for the fact that we'll apply shift afterwards!
4229 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4230 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4231 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4232 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4233 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4234 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4235 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4236 } else {
4237 // The 'control' of BEXTR has the pattern of:
4238 // [15...8 bit][ 7...0 bit] location
4239 // [ bit count][ shift] name
4240 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
4241 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4242 if (Subtarget->hasTBM()) {
4243 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4244 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4245 } else {
4246 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4247 // BMI requires the immediate to be placed in a register.
4248 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4249 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4250 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4251 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4252 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4253 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4254 }
4255 }
4256
4257 MachineSDNode *NewNode;
4258 SDValue Input = N0->getOperand(0);
4259 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4260 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4261 SDValue Ops[] = {
4262 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4263 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4264 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4265 // Update the chain.
4266 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4267 // Record the mem-refs
4268 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4269 } else {
4270 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4271 }
4272
4273 if (!PreferBEXTR) {
4274 // We still need to apply the shift.
4275 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4276 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4277 : GET_ND_IF_ENABLED(X86::SHR32ri);
4278 NewNode =
4279 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4280 }
4281
4282 return NewNode;
4283}
4284
4285// Emit a PCMPISTR(I/M) instruction.
4286MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4287 bool MayFoldLoad, const SDLoc &dl,
4288 MVT VT, SDNode *Node) {
4289 SDValue N0 = Node->getOperand(0);
4290 SDValue N1 = Node->getOperand(1);
4291 SDValue Imm = Node->getOperand(2);
4292 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4293 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4294
4295 // Try to fold a load. No need to check alignment.
4296 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4297 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4298 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4299 N1.getOperand(0) };
4300 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4301 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4302 // Update the chain.
4303 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4304 // Record the mem-refs
4305 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4306 return CNode;
4307 }
4308
4309 SDValue Ops[] = { N0, N1, Imm };
4310 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4311 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4312 return CNode;
4313}
4314
4315// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4316// to emit a second instruction after this one. This is needed since we have two
4317// copyToReg nodes glued before this and we need to continue that glue through.
4318MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4319 bool MayFoldLoad, const SDLoc &dl,
4320 MVT VT, SDNode *Node,
4321 SDValue &InGlue) {
4322 SDValue N0 = Node->getOperand(0);
4323 SDValue N2 = Node->getOperand(2);
4324 SDValue Imm = Node->getOperand(4);
4325 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4326 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4327
4328 // Try to fold a load. No need to check alignment.
4329 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4330 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4331 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4332 N2.getOperand(0), InGlue };
4333 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4334 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4335 InGlue = SDValue(CNode, 3);
4336 // Update the chain.
4337 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4338 // Record the mem-refs
4339 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4340 return CNode;
4341 }
4342
4343 SDValue Ops[] = { N0, N2, Imm, InGlue };
4344 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4345 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4346 InGlue = SDValue(CNode, 2);
4347 return CNode;
4348}
4349
4350bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4351 EVT VT = N->getValueType(0);
4352
4353 // Only handle scalar shifts.
4354 if (VT.isVector())
4355 return false;
4356
4357 // Narrower shifts only mask to 5 bits in hardware.
4358 unsigned Size = VT == MVT::i64 ? 64 : 32;
4359
4360 SDValue OrigShiftAmt = N->getOperand(1);
4361 SDValue ShiftAmt = OrigShiftAmt;
4362 SDLoc DL(N);
4363
4364 // Skip over a truncate of the shift amount.
4365 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4366 ShiftAmt = ShiftAmt->getOperand(0);
4367
4368 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4369 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4370
4371 SDValue NewShiftAmt;
4372 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4373 ShiftAmt->getOpcode() == ISD::XOR) {
4374 SDValue Add0 = ShiftAmt->getOperand(0);
4375 SDValue Add1 = ShiftAmt->getOperand(1);
4376 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4377 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4378 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4379 // to avoid the ADD/SUB/XOR.
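 // For example, a 32-bit shift by (x + 32) is the same as a shift by x, since
 // hardware only uses the low 5 bits of the count (low 6 bits for 64-bit
 // shifts).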
4380 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4381 NewShiftAmt = Add0;
4382
4383 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4384 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4385 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4386 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X,
4387 // we can replace it with a NOT. In the XOR case it may save some code
4388 // size; in the SUB case it may also save a move.
4389 assert(Add0C == nullptr || Add1C == nullptr);
4390
4391 // We can only do N-X, not X-N
4392 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4393 return false;
4394
4395 EVT OpVT = ShiftAmt.getValueType();
4396
4397 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4398 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4399 Add0C == nullptr ? Add0 : Add1, AllOnes);
4400 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4401 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4402 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4403 // -X to generate a NEG instead of a SUB of a constant.
4404 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4405 Add0C->getZExtValue() != 0) {
4406 EVT SubVT = ShiftAmt.getValueType();
4407 SDValue X;
4408 if (Add0C->getZExtValue() % Size == 0)
4409 X = Add1;
4410 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4411 Add0C->getZExtValue() % 32 == 0) {
4412 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4413 // This is mainly beneficial if we already compute (x+n*32).
4414 if (Add1.getOpcode() == ISD::TRUNCATE) {
4415 Add1 = Add1.getOperand(0);
4416 SubVT = Add1.getValueType();
4417 }
4418 if (Add0.getValueType() != SubVT) {
4419 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4420 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4421 }
4422
4423 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4424 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4425 } else
4426 return false;
4427 // Insert a negate op.
4428 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4429 // that uses it that's not a shift.
4430 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4431 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4432 NewShiftAmt = Neg;
4433
4434 // Insert these operands into a valid topological order so they can
4435 // get selected independently.
4436 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4437 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4438 } else
4439 return false;
4440 } else
4441 return false;
4442
4443 if (NewShiftAmt.getValueType() != MVT::i8) {
4444 // Need to truncate the shift amount.
4445 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4446 // Add to a correct topological ordering.
4447 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4448 }
4449
4450 // Insert a new mask to keep the shift amount legal. This should be removed
4451 // by isel patterns.
4452 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4453 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4454 // Place in a correct topological ordering.
4455 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4456
4457 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4458 NewShiftAmt);
4459 if (UpdatedNode != N) {
4460 // If we found an existing node, we should replace ourselves with that node
4461 // and wait for it to be selected after its other users.
4462 ReplaceNode(N, UpdatedNode);
4463 return true;
4464 }
4465
4466 // If the original shift amount is now dead, delete it so that we don't run
4467 // it through isel.
4468 if (OrigShiftAmt.getNode()->use_empty())
4469 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4470
4471 // Now that we've optimized the shift amount, defer to normal isel to get
4472 // load folding and legacy vs BMI2 selection without repeating it here.
4473 SelectCode(N);
4474 return true;
4475}
4476
4477bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4478 MVT NVT = N->getSimpleValueType(0);
4479 unsigned Opcode = N->getOpcode();
4480 SDLoc dl(N);
4481
4482 // For operations of the form (x << C1) op C2, check if we can use a smaller
4483 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
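 // For example, (x << 8) & 0xff00 becomes (x & 0xff) << 8, letting the mask
 // use a byte-sized immediate (or become a MOVZX) instead of a 32-bit
 // immediate.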
4484 SDValue Shift = N->getOperand(0);
4485 SDValue N1 = N->getOperand(1);
4486
4487 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4488 if (!Cst)
4489 return false;
4490
4491 int64_t Val = Cst->getSExtValue();
4492
4493 // If we have an any_extend feeding the AND, look through it to see if there
4494 // is a shift behind it. But only if the AND doesn't use the extended bits.
4495 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4496 bool FoundAnyExtend = false;
4497 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4498 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4499 isUInt<32>(Val)) {
4500 FoundAnyExtend = true;
4501 Shift = Shift.getOperand(0);
4502 }
4503
4504 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4505 return false;
4506
4507 // i8 is unshrinkable, i16 should be promoted to i32.
4508 if (NVT != MVT::i32 && NVT != MVT::i64)
4509 return false;
4510
4511 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4512 if (!ShlCst)
4513 return false;
4514
4515 uint64_t ShAmt = ShlCst->getZExtValue();
4516
4517 // Make sure that we don't change the operation by removing bits.
4518 // This only matters for OR and XOR, AND is unaffected.
4519 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4520 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4521 return false;
4522
4523 // Check the minimum bitwidth for the new constant.
4524 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4525 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4526 if (Opcode == ISD::AND) {
4527 // AND32ri is the same as AND64ri32 with zext imm.
4528 // Try this before sign extended immediates below.
4529 ShiftedVal = (uint64_t)Val >> ShAmt;
4530 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4531 return true;
4532 // Also swap order when the AND can become MOVZX.
4533 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4534 return true;
4535 }
4536 ShiftedVal = Val >> ShAmt;
4537 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4538 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4539 return true;
4540 if (Opcode != ISD::AND) {
4541 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4542 ShiftedVal = (uint64_t)Val >> ShAmt;
4543 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4544 return true;
4545 }
4546 return false;
4547 };
4548
4549 int64_t ShiftedVal;
4550 if (!CanShrinkImmediate(ShiftedVal))
4551 return false;
4552
4553 // Ok, we can reorder to get a smaller immediate.
4554
4555 // But, it's possible the original immediate allowed an AND to become MOVZX.
4556 // Do this check late to defer the MaskedValueIsZero call for as long as
4557 // possible.
4558 if (Opcode == ISD::AND) {
4559 // Find the smallest zext this could possibly be.
4560 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4561 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4562
4563 // Figure out which bits need to be zero to achieve that mask.
4564 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4565 ZExtWidth);
4566 NeededMask &= ~Cst->getAPIntValue();
4567
4568 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4569 return false;
4570 }
4571
4572 SDValue X = Shift.getOperand(0);
4573 if (FoundAnyExtend) {
4574 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4575 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4576 X = NewX;
4577 }
4578
4579 SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
4580 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4581 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4582 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4583 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4584 Shift.getOperand(1));
4585 ReplaceNode(N, NewSHL.getNode());
4586 SelectCode(NewSHL.getNode());
4587 return true;
4588}
4589
4590bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4591 SDNode *ParentB, SDNode *ParentC,
4592 SDValue A, SDValue B, SDValue C,
4593 uint8_t Imm) {
4594 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4595 C.isOperandOf(ParentC) && "Incorrect parent node");
4596
4597 auto tryFoldLoadOrBCast =
4598 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4599 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4600 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4601 return true;
4602
4603 // Not a load, check for broadcast which may be behind a bitcast.
4604 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4605 P = L.getNode();
4606 L = L.getOperand(0);
4607 }
4608
4609 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4610 return false;
4611
4612 // Only 32 and 64 bit broadcasts are supported.
4613 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4614 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4615 if (Size != 32 && Size != 64)
4616 return false;
4617
4618 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4619 };
4620
4621 bool FoldedLoad = false;
4622 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4623 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4624 FoldedLoad = true;
4625 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4626 Tmp4)) {
4627 FoldedLoad = true;
4628 std::swap(A, C);
4629 // Swap bits 1/4 and 3/6.
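 // The immediate is an 8-entry truth table indexed by (A<<2)|(B<<1)|C, so
 // exchanging A and C maps entry abc -> cba: bits 1 (001) and 4 (100) swap,
 // as do bits 3 (011) and 6 (110), while bits 0, 2, 5 and 7 are fixed points
 // kept by the 0xa5 mask.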
4630 uint8_t OldImm = Imm;
4631 Imm = OldImm & 0xa5;
4632 if (OldImm & 0x02) Imm |= 0x10;
4633 if (OldImm & 0x10) Imm |= 0x02;
4634 if (OldImm & 0x08) Imm |= 0x40;
4635 if (OldImm & 0x40) Imm |= 0x08;
4636 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4637 Tmp4)) {
4638 FoldedLoad = true;
4639 std::swap(B, C);
4640 // Swap bits 1/2 and 5/6.
4641 uint8_t OldImm = Imm;
4642 Imm = OldImm & 0x99;
4643 if (OldImm & 0x02) Imm |= 0x04;
4644 if (OldImm & 0x04) Imm |= 0x02;
4645 if (OldImm & 0x20) Imm |= 0x40;
4646 if (OldImm & 0x40) Imm |= 0x20;
4647 }
4648
4649 SDLoc DL(Root);
4650
4651 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4652
4653 MVT NVT = Root->getSimpleValueType(0);
4654
4655 MachineSDNode *MNode;
4656 if (FoldedLoad) {
4657 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4658
4659 unsigned Opc;
4660 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4661 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4662 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4663 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4664
4665 bool UseD = EltSize == 32;
4666 if (NVT.is128BitVector())
4667 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4668 else if (NVT.is256BitVector())
4669 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4670 else if (NVT.is512BitVector())
4671 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4672 else
4673 llvm_unreachable("Unexpected vector size!");
4674 } else {
4675 bool UseD = NVT.getVectorElementType() == MVT::i32;
4676 if (NVT.is128BitVector())
4677 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4678 else if (NVT.is256BitVector())
4679 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4680 else if (NVT.is512BitVector())
4681 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4682 else
4683 llvm_unreachable("Unexpected vector size!");
4684 }
4685
4686 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4687 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4688
4689 // Update the chain.
4690 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4691 // Record the mem-refs
4692 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4693 } else {
4694 bool UseD = NVT.getVectorElementType() == MVT::i32;
4695 unsigned Opc;
4696 if (NVT.is128BitVector())
4697 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4698 else if (NVT.is256BitVector())
4699 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4700 else if (NVT.is512BitVector())
4701 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4702 else
4703 llvm_unreachable("Unexpected vector size!");
4704
4705 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4706 }
4707
4708 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4709 CurDAG->RemoveDeadNode(Root);
4710 return true;
4711}
4712
4713// Try to match two logic ops to a VPTERNLOG.
4714// FIXME: Handle more complex patterns that use an operand more than once?
4715bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4716 MVT NVT = N->getSimpleValueType(0);
4717
4718 // Make sure we support VPTERNLOG.
4719 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4720 NVT.getVectorElementType() == MVT::i1)
4721 return false;
4722
4723 // We need VLX for 128/256-bit.
4724 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4725 return false;
4726
4727 auto getFoldableLogicOp = [](SDValue Op) {
4728 // Peek through single use bitcast.
4729 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4730 Op = Op.getOperand(0);
4731
4732 if (!Op.hasOneUse())
4733 return SDValue();
4734
4735 unsigned Opc = Op.getOpcode();
4736 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4737 Opc == X86ISD::ANDNP)
4738 return Op;
4739
4740 return SDValue();
4741 };
4742
4743 SDValue N0, N1, A, FoldableOp;
4744
4745 // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
4746 auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
4747 if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
4748 ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
4749 SDValue InnerOp = getFoldableLogicOp(Op->getOperand(0));
4750
4751 if (!InnerOp)
4752 return SDValue();
4753
4754 N0 = InnerOp.getOperand(0);
4755 N1 = InnerOp.getOperand(1);
4756 if ((FoldableOp = getFoldableLogicOp(N1))) {
4757 A = N0;
4758 return InnerOp;
4759 }
4760 if ((FoldableOp = getFoldableLogicOp(N0))) {
4761 A = N1;
4762 return InnerOp;
4763 }
4764 }
4765 return SDValue();
4766 };
4767
4768 bool PeeledOuterNot = false;
4769 SDNode *OriN = N;
4770 if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
4771 PeeledOuterNot = true;
4772 N = InnerOp.getNode();
4773 } else {
4774 N0 = N->getOperand(0);
4775 N1 = N->getOperand(1);
4776
4777 if ((FoldableOp = getFoldableLogicOp(N1)))
4778 A = N0;
4779 else if ((FoldableOp = getFoldableLogicOp(N0)))
4780 A = N1;
4781 else
4782 return false;
4783 }
4784
4785 SDValue B = FoldableOp.getOperand(0);
4786 SDValue C = FoldableOp.getOperand(1);
4787 SDNode *ParentA = N;
4788 SDNode *ParentB = FoldableOp.getNode();
4789 SDNode *ParentC = FoldableOp.getNode();
4790
4791 // We can build the appropriate control immediate by performing the logic
4792 // operation we're matching using these constants for A, B, and C.
4793 uint8_t TernlogMagicA = 0xf0;
4794 uint8_t TernlogMagicB = 0xcc;
4795 uint8_t TernlogMagicC = 0xaa;
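 // Each magic value is the truth table of that operand taken by itself: bit i
 // of the immediate is the result for the input combination where A, B and C
 // provide bits 2, 1 and 0 of i. Applying the matched logic ops to these
 // tables below produces the final VPTERNLOG immediate, e.g. AND(B, C) gives
 // 0xcc & 0xaa = 0x88.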
4796
4797 // Some of the inputs may be inverted, peek through them and invert the
4798 // magic values accordingly.
4799 // TODO: There may be a bitcast before the xor that we should peek through.
4800 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4801 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4802 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4803 Magic = ~Magic;
4804 Parent = Op.getNode();
4805 Op = Op.getOperand(0);
4806 }
4807 };
4808
4809 PeekThroughNot(A, ParentA, TernlogMagicA);
4810 PeekThroughNot(B, ParentB, TernlogMagicB);
4811 PeekThroughNot(C, ParentC, TernlogMagicC);
4812
4813 uint8_t Imm;
4814 switch (FoldableOp.getOpcode()) {
4815 default: llvm_unreachable("Unexpected opcode!");
4816 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4817 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4818 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4819 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4820 }
4821
4822 switch (N->getOpcode()) {
4823 default: llvm_unreachable("Unexpected opcode!");
4824 case X86ISD::ANDNP:
4825 if (A == N0)
4826 Imm &= ~TernlogMagicA;
4827 else
4828 Imm = ~(Imm) & TernlogMagicA;
4829 break;
4830 case ISD::AND: Imm &= TernlogMagicA; break;
4831 case ISD::OR: Imm |= TernlogMagicA; break;
4832 case ISD::XOR: Imm ^= TernlogMagicA; break;
4833 }
4834
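 // A NOT wrapped around the entire expression complements every entry of the
 // truth table, i.e. the whole immediate.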
4835 if (PeeledOuterNot)
4836 Imm = ~Imm;
4837
4838 return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
4839}
4840
4841/// If the high bits of an 'and' operand are known zero, try setting the
4842/// high bits of an 'and' constant operand to produce a smaller encoding by
4843/// creating a small, sign-extended negative immediate rather than a large
4844/// positive one. This reverses a transform in SimplifyDemandedBits that
4845/// shrinks mask constants by clearing bits. There is also a possibility that
4846/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4847/// case, just replace the 'and'. Return 'true' if the node is replaced.
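/// For example, (and X, 0x7fffffffffffffff) with the sign bit of X known to be
/// zero becomes an all-ones mask, so the 'and' is removed entirely, while a
/// mask such as 0x7fffffffffffff80 can be replaced by the sign-extended 8-bit
/// immediate -128 instead of requiring a movabs.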
4848bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4849 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4850 // have immediate operands.
4851 MVT VT = And->getSimpleValueType(0);
4852 if (VT != MVT::i32 && VT != MVT::i64)
4853 return false;
4854
4855 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4856 if (!And1C)
4857 return false;
4858
4859 // Bail out if the mask constant is already negative. It can't shrink any further.
4860 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4861 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4862 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4863 // are negative too.
4864 APInt MaskVal = And1C->getAPIntValue();
4865 unsigned MaskLZ = MaskVal.countl_zero();
4866 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4867 return false;
4868
4869 // Don't extend into the upper 32 bits of a 64 bit mask.
4870 if (VT == MVT::i64 && MaskLZ >= 32) {
4871 MaskLZ -= 32;
4872 MaskVal = MaskVal.trunc(32);
4873 }
4874
4875 SDValue And0 = And->getOperand(0);
4876 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4877 APInt NegMaskVal = MaskVal | HighZeros;
4878
4879 // If a negative constant would not allow a smaller encoding, there's no need
4880 // to continue. Only change the constant when we know it's a win.
4881 unsigned MinWidth = NegMaskVal.getSignificantBits();
4882 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4883 return false;
4884
4885 // Extend masks if we truncated above.
4886 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4887 NegMaskVal = NegMaskVal.zext(64);
4888 HighZeros = HighZeros.zext(64);
4889 }
4890
4891 // The variable operand must be all zeros in the top bits to allow using the
4892 // new, negative constant as the mask.
4893 // TODO: Handle constant folding?
4894 KnownBits Known0 = CurDAG->computeKnownBits(And0);
4895 if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
4896 return false;
4897
4898 // Check if the mask is -1. In that case, this is an unnecessary instruction
4899 // that escaped earlier analysis.
4900 if (NegMaskVal.isAllOnes()) {
4901 ReplaceNode(And, And0.getNode());
4902 return true;
4903 }
4904
4905 // A negative mask allows a smaller encoding. Create a new 'and' node.
4906 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4907 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4908 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4909 ReplaceNode(And, NewAnd.getNode());
4910 SelectCode(NewAnd.getNode());
4911 return true;
4912}
4913
4914static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4915 bool FoldedBCast, bool Masked) {
4916#define VPTESTM_CASE(VT, SUFFIX) \
4917case MVT::VT: \
4918 if (Masked) \
4919 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4920 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4921
4922
4923#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4924default: llvm_unreachable("Unexpected VT!"); \
4925VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4926VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4927VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4928VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4929VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4930VPTESTM_CASE(v8i64, QZ##SUFFIX)
4931
4932#define VPTESTM_FULL_CASES(SUFFIX) \
4933VPTESTM_BROADCAST_CASES(SUFFIX) \
4934VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4935VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4936VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4937VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4938VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4939VPTESTM_CASE(v32i16, WZ##SUFFIX)
4940
4941 if (FoldedBCast) {
4942 switch (TestVT.SimpleTy) {
4943 VPTESTM_BROADCAST_CASES(rmb)
4944 }
4945 }
4946
4947 if (FoldedLoad) {
4948 switch (TestVT.SimpleTy) {
4949 VPTESTM_FULL_CASES(rm)
4950 }
4951 }
4952
4953 switch (TestVT.SimpleTy) {
4954 VPTESTM_FULL_CASES(rr)
4955 }
4956
4957#undef VPTESTM_FULL_CASES
4958#undef VPTESTM_BROADCAST_CASES
4959#undef VPTESTM_CASE
4960}
4961
4962// Try to create VPTESTM instruction. If InMask is not null, it will be used
4963// to form a masked operation.
4964bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4965 SDValue InMask) {
4966 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4967 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4968 "Unexpected VT!");
4969
4970 // Look for equal and not equal compares.
4971 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4972 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4973 return false;
4974
4975 SDValue SetccOp0 = Setcc.getOperand(0);
4976 SDValue SetccOp1 = Setcc.getOperand(1);
4977
4978 // Canonicalize the all zero vector to the RHS.
4979 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4980 std::swap(SetccOp0, SetccOp1);
4981
4982 // See if we're comparing against zero.
4983 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4984 return false;
4985
4986 SDValue N0 = SetccOp0;
4987
4988 MVT CmpVT = N0.getSimpleValueType();
4989 MVT CmpSVT = CmpVT.getVectorElementType();
4990
4991 // Start with both operands the same. We'll try to refine this.
4992 SDValue Src0 = N0;
4993 SDValue Src1 = N0;
4994
4995 {
4996 // Look through single use bitcasts.
4997 SDValue N0Temp = N0;
4998 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4999 N0Temp = N0.getOperand(0);
5000
5001 // Look for single use AND.
5002 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
5003 Src0 = N0Temp.getOperand(0);
5004 Src1 = N0Temp.getOperand(1);
5005 }
5006 }
5007
5008 // Without VLX we need to widen the operation.
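 // Only 512-bit VPTESTM/VPTESTNM forms exist without VLX, so 128/256-bit
 // inputs are inserted into an IMPLICIT_DEF ZMM register below and the wide
 // result mask is narrowed back afterwards with COPY_TO_REGCLASS.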
5009 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
5010
5011 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
5012 SDValue &Base, SDValue &Scale, SDValue &Index,
5013 SDValue &Disp, SDValue &Segment) {
5014 // If we need to widen, we can't fold the load.
5015 if (!Widen)
5016 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
5017 return true;
5018
5019 // If we didn't fold a load, try to match a broadcast. Widening is not a
5020 // limitation here, but only 32- and 64-bit element types are supported.
5021 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
5022 return false;
5023
5024 // Look through single use bitcasts.
5025 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
5026 P = L.getNode();
5027 L = L.getOperand(0);
5028 }
5029
5030 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
5031 return false;
5032
5033 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
5034 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
5035 return false;
5036
5037 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
5038 };
5039
5040 // We can only fold loads if the sources are unique.
5041 bool CanFoldLoads = Src0 != Src1;
5042
5043 bool FoldedLoad = false;
5044 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5045 if (CanFoldLoads) {
5046 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
5047 Tmp3, Tmp4);
5048 if (!FoldedLoad) {
5049 // And is commutative.
5050 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
5051 Tmp2, Tmp3, Tmp4);
5052 if (FoldedLoad)
5053 std::swap(Src0, Src1);
5054 }
5055 }
5056
5057 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
5058
5059 bool IsMasked = InMask.getNode() != nullptr;
5060
5061 SDLoc dl(Root);
5062
5063 MVT ResVT = Setcc.getSimpleValueType();
5064 MVT MaskVT = ResVT;
5065 if (Widen) {
5066 // Widen the inputs using insert_subreg or copy_to_regclass.
5067 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5068 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5069 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5070 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
5071 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
5072 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
5073 CmpVT), 0);
5074 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
5075
5076 if (!FoldedBCast)
5077 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
5078
5079 if (IsMasked) {
5080 // Widen the mask.
5081 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
5082 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5083 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5084 dl, MaskVT, InMask, RC), 0);
5085 }
5086 }
5087
5088 bool IsTestN = CC == ISD::SETEQ;
5089 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5090 IsMasked);
5091
5092 MachineSDNode *CNode;
5093 if (FoldedLoad) {
5094 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
5095
5096 if (IsMasked) {
5097 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5098 Src1.getOperand(0) };
5099 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5100 } else {
5101 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5102 Src1.getOperand(0) };
5103 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5104 }
5105
5106 // Update the chain.
5107 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5108 // Record the mem-refs
5109 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5110 } else {
5111 if (IsMasked)
5112 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5113 else
5114 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5115 }
5116
5117 // If we widened, we need to shrink the mask VT.
5118 if (Widen) {
5119 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5120 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5121 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5122 dl, ResVT, SDValue(CNode, 0), RC);
5123 }
5124
5125 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5126 CurDAG->RemoveDeadNode(Root);
5127 return true;
5128}
5129
5130// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5131// into vpternlog.
5132bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5133 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5134
5135 MVT NVT = N->getSimpleValueType(0);
5136
5137 // Make sure we support VPTERNLOG.
5138 if (!NVT.isVector() || !Subtarget->hasAVX512())
5139 return false;
5140
5141 // We need VLX for 128/256-bit.
5142 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5143 return false;
5144
5145 SDValue N0 = N->getOperand(0);
5146 SDValue N1 = N->getOperand(1);
5147
5148 // Canonicalize AND to LHS.
5149 if (N1.getOpcode() == ISD::AND)
5150 std::swap(N0, N1);
5151
5152 if (N0.getOpcode() != ISD::AND ||
5153 N1.getOpcode() != X86ISD::ANDNP ||
5154 !N0.hasOneUse() || !N1.hasOneUse())
5155 return false;
5156
5157 // ANDN is not commutative; use it to pin down A and C.
5158 SDValue A = N1.getOperand(0);
5159 SDValue C = N1.getOperand(1);
5160
5161 // AND is commutable, if one operand matches A, the other operand is B.
5162 // Otherwise this isn't a match.
5163 SDValue B;
5164 if (N0.getOperand(0) == A)
5165 B = N0.getOperand(1);
5166 else if (N0.getOperand(1) == A)
5167 B = N0.getOperand(0);
5168 else
5169 return false;
5170
5171 SDLoc dl(N);
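 // 0xCA is the truth table of the bit-select (A & B) | (~A & C):
 // (0xf0 & 0xcc) | (~0xf0 & 0xaa) = 0xc0 | 0x0a.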
5172 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5173 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5174 ReplaceNode(N, Ternlog.getNode());
5175
5176 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5177 Ternlog.getNode(), A, B, C, 0xCA);
5178}
5179
5180void X86DAGToDAGISel::Select(SDNode *Node) {
5181 MVT NVT = Node->getSimpleValueType(0);
5182 unsigned Opcode = Node->getOpcode();
5183 SDLoc dl(Node);
5184
5185 if (Node->isMachineOpcode()) {
5186 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5187 Node->setNodeId(-1);
5188 return; // Already selected.
5189 }
5190
5191 switch (Opcode) {
5192 default: break;
5193 case ISD::INTRINSIC_W_CHAIN: {
5194 unsigned IntNo = Node->getConstantOperandVal(1);
5195 switch (IntNo) {
5196 default: break;
5197 case Intrinsic::x86_encodekey128:
5198 case Intrinsic::x86_encodekey256: {
5199 if (!Subtarget->hasKL())
5200 break;
5201
5202 unsigned Opcode;
5203 switch (IntNo) {
5204 default: llvm_unreachable("Impossible intrinsic");
5205 case Intrinsic::x86_encodekey128:
5206 Opcode = X86::ENCODEKEY128;
5207 break;
5208 case Intrinsic::x86_encodekey256:
5209 Opcode = X86::ENCODEKEY256;
5210 break;
5211 }
5212
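 // ENCODEKEY128/256 take one of their inputs implicitly in XMM0 (and XMM1 for
 // the 256-bit form), so copy the operands into those registers and glue the
 // copies to the instruction.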
5213 SDValue Chain = Node->getOperand(0);
5214 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5215 SDValue());
5216 if (Opcode == X86::ENCODEKEY256)
5217 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5218 Chain.getValue(1));
5219
5220 MachineSDNode *Res = CurDAG->getMachineNode(
5221 Opcode, dl, Node->getVTList(),
5222 {Node->getOperand(2), Chain, Chain.getValue(1)});
5223 ReplaceNode(Node, Res);
5224 return;
5225 }
5226 case Intrinsic::x86_tileloaddrs64_internal:
5227 case Intrinsic::x86_tileloaddrst164_internal:
5228 if (!Subtarget->hasAMXMOVRS())
5229 break;
5230 [[fallthrough]];
5231 case Intrinsic::x86_tileloadd64_internal:
5232 case Intrinsic::x86_tileloaddt164_internal: {
5233 if (!Subtarget->hasAMXTILE())
5234 break;
5235 auto *MFI =
5236 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5237 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5238 unsigned Opc;
5239 switch (IntNo) {
5240 default:
5241 llvm_unreachable("Unexpected intrinsic!");
5242 case Intrinsic::x86_tileloaddrs64_internal:
5243 Opc = X86::PTILELOADDRSV;
5244 break;
5245 case Intrinsic::x86_tileloaddrst164_internal:
5246 Opc = X86::PTILELOADDRST1V;
5247 break;
5248 case Intrinsic::x86_tileloadd64_internal:
5249 Opc = X86::PTILELOADDV;
5250 break;
5251 case Intrinsic::x86_tileloaddt164_internal:
5252 Opc = X86::PTILELOADDT1V;
5253 break;
5254 }
5255 // _tile_loadd_internal(row, col, buf, STRIDE)
5256 SDValue Base = Node->getOperand(4);
5257 SDValue Scale = getI8Imm(1, dl);
5258 SDValue Index = Node->getOperand(5);
5259 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5260 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5261 SDValue Chain = Node->getOperand(0);
5262 MachineSDNode *CNode;
5263 SDValue Ops[] = {Node->getOperand(2),
5264 Node->getOperand(3),
5265 Base,
5266 Scale,
5267 Index,
5268 Disp,
5269 Segment,
5270 Chain};
5271 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5272 ReplaceNode(Node, CNode);
5273 return;
5274 }
5275 }
5276 break;
5277 }
5278 case ISD::INTRINSIC_VOID: {
5279 unsigned IntNo = Node->getConstantOperandVal(1);
5280 switch (IntNo) {
5281 default: break;
5282 case Intrinsic::x86_sse3_monitor:
5283 case Intrinsic::x86_monitorx:
5284 case Intrinsic::x86_clzero: {
5285 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5286
5287 unsigned Opc = 0;
5288 switch (IntNo) {
5289 default: llvm_unreachable("Unexpected intrinsic!");
5290 case Intrinsic::x86_sse3_monitor:
5291 if (!Subtarget->hasSSE3())
5292 break;
5293 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5294 break;
5295 case Intrinsic::x86_monitorx:
5296 if (!Subtarget->hasMWAITX())
5297 break;
5298 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5299 break;
5300 case Intrinsic::x86_clzero:
5301 if (!Subtarget->hasCLZERO())
5302 break;
5303 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5304 break;
5305 }
5306
5307 if (Opc) {
5308 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5309 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5310 Node->getOperand(2), SDValue());
5311 SDValue InGlue = Chain.getValue(1);
5312
5313 if (IntNo == Intrinsic::x86_sse3_monitor ||
5314 IntNo == Intrinsic::x86_monitorx) {
5315 // Copy the other two operands to ECX and EDX.
5316 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5317 InGlue);
5318 InGlue = Chain.getValue(1);
5319 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5320 InGlue);
5321 InGlue = Chain.getValue(1);
5322 }
5323
5324 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5325 { Chain, InGlue});
5326 ReplaceNode(Node, CNode);
5327 return;
5328 }
5329
5330 break;
5331 }
5332 case Intrinsic::x86_tilestored64_internal: {
5333 auto *MFI =
5334 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5335 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5336 unsigned Opc = X86::PTILESTOREDV;
5337 // _tile_stored_internal(row, col, buf, STRIDE, c)
5338 SDValue Base = Node->getOperand(4);
5339 SDValue Scale = getI8Imm(1, dl);
5340 SDValue Index = Node->getOperand(5);
5341 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5342 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5343 SDValue Chain = Node->getOperand(0);
5344 MachineSDNode *CNode;
5345 SDValue Ops[] = {Node->getOperand(2),
5346 Node->getOperand(3),
5347 Base,
5348 Scale,
5349 Index,
5350 Disp,
5351 Segment,
5352 Node->getOperand(6),
5353 Chain};
5354 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5355 ReplaceNode(Node, CNode);
5356 return;
5357 }
5358 case Intrinsic::x86_tileloaddrs64:
5359 case Intrinsic::x86_tileloaddrst164:
5360 if (!Subtarget->hasAMXMOVRS())
5361 break;
5362 [[fallthrough]];
5363 case Intrinsic::x86_tileloadd64:
5364 case Intrinsic::x86_tileloaddt164:
5365 case Intrinsic::x86_tilestored64: {
5366 if (!Subtarget->hasAMXTILE())
5367 break;
5368 auto *MFI =
5369 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5370 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5371 unsigned Opc;
5372 switch (IntNo) {
5373 default: llvm_unreachable("Unexpected intrinsic!");
5374 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5375 case Intrinsic::x86_tileloaddrs64:
5376 Opc = X86::PTILELOADDRS;
5377 break;
5378 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5379 case Intrinsic::x86_tileloaddrst164:
5380 Opc = X86::PTILELOADDRST1;
5381 break;
5382 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5383 }
5384 // FIXME: Match displacement and scale.
5385 unsigned TIndex = Node->getConstantOperandVal(2);
5386 SDValue TReg = getI8Imm(TIndex, dl);
5387 SDValue Base = Node->getOperand(3);
5388 SDValue Scale = getI8Imm(1, dl);
5389 SDValue Index = Node->getOperand(4);
5390 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5391 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5392 SDValue Chain = Node->getOperand(0);
5393 MachineSDNode *CNode;
5394 if (Opc == X86::PTILESTORED) {
5395 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5396 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5397 } else {
5398 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5399 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5400 }
5401 ReplaceNode(Node, CNode);
5402 return;
5403 }
5404 }
5405 break;
5406 }
5407 case ISD::BRIND:
5408 case X86ISD::NT_BRIND: {
5409 if (Subtarget->isTarget64BitILP32()) {
5410 // Converts a 32-bit register to a 64-bit, zero-extended version of
5411 // it. This is needed because x86-64 can do many things, but jmp %r32
5412 // ain't one of them.
5413 SDValue Target = Node->getOperand(1);
5414 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5415 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5416 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5417 Node->getOperand(0), ZextTarget);
5418 ReplaceNode(Node, Brind.getNode());
5419 SelectCode(ZextTarget.getNode());
5420 SelectCode(Brind.getNode());
5421 return;
5422 }
5423 break;
5424 }
5425 case X86ISD::GlobalBaseReg:
5426 ReplaceNode(Node, getGlobalBaseReg());
5427 return;
5428
5429 case ISD::BITCAST:
5430 // Just drop all 128/256/512-bit bitcasts.
5431 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5432 NVT == MVT::f128) {
5433 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5434 CurDAG->RemoveDeadNode(Node);
5435 return;
5436 }
5437 break;
5438
5439 case ISD::SRL:
5440 if (matchBitExtract(Node))
5441 return;
5442 [[fallthrough]];
5443 case ISD::SRA:
5444 case ISD::SHL:
5445 if (tryShiftAmountMod(Node))
5446 return;
5447 break;
5448
5449 case X86ISD::VPTERNLOG: {
5450 uint8_t Imm = Node->getConstantOperandVal(3);
5451 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5452 Node->getOperand(1), Node->getOperand(2), Imm))
5453 return;
5454 break;
5455 }
5456
5457 case X86ISD::ANDNP:
5458 if (tryVPTERNLOG(Node))
5459 return;
5460 break;
5461
5462 case ISD::AND:
5463 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5464 // Try to form a masked VPTESTM. Operands can be in either order.
5465 SDValue N0 = Node->getOperand(0);
5466 SDValue N1 = Node->getOperand(1);
5467 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5468 tryVPTESTM(Node, N0, N1))
5469 return;
5470 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5471 tryVPTESTM(Node, N1, N0))
5472 return;
5473 }
5474
5475 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5476 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5477 CurDAG->RemoveDeadNode(Node);
5478 return;
5479 }
5480 if (matchBitExtract(Node))
5481 return;
5482 if (AndImmShrink && shrinkAndImmediate(Node))
5483 return;
5484
5485 [[fallthrough]];
5486 case ISD::OR:
5487 case ISD::XOR:
5488 if (tryShrinkShlLogicImm(Node))
5489 return;
5490 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5491 return;
5492 if (tryVPTERNLOG(Node))
5493 return;
5494
5495 [[fallthrough]];
5496 case ISD::ADD:
5497 if (Opcode == ISD::ADD && matchBitExtract(Node))
5498 return;
5499 [[fallthrough]];
5500 case ISD::SUB: {
5501 // Try to avoid folding immediates with multiple uses for optsize.
5502 // This code tries to select to register form directly to avoid going
5503 // through the isel table which might fold the immediate. We can't change
5504 // the add/sub/and/or/xor-with-immediate patterns in the
5505 // tablegen files to check immediate use count without making the patterns
5506 // unavailable to the fast-isel table.
5507 if (!CurDAG->shouldOptForSize())
5508 break;
5509
5510 // Only handle i8/i16/i32/i64.
5511 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5512 break;
5513
5514 SDValue N0 = Node->getOperand(0);
5515 SDValue N1 = Node->getOperand(1);
5516
5517 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5518 if (!Cst)
5519 break;
5520
5521 int64_t Val = Cst->getSExtValue();
5522
5523 // Make sure it's an immediate that is considered foldable.
5524 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5525 if (!isInt<8>(Val) && !isInt<32>(Val))
5526 break;
5527
5528 // If this can match to INC/DEC, let it go.
5529 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5530 break;
5531
5532 // Check if we should avoid folding this immediate.
5533 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5534 break;
5535
5536 // We should not fold the immediate. So we need a register form instead.
5537 unsigned ROpc, MOpc;
5538 switch (NVT.SimpleTy) {
5539 default: llvm_unreachable("Unexpected VT!");
5540 case MVT::i8:
5541 switch (Opcode) {
5542 default: llvm_unreachable("Unexpected opcode!");
5543 case ISD::ADD:
5544 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5545 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5546 break;
5547 case ISD::SUB:
5548 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5549 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5550 break;
5551 case ISD::AND:
5552 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5553 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5554 break;
5555 case ISD::OR:
5556 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5557 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5558 break;
5559 case ISD::XOR:
5560 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5561 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5562 break;
5563 }
5564 break;
5565 case MVT::i16:
5566 switch (Opcode) {
5567 default: llvm_unreachable("Unexpected opcode!");
5568 case ISD::ADD:
5569 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5570 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5571 break;
5572 case ISD::SUB:
5573 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5574 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5575 break;
5576 case ISD::AND:
5577 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5578 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5579 break;
5580 case ISD::OR:
5581 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5582 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5583 break;
5584 case ISD::XOR:
5585 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5586 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5587 break;
5588 }
5589 break;
5590 case MVT::i32:
5591 switch (Opcode) {
5592 default: llvm_unreachable("Unexpected opcode!");
5593 case ISD::ADD:
5594 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5595 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5596 break;
5597 case ISD::SUB:
5598 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5599 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5600 break;
5601 case ISD::AND:
5602 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5603 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5604 break;
5605 case ISD::OR:
5606 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5607 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5608 break;
5609 case ISD::XOR:
5610 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5611 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5612 break;
5613 }
5614 break;
5615 case MVT::i64:
5616 switch (Opcode) {
5617 default: llvm_unreachable("Unexpected opcode!");
5618 case ISD::ADD:
5619 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5620 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5621 break;
5622 case ISD::SUB:
5623 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5624 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5625 break;
5626 case ISD::AND:
5627 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5628 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5629 break;
5630 case ISD::OR:
5631 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5632 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5633 break;
5634 case ISD::XOR:
5635 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5636 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5637 break;
5638 }
5639 break;
5640 }
5641
5642 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5643
5644 // If this is not a subtract, we can still try to fold a load.
5645 if (Opcode != ISD::SUB) {
5646 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5647 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5648 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5649 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5650 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5651 // Update the chain.
5652 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5653 // Record the mem-refs
5654 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5655 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5656 CurDAG->RemoveDeadNode(Node);
5657 return;
5658 }
5659 }
5660
5661 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5662 return;
5663 }
5664
5665 case X86ISD::SMUL:
5666 // i16/i32/i64 are handled with isel patterns.
5667 if (NVT != MVT::i8)
5668 break;
5669 [[fallthrough]];
5670 case X86ISD::UMUL: {
5671 SDValue N0 = Node->getOperand(0);
5672 SDValue N1 = Node->getOperand(1);
5673
5674 unsigned LoReg, ROpc, MOpc;
5675 switch (NVT.SimpleTy) {
5676 default: llvm_unreachable("Unsupported VT!");
5677 case MVT::i8:
5678 LoReg = X86::AL;
5679 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5680 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5681 break;
5682 case MVT::i16:
5683 LoReg = X86::AX;
5684 ROpc = X86::MUL16r;
5685 MOpc = X86::MUL16m;
5686 break;
5687 case MVT::i32:
5688 LoReg = X86::EAX;
5689 ROpc = X86::MUL32r;
5690 MOpc = X86::MUL32m;
5691 break;
5692 case MVT::i64:
5693 LoReg = X86::RAX;
5694 ROpc = X86::MUL64r;
5695 MOpc = X86::MUL64m;
5696 break;
5697 }
5698
5699 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5700 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5701 // Multiply is commutative.
5702 if (!FoldedLoad) {
5703 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5704 if (FoldedLoad)
5705 std::swap(N0, N1);
5706 }
5707
5708 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5709 N0, SDValue()).getValue(1);
5710
5711 MachineSDNode *CNode;
5712 if (FoldedLoad) {
5713 // i16/i32/i64 use an instruction that produces a low and high result even
5714 // though only the low result is used.
5715 SDVTList VTs;
5716 if (NVT == MVT::i8)
5717 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5718 else
5719 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5720
5721 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5722 InGlue };
5723 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5724
5725 // Update the chain.
5726 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5727 // Record the mem-refs
5728 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5729 } else {
5730 // i16/i32/i64 use an instruction that produces a low and high result even
5731 // though only the low result is used.
5732 SDVTList VTs;
5733 if (NVT == MVT::i8)
5734 VTs = CurDAG->getVTList(NVT, MVT::i32);
5735 else
5736 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5737
5738 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5739 }
5740
5741 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5742 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5743 CurDAG->RemoveDeadNode(Node);
5744 return;
5745 }
5746
5747 case ISD::SMUL_LOHI:
5748 case ISD::UMUL_LOHI: {
5749 SDValue N0 = Node->getOperand(0);
5750 SDValue N1 = Node->getOperand(1);
5751
5752 unsigned Opc, MOpc;
5753 unsigned LoReg, HiReg;
5754 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5755 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5756 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
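 // MULX (BMI2) takes one source implicitly in EDX/RDX, writes the high and
 // low halves to explicit destinations, and leaves EFLAGS untouched; the
 // "Hrr"/"Hrm" forms are used when only the high half has uses.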
5757 switch (NVT.SimpleTy) {
5758 default: llvm_unreachable("Unsupported VT!");
5759 case MVT::i32:
5760 Opc = UseMULXHi ? X86::MULX32Hrr
5761 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5762 : IsSigned ? X86::IMUL32r
5763 : X86::MUL32r;
5764 MOpc = UseMULXHi ? X86::MULX32Hrm
5765 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5766 : IsSigned ? X86::IMUL32m
5767 : X86::MUL32m;
5768 LoReg = UseMULX ? X86::EDX : X86::EAX;
5769 HiReg = X86::EDX;
5770 break;
5771 case MVT::i64:
5772 Opc = UseMULXHi ? X86::MULX64Hrr
5773 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5774 : IsSigned ? X86::IMUL64r
5775 : X86::MUL64r;
5776 MOpc = UseMULXHi ? X86::MULX64Hrm
5777 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5778 : IsSigned ? X86::IMUL64m
5779 : X86::MUL64m;
5780 LoReg = UseMULX ? X86::RDX : X86::RAX;
5781 HiReg = X86::RDX;
5782 break;
5783 }
5784
5785 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5786 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5787 // Multiply is commutative.
5788 if (!foldedLoad) {
5789 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5790 if (foldedLoad)
5791 std::swap(N0, N1);
5792 }
5793
5794 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5795 N0, SDValue()).getValue(1);
5796 SDValue ResHi, ResLo;
5797 if (foldedLoad) {
5798 SDValue Chain;
5799 MachineSDNode *CNode = nullptr;
5800 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5801 InGlue };
5802 if (UseMULXHi) {
5803 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5804 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5805 ResHi = SDValue(CNode, 0);
5806 Chain = SDValue(CNode, 1);
5807 } else if (UseMULX) {
5808 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5809 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5810 ResHi = SDValue(CNode, 0);
5811 ResLo = SDValue(CNode, 1);
5812 Chain = SDValue(CNode, 2);
5813 } else {
5814 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5815 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5816 Chain = SDValue(CNode, 0);
5817 InGlue = SDValue(CNode, 1);
5818 }
5819
5820 // Update the chain.
5821 ReplaceUses(N1.getValue(1), Chain);
5822 // Record the mem-refs
5823 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5824 } else {
5825 SDValue Ops[] = { N1, InGlue };
5826 if (UseMULXHi) {
5827 SDVTList VTs = CurDAG->getVTList(NVT);
5828 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5829 ResHi = SDValue(CNode, 0);
5830 } else if (UseMULX) {
5831 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5832 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5833 ResHi = SDValue(CNode, 0);
5834 ResLo = SDValue(CNode, 1);
5835 } else {
5836 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5837 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5838 InGlue = SDValue(CNode, 0);
5839 }
5840 }
5841
5842 // Copy the low half of the result, if it is needed.
5843 if (!SDValue(Node, 0).use_empty()) {
5844 if (!ResLo) {
5845 assert(LoReg && "Register for low half is not defined!");
5846 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5847 NVT, InGlue);
5848 InGlue = ResLo.getValue(2);
5849 }
5850 ReplaceUses(SDValue(Node, 0), ResLo);
5851 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5852 dbgs() << '\n');
5853 }
5854 // Copy the high half of the result, if it is needed.
5855 if (!SDValue(Node, 1).use_empty()) {
5856 if (!ResHi) {
5857 assert(HiReg && "Register for high half is not defined!");
5858 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5859 NVT, InGlue);
5860 InGlue = ResHi.getValue(2);
5861 }
5862 ReplaceUses(SDValue(Node, 1), ResHi);
5863 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5864 dbgs() << '\n');
5865 }
5866
5867 CurDAG->RemoveDeadNode(Node);
5868 return;
5869 }
5870
5871 case ISD::SDIVREM:
5872 case ISD::UDIVREM: {
5873 SDValue N0 = Node->getOperand(0);
5874 SDValue N1 = Node->getOperand(1);
5875
5876 unsigned ROpc, MOpc;
5877 bool isSigned = Opcode == ISD::SDIVREM;
5878 if (!isSigned) {
5879 switch (NVT.SimpleTy) {
5880 default: llvm_unreachable("Unsupported VT!");
5881 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5882 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5883 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5884 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5885 }
5886 } else {
5887 switch (NVT.SimpleTy) {
5888 default: llvm_unreachable("Unsupported VT!");
5889 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5890 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5891 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5892 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5893 }
5894 }
5895
5896 unsigned LoReg, HiReg, ClrReg;
5897 unsigned SExtOpcode;
5898 switch (NVT.SimpleTy) {
5899 default: llvm_unreachable("Unsupported VT!");
5900 case MVT::i8:
5901 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5902 SExtOpcode = 0; // Not used.
5903 break;
5904 case MVT::i16:
5905 LoReg = X86::AX; HiReg = X86::DX;
5906 ClrReg = X86::DX;
5907 SExtOpcode = X86::CWD;
5908 break;
5909 case MVT::i32:
5910 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5911 SExtOpcode = X86::CDQ;
5912 break;
5913 case MVT::i64:
5914 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5915 SExtOpcode = X86::CQO;
5916 break;
5917 }
5918
5919 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5920 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5921 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5922
5923 SDValue InGlue;
5924 if (NVT == MVT::i8) {
5925 // Special case for div8, just use a move with zero extension to AX to
5926 // clear the upper 8 bits (AH).
5927 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5928 MachineSDNode *Move;
5929 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5930 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5931 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5932 : X86::MOVZX16rm8;
5933 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5934 Chain = SDValue(Move, 1);
5935 ReplaceUses(N0.getValue(1), Chain);
5936 // Record the mem-refs
5937 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5938 } else {
5939 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5940 : X86::MOVZX16rr8;
5941 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5942 Chain = CurDAG->getEntryNode();
5943 }
5944 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5945 SDValue());
5946 InGlue = Chain.getValue(1);
5947 } else {
5948 InGlue =
5949 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5950 LoReg, N0, SDValue()).getValue(1);
5951 if (isSigned && !signBitIsZero) {
5952 // Sign extend the low part into the high part.
5953 InGlue =
5954 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5955 } else {
5956 // Zero out the high part, effectively zero extending the input.
5957 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5958 SDValue ClrNode =
5959 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
5960 switch (NVT.SimpleTy) {
5961 case MVT::i16:
5962 ClrNode =
5963 SDValue(CurDAG->getMachineNode(
5964 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5965 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5966 MVT::i32)),
5967 0);
5968 break;
5969 case MVT::i32:
5970 break;
5971 case MVT::i64:
5972 ClrNode =
5973 SDValue(CurDAG->getMachineNode(
5974 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5975 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5976 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5977 MVT::i32)),
5978 0);
5979 break;
5980 default:
5981 llvm_unreachable("Unexpected division source");
5982 }
5983
5984 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5985 ClrNode, InGlue).getValue(1);
5986 }
5987 }
5988
5989 if (foldedLoad) {
5990 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5991 InGlue };
5992 MachineSDNode *CNode =
5993 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5994 InGlue = SDValue(CNode, 1);
5995 // Update the chain.
5996 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5997 // Record the mem-refs
5998 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5999 } else {
6000 InGlue =
6001 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
6002 }
6003
6004 // Prevent use of AH in a REX instruction by explicitly copying it to
6005 // an ABCD_L register.
6006 //
6007 // The current assumption of the register allocator is that isel
6008 // won't generate explicit references to the GR8_ABCD_H registers. If
6009 // the allocator and/or the backend get enhanced to be more robust in
6010 // that regard, this can be, and should be, removed.
6011 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6012 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
6013 unsigned AHExtOpcode =
6014 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6015
6016 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
6017 MVT::Glue, AHCopy, InGlue);
6018 SDValue Result(RNode, 0);
6019 InGlue = SDValue(RNode, 1);
6020
6021 Result =
6022 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
6023
6024 ReplaceUses(SDValue(Node, 1), Result);
6025 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6026 dbgs() << '\n');
6027 }
6028 // Copy the division (low) result, if it is needed.
6029 if (!SDValue(Node, 0).use_empty()) {
6030 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6031 LoReg, NVT, InGlue);
6032 InGlue = Result.getValue(2);
6033 ReplaceUses(SDValue(Node, 0), Result);
6034 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6035 dbgs() << '\n');
6036 }
6037 // Copy the remainder (high) result, if it is needed.
6038 if (!SDValue(Node, 1).use_empty()) {
6039 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6040 HiReg, NVT, InGlue);
6041 InGlue = Result.getValue(2);
6042 ReplaceUses(SDValue(Node, 1), Result);
6043 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6044 dbgs() << '\n');
6045 }
6046 CurDAG->RemoveDeadNode(Node);
6047 return;
6048 }
6049
6050 case X86ISD::FCMP:
6051 case X86ISD::STRICT_FCMP:
6052 case X86ISD::STRICT_FCMPS: {
6053 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6054 Node->getOpcode() == X86ISD::STRICT_FCMPS;
6055 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
6056 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
6057
6058 // Save the original VT of the compare.
6059 MVT CmpVT = N0.getSimpleValueType();
6060
6061 // Floating point needs special handling if we don't have FCOMI.
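 // Without FCOMI the compare is done with an x87 (U)COM, the status word is
 // moved to AX with FNSTSW, and SAHF transfers the condition bits into EFLAGS.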
6062 if (Subtarget->canUseCMOV())
6063 break;
6064
6065 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6066
6067 unsigned Opc;
6068 switch (CmpVT.SimpleTy) {
6069 default: llvm_unreachable("Unexpected type!");
6070 case MVT::f32:
6071 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6072 break;
6073 case MVT::f64:
6074 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6075 break;
6076 case MVT::f80:
6077 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6078 break;
6079 }
6080
6081 SDValue Chain =
6082 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
6083 SDValue Glue;
6084 if (IsStrictCmp) {
6085 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
6086 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
6087 Glue = Chain.getValue(1);
6088 } else {
6089 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
6090 }
6091
6092 // Move FPSW to AX.
6093 SDValue FNSTSW =
6094 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
6095
6096 // Extract upper 8-bits of AX.
6097 SDValue Extract =
6098 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
6099
6100 // Move AH into flags.
6101 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6102 assert(Subtarget->canUseLAHFSAHF() &&
6103 "Target doesn't support SAHF or FCOMI?");
6104 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
6105 Chain = AH;
6106 SDValue SAHF = SDValue(
6107 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
6108
6109 if (IsStrictCmp)
6110 ReplaceUses(SDValue(Node, 1), Chain);
6111
6112 ReplaceUses(SDValue(Node, 0), SAHF);
6113 CurDAG->RemoveDeadNode(Node);
6114 return;
6115 }
6116
6117 case X86ISD::CMP: {
6118 SDValue N0 = Node->getOperand(0);
6119 SDValue N1 = Node->getOperand(1);
6120
6121 // Optimizations for TEST compares.
6122 if (!isNullConstant(N1))
6123 break;
6124
6125 // Save the original VT of the compare.
6126 MVT CmpVT = N0.getSimpleValueType();
6127
6128 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6129 // by a test instruction. The test should be removed later by
6130 // analyzeCompare if we are using only the zero flag.
6131 // TODO: Should we check the users and use the BEXTR flags directly?
6132 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6133 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6134 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6135 : X86::TEST32rr;
6136 SDValue BEXTR = SDValue(NewNode, 0);
6137 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6138 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6139 CurDAG->RemoveDeadNode(Node);
6140 return;
6141 }
6142 }
6143
6144 // We can peek through truncates, but we need to be careful below.
6145 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6146 N0 = N0.getOperand(0);
6147
6148 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6149 // use a smaller encoding.
6150 // Look past the truncate if CMP is the only use of it.
6151 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6152 N0.getValueType() != MVT::i8) {
6153 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6154 if (!MaskC)
6155 break;
6156
6157 // We may have looked through a truncate so mask off any bits that
6158 // shouldn't be part of the compare.
6159 uint64_t Mask = MaskC->getZExtValue();
6160 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6161
6162 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6163 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6164 // zero flag.
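 // For example, (and X, 0x0000ff0000000000) == 0 becomes a SHR64ri by 40
 // followed by TEST8rr on the low byte of the shifted value (when the mask
 // constant has a single use, per the checks below).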
6165 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6166 onlyUsesZeroFlag(SDValue(Node, 0))) {
6167 unsigned ShiftOpcode = ISD::DELETED_NODE;
6168 unsigned ShiftAmt;
6169 unsigned SubRegIdx;
6170 MVT SubRegVT;
6171 unsigned TestOpcode;
6172 unsigned LeadingZeros = llvm::countl_zero(Mask);
6173 unsigned TrailingZeros = llvm::countr_zero(Mask);
6174
6175 // With leading/trailing zeros, the transform is profitable if we can
6176 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6177 // incurring any extra register moves.
6178 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6179 if (LeadingZeros == 0 && SavesBytes) {
6180 // If the mask covers the most significant bit, then we can replace
6181 // TEST+AND with a SHR and check eflags.
6182 // This emits a redundant TEST which is subsequently eliminated.
6183 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6184 ShiftAmt = TrailingZeros;
6185 SubRegIdx = 0;
6186 TestOpcode = X86::TEST64rr;
6187 } else if (TrailingZeros == 0 && SavesBytes) {
6188 // If the mask covers the least significant bit, then we can replace
6189 // TEST+AND with a SHL and check eflags.
6190 // This emits a redundant TEST which is subsequently eliminated.
6191 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6192 ShiftAmt = LeadingZeros;
6193 SubRegIdx = 0;
6194 TestOpcode = X86::TEST64rr;
6195 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6196 // If the shifted mask extends into the high half and is 8/16/32 bits
6197 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6198 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6199 if (PopCount == 8) {
6200 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6201 ShiftAmt = TrailingZeros;
6202 SubRegIdx = X86::sub_8bit;
6203 SubRegVT = MVT::i8;
6204 TestOpcode = X86::TEST8rr;
6205 } else if (PopCount == 16) {
6206 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6207 ShiftAmt = TrailingZeros;
6208 SubRegIdx = X86::sub_16bit;
6209 SubRegVT = MVT::i16;
6210 TestOpcode = X86::TEST16rr;
6211 } else if (PopCount == 32) {
6212 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6213 ShiftAmt = TrailingZeros;
6214 SubRegIdx = X86::sub_32bit;
6215 SubRegVT = MVT::i32;
6216 TestOpcode = X86::TEST32rr;
6217 }
6218 }
6219 if (ShiftOpcode != ISD::DELETED_NODE) {
6220 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6221 SDValue Shift = SDValue(
6222 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6223 N0.getOperand(0), ShiftC),
6224 0);
6225 if (SubRegIdx != 0) {
6226 Shift =
6227 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6228 }
6229 MachineSDNode *Test =
6230 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6231 ReplaceNode(Node, Test);
6232 return;
6233 }
6234 }
6235
6236 MVT VT;
6237 int SubRegOp;
6238 unsigned ROpc, MOpc;
6239
6240 // For each of these checks we need to be careful if the sign flag is
6241 // being used. It is only safe to use the sign flag in two conditions,
6242 // either the sign bit in the shrunken mask is zero or the final test
6243 // size is equal to the original compare size.
6244
6245 if (isUInt<8>(Mask) &&
6246 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6247 hasNoSignFlagUses(SDValue(Node, 0)))) {
6248 // For example, convert "testl %eax, $8" to "testb %al, $8"
6249 VT = MVT::i8;
6250 SubRegOp = X86::sub_8bit;
6251 ROpc = X86::TEST8ri;
6252 MOpc = X86::TEST8mi;
6253 } else if (OptForMinSize && isUInt<16>(Mask) &&
6254 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6255 hasNoSignFlagUses(SDValue(Node, 0)))) {
6256 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6257 // NOTE: We only want to form TESTW instructions if optimizing for
6258 // min size. Otherwise we only save one byte and possibly get a length
6259 // changing prefix penalty in the decoders.
6260 VT = MVT::i16;
6261 SubRegOp = X86::sub_16bit;
6262 ROpc = X86::TEST16ri;
6263 MOpc = X86::TEST16mi;
6264 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6265 ((!(Mask & 0x80000000) &&
6266 // Without minsize, 16-bit compares can get here, so we need to
6267 // be sure we calculate the correct sign flag if needed.
6268 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6269 CmpVT == MVT::i32 ||
6270 hasNoSignFlagUses(SDValue(Node, 0)))) {
6271 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6272 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6273 // Otherwise, we find ourselves in a position where we have to do
6274 // promotion. If previous passes did not promote the and, we assume
6275 // they had a good reason not to and do not promote here.
6276 VT = MVT::i32;
6277 SubRegOp = X86::sub_32bit;
6278 ROpc = X86::TEST32ri;
6279 MOpc = X86::TEST32mi;
6280 } else {
6281 // No eligible transformation was found.
6282 break;
6283 }
6284
6285 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6286 SDValue Reg = N0.getOperand(0);
6287
6288 // Emit a testl or testw.
6289 MachineSDNode *NewNode;
6290 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6291 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6292 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6293 if (!LoadN->isSimple()) {
6294 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6295 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6296 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6297 (MOpc == X86::TEST32mi && NumVolBits != 32))
6298 break;
6299 }
6300 }
6301 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6302 Reg.getOperand(0) };
6303 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6304 // Update the chain.
6305 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6306 // Record the mem-refs
6307 CurDAG->setNodeMemRefs(NewNode,
6308 {cast<LoadSDNode>(Reg)->getMemOperand()});
6309 } else {
6310 // Extract the subregister if necessary.
6311 if (N0.getValueType() != VT)
6312 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6313
6314 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6315 }
6316 // Replace CMP with TEST.
6317 ReplaceNode(Node, NewNode);
6318 return;
6319 }
6320 break;
6321 }
6322 case X86ISD::PCMPISTR: {
6323 if (!Subtarget->hasSSE42())
6324 break;
6325
6326 bool NeedIndex = !SDValue(Node, 0).use_empty();
6327 bool NeedMask = !SDValue(Node, 1).use_empty();
6328 // We can't fold a load if we are going to make two instructions.
6329 bool MayFoldLoad = !NeedIndex || !NeedMask;
6330
6331 MachineSDNode *CNode;
6332 if (NeedMask) {
6333 unsigned ROpc =
6334 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6335 unsigned MOpc =
6336 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6337 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6338 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6339 }
6340 if (NeedIndex || !NeedMask) {
6341 unsigned ROpc =
6342 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6343 unsigned MOpc =
6344 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6345 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6346 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6347 }
6348
6349 // Connect the flag usage to the last instruction created.
6350 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6351 CurDAG->RemoveDeadNode(Node);
6352 return;
6353 }
6354 case X86ISD::PCMPESTR: {
6355 if (!Subtarget->hasSSE42())
6356 break;
6357
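    // PCMPESTR additionally takes the explicit string lengths in EAX and EDX.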
6358 // Copy the two implicit register inputs.
6359 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6360 Node->getOperand(1),
6361 SDValue()).getValue(1);
6362 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6363 Node->getOperand(3), InGlue).getValue(1);
6364
6365 bool NeedIndex = !SDValue(Node, 0).use_empty();
6366 bool NeedMask = !SDValue(Node, 1).use_empty();
6367 // We can't fold a load if we are going to make two instructions.
6368 bool MayFoldLoad = !NeedIndex || !NeedMask;
6369
6370 MachineSDNode *CNode;
6371 if (NeedMask) {
6372 unsigned ROpc =
6373 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6374 unsigned MOpc =
6375 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6376 CNode =
6377 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6378 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6379 }
6380 if (NeedIndex || !NeedMask) {
6381 unsigned ROpc =
6382 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6383 unsigned MOpc =
6384 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6385 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6386 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6387 }
6388 // Connect the flag usage to the last instruction created.
6389 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6390 CurDAG->RemoveDeadNode(Node);
6391 return;
6392 }
6393
6394 case ISD::SETCC: {
6395 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6396 return;
6397
6398 break;
6399 }
6400
6401 case ISD::STORE:
6402 if (foldLoadStoreIntoMemOperand(Node))
6403 return;
6404 break;
6405
6406 case X86ISD::SETCC_CARRY: {
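    // SETCC_CARRY materializes 0 or all-ones from the carry flag. When the
    // target breaks the same-register sbb dependency, emit SETB_C32r/SETB_C64r
    // directly; otherwise fall back to an explicitly zeroed SBB (getSBBZero).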
6407     MVT VT = Node->getSimpleValueType(0);
6408     SDValue Result;
6409     if (Subtarget->hasSBBDepBreaking()) {
6410 // We have to do this manually because tblgen will put the eflags copy in
6411 // the wrong place if we use an extract_subreg in the pattern.
6412 // Copy flags to the EFLAGS register and glue it to next node.
6413 SDValue EFLAGS =
6414 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6415 Node->getOperand(1), SDValue());
6416
6417 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6418 // 32-bit version.
6419 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6420 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6421 Result = SDValue(
6422 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6423 0);
6424 } else {
6425 // The target does not recognize sbb with the same reg operand as a
6426 // no-source idiom, so we explicitly zero the input values.
6427 Result = getSBBZero(Node);
6428 }
6429
6430 // For less than 32-bits we need to extract from the 32-bit node.
6431 if (VT == MVT::i8 || VT == MVT::i16) {
6432 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6433 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6434 }
6435
6436 ReplaceUses(SDValue(Node, 0), Result);
6437 CurDAG->RemoveDeadNode(Node);
6438 return;
6439 }
6440 case X86ISD::SBB: {
6441 if (isNullConstant(Node->getOperand(0)) &&
6442 isNullConstant(Node->getOperand(1))) {
6443 SDValue Result = getSBBZero(Node);
6444
6445 // Replace the flag use.
6446 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6447
6448 // Replace the result use.
6449 if (!SDValue(Node, 0).use_empty()) {
6450 // For less than 32-bits we need to extract from the 32-bit node.
6451 MVT VT = Node->getSimpleValueType(0);
6452 if (VT == MVT::i8 || VT == MVT::i16) {
6453 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6454 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6455 }
6456 ReplaceUses(SDValue(Node, 0), Result);
6457 }
6458
6459 CurDAG->RemoveDeadNode(Node);
6460 return;
6461 }
6462 break;
6463 }
6464 case X86ISD::MGATHER: {
6465 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6466 SDValue IndexOp = Mgt->getIndex();
6467 SDValue Mask = Mgt->getMask();
6468 MVT IndexVT = IndexOp.getSimpleValueType();
6469 MVT ValueVT = Node->getSimpleValueType(0);
6470 MVT MaskVT = Mask.getSimpleValueType();
6471
6472     // This is just to prevent crashes if the nodes are malformed somehow.
6473     // We're otherwise only doing loose type checking here, based on what a
6474     // type constraint would say, just like table-based isel.
6475 if (!ValueVT.isVector() || !MaskVT.isVector())
6476 break;
6477
6478 unsigned NumElts = ValueVT.getVectorNumElements();
6479 MVT ValueSVT = ValueVT.getVectorElementType();
6480
6481 bool IsFP = ValueSVT.isFloatingPoint();
6482 unsigned EltSize = ValueSVT.getSizeInBits();
6483
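    // An i1-element mask means this is an AVX-512 gather (k-register mask);
    // otherwise it is an AVX2-style gather with a full-width vector mask and
    // a different operand order (see below).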
6484 unsigned Opc = 0;
6485 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6486 if (AVX512Gather) {
6487 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6488 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6489 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6490 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6491 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6492 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6493 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6494 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6495 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6496 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6497 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6498 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6499 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6500 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6501 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6502 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6503 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6504 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6505 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6506 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6507 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6508 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6509 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6510 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6511 } else {
6512 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6513 "Unexpected mask VT!");
6514 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6515 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6516 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6517 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6518 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6519 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6520 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6521 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6522 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6523 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6524 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6525 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6526 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6527 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6528 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6529 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6530 }
6531
6532 if (!Opc)
6533 break;
6534
6535 SDValue Base, Scale, Index, Disp, Segment;
6536 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6537 Base, Scale, Index, Disp, Segment))
6538 break;
6539
6540 SDValue PassThru = Mgt->getPassThru();
6541 SDValue Chain = Mgt->getChain();
6542 // Gather instructions have a mask output not in the ISD node.
6543 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6544
6545 MachineSDNode *NewNode;
6546 if (AVX512Gather) {
6547 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6548 Index, Disp, Segment, Chain};
6549 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6550 } else {
6551 SDValue Ops[] = {PassThru, Base, Scale, Index,
6552 Disp, Segment, Mask, Chain};
6553 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6554 }
6555 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6556 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6557 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6558 CurDAG->RemoveDeadNode(Node);
6559 return;
6560 }
6561 case X86ISD::MSCATTER: {
6562 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6563 SDValue Value = Sc->getValue();
6564 SDValue IndexOp = Sc->getIndex();
6565 MVT IndexVT = IndexOp.getSimpleValueType();
6566 MVT ValueVT = Value.getSimpleValueType();
6567
6568     // This is just to prevent crashes if the nodes are malformed somehow.
6569     // We're otherwise only doing loose type checking here, based on what a
6570     // type constraint would say, just like table-based isel.
6571 if (!ValueVT.isVector())
6572 break;
6573
6574 unsigned NumElts = ValueVT.getVectorNumElements();
6575 MVT ValueSVT = ValueVT.getVectorElementType();
6576
6577 bool IsFP = ValueSVT.isFloatingPoint();
6578 unsigned EltSize = ValueSVT.getSizeInBits();
6579
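    // Scatters are AVX-512 only. Pick the opcode from the index type, the
    // number of elements and the scalar element size.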
6580 unsigned Opc;
6581 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6582 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6583 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6584 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6585 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6586 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6587 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6588 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6589 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6590 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6591 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6592 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6593 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6594 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6595 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6596 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6597 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6598 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6599 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6600 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6601 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6602 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6603 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6604 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6605 else
6606 break;
6607
6608 SDValue Base, Scale, Index, Disp, Segment;
6609 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6610 Base, Scale, Index, Disp, Segment))
6611 break;
6612
6613 SDValue Mask = Sc->getMask();
6614 SDValue Chain = Sc->getChain();
6615 // Scatter instructions have a mask output not in the ISD node.
6616 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6617 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6618
6619 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6620 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6621 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6622 CurDAG->RemoveDeadNode(Node);
6623 return;
6624 }
6625   case ISD::PREALLOCATED_SETUP: {
6626     auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6627 auto CallId = MFI->getPreallocatedIdForCallSite(
6628 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6629 SDValue Chain = Node->getOperand(0);
6630 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6631 MachineSDNode *New = CurDAG->getMachineNode(
6632 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6633 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6634 CurDAG->RemoveDeadNode(Node);
6635 return;
6636 }
6637 case ISD::PREALLOCATED_ARG: {
6638 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6639 auto CallId = MFI->getPreallocatedIdForCallSite(
6640 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6641 SDValue Chain = Node->getOperand(0);
6642 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6643 SDValue ArgIndex = Node->getOperand(2);
6644 SDValue Ops[3];
6645 Ops[0] = CallIdValue;
6646 Ops[1] = ArgIndex;
6647 Ops[2] = Chain;
6648 MachineSDNode *New = CurDAG->getMachineNode(
6649 TargetOpcode::PREALLOCATED_ARG, dl,
6650 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6651 MVT::Other),
6652 Ops);
6653 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6654 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6655 CurDAG->RemoveDeadNode(Node);
6656 return;
6657 }
6658   case X86ISD::AESENCWIDE128KL:
6659   case X86ISD::AESDECWIDE128KL:
6660   case X86ISD::AESENCWIDE256KL:
6661   case X86ISD::AESDECWIDE256KL: {
6662     if (!Subtarget->hasWIDEKL())
6663 break;
6664
6665 unsigned Opcode;
6666 switch (Node->getOpcode()) {
6667 default:
6668       llvm_unreachable("Unexpected opcode!");
6669     case X86ISD::AESENCWIDE128KL:
6670       Opcode = X86::AESENCWIDE128KL;
6671       break;
6672     case X86ISD::AESDECWIDE128KL:
6673       Opcode = X86::AESDECWIDE128KL;
6674       break;
6675     case X86ISD::AESENCWIDE256KL:
6676       Opcode = X86::AESENCWIDE256KL;
6677       break;
6678     case X86ISD::AESDECWIDE256KL:
6679       Opcode = X86::AESDECWIDE256KL;
6680       break;
6681     }
6682
6683 SDValue Chain = Node->getOperand(0);
6684 SDValue Addr = Node->getOperand(1);
6685
6686 SDValue Base, Scale, Index, Disp, Segment;
6687 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6688 break;
6689
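    // The WIDE Key Locker forms process eight blocks passed implicitly in
    // XMM0-XMM7; copy the eight data operands into those registers, threading
    // the chain and glue through each copy.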
6690 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6691 SDValue());
6692 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6693 Chain.getValue(1));
6694 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6695 Chain.getValue(1));
6696 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6697 Chain.getValue(1));
6698 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6699 Chain.getValue(1));
6700 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6701 Chain.getValue(1));
6702 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6703 Chain.getValue(1));
6704 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6705 Chain.getValue(1));
6706
6707 MachineSDNode *Res = CurDAG->getMachineNode(
6708 Opcode, dl, Node->getVTList(),
6709 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6710 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6711 ReplaceNode(Node, Res);
6712 return;
6713 }
6714   case X86ISD::POP_FROM_X87_REG: {
6715     SDValue Chain = Node->getOperand(0);
6716 Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
6717 SDValue Glue;
6718 if (Node->getNumValues() == 3)
6719 Glue = Node->getOperand(2);
6720 SDValue Copy =
6721 CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
6722 ReplaceNode(Node, Copy.getNode());
6723 return;
6724 }
6725 }
6726
6727 SelectCode(Node);
6728}
6729
6730bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6731 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6732 std::vector<SDValue> &OutOps) {
6733 SDValue Op0, Op1, Op2, Op3, Op4;
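  // Every supported memory constraint is lowered to the standard x86
  // five-operand memory reference: base, scale, index, displacement, segment.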
6734 switch (ConstraintID) {
6735 default:
6736 llvm_unreachable("Unexpected asm memory constraint");
6737 case InlineAsm::ConstraintCode::o: // offsetable ??
6738 case InlineAsm::ConstraintCode::v: // not offsetable ??
6739 case InlineAsm::ConstraintCode::m: // memory
6740 case InlineAsm::ConstraintCode::X:
6741 case InlineAsm::ConstraintCode::p: // address
6742 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6743 return true;
6744 break;
6745 }
6746
6747 OutOps.push_back(Op0);
6748 OutOps.push_back(Op1);
6749 OutOps.push_back(Op2);
6750 OutOps.push_back(Op3);
6751 OutOps.push_back(Op4);
6752 return false;
6753}
6754
6755 X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6756     : SelectionDAGISelPass(
6757           std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6758
6759/// This pass converts a legalized DAG into a X86-specific DAG,
6760/// ready for instruction scheduling.
6761 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6762                                      CodeGenOptLevel OptLevel) {
6763 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6764}