LLVM 19.0.0git
X86ISelDAGToDAG.cpp
1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86.h"
15#include "X86MachineFunctionInfo.h"
16#include "X86RegisterInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/CodeGen/MachineModuleInfo.h"
21#include "llvm/CodeGen/SelectionDAGISel.h"
22#include "llvm/Config/llvm-config.h"
23#include "llvm/IR/ConstantRange.h"
24#include "llvm/IR/Function.h"
25#include "llvm/IR/Instructions.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Type.h"
29#include "llvm/Support/Debug.h"
30#include "llvm/Support/ErrorHandling.h"
31#include "llvm/Support/KnownBits.h"
32#include "llvm/Support/MathExtras.h"
33#include <cstdint>
34
35using namespace llvm;
36
37#define DEBUG_TYPE "x86-isel"
38#define PASS_NAME "X86 DAG->DAG Instruction Selection"
39
40STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
41
42static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
43 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
44 cl::Hidden);
45
46 static cl::opt<bool> EnablePromoteAnyextLoad(
47 "x86-promote-anyext-load", cl::init(true),
48 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
49
50 extern cl::opt<bool> IndirectBranchTracking;
51
52//===----------------------------------------------------------------------===//
53// Pattern Matcher Implementation
54//===----------------------------------------------------------------------===//
55
56namespace {
57 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
58 /// numbers for the leaves of the matched tree.
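 /// For example, the address [rbx + rsi*4 + 20] would be described with
 /// BaseType == RegBase, Base_Reg = rbx, IndexReg = rsi, Scale = 4 and
 /// Disp = 20, while symbolic addresses populate GV/CP/ES/JT/BlockAddr instead.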
59 struct X86ISelAddressMode {
60 enum {
61 RegBase,
62 FrameIndexBase
63 } BaseType = RegBase;
64
65 // This is really a union, discriminated by BaseType!
66 SDValue Base_Reg;
67 int Base_FrameIndex = 0;
68
69 unsigned Scale = 1;
70 SDValue IndexReg;
71 int32_t Disp = 0;
72 SDValue Segment;
73 const GlobalValue *GV = nullptr;
74 const Constant *CP = nullptr;
75 const BlockAddress *BlockAddr = nullptr;
76 const char *ES = nullptr;
77 MCSymbol *MCSym = nullptr;
78 int JT = -1;
79 Align Alignment; // CP alignment.
80 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
81 bool NegateIndex = false;
82
83 X86ISelAddressMode() = default;
84
85 bool hasSymbolicDisplacement() const {
86 return GV != nullptr || CP != nullptr || ES != nullptr ||
87 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
88 }
89
90 bool hasBaseOrIndexReg() const {
91 return BaseType == FrameIndexBase ||
92 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
93 }
94
95 /// Return true if this addressing mode is already RIP-relative.
96 bool isRIPRelative() const {
97 if (BaseType != RegBase) return false;
98 if (RegisterSDNode *RegNode =
99 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
100 return RegNode->getReg() == X86::RIP;
101 return false;
102 }
103
104 void setBaseReg(SDValue Reg) {
105 BaseType = RegBase;
106 Base_Reg = Reg;
107 }
108
109#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
110 void dump(SelectionDAG *DAG = nullptr) {
111 dbgs() << "X86ISelAddressMode " << this << '\n';
112 dbgs() << "Base_Reg ";
113 if (Base_Reg.getNode())
114 Base_Reg.getNode()->dump(DAG);
115 else
116 dbgs() << "nul\n";
117 if (BaseType == FrameIndexBase)
118 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
119 dbgs() << " Scale " << Scale << '\n'
120 << "IndexReg ";
121 if (NegateIndex)
122 dbgs() << "negate ";
123 if (IndexReg.getNode())
124 IndexReg.getNode()->dump(DAG);
125 else
126 dbgs() << "nul\n";
127 dbgs() << " Disp " << Disp << '\n'
128 << "GV ";
129 if (GV)
130 GV->dump();
131 else
132 dbgs() << "nul";
133 dbgs() << " CP ";
134 if (CP)
135 CP->dump();
136 else
137 dbgs() << "nul";
138 dbgs() << '\n'
139 << "ES ";
140 if (ES)
141 dbgs() << ES;
142 else
143 dbgs() << "nul";
144 dbgs() << " MCSym ";
145 if (MCSym)
146 dbgs() << MCSym;
147 else
148 dbgs() << "nul";
149 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
150 }
151#endif
152 };
153}
154
155namespace {
156 //===--------------------------------------------------------------------===//
157 /// ISel - X86-specific code to select X86 machine instructions for
158 /// SelectionDAG operations.
159 ///
160 class X86DAGToDAGISel final : public SelectionDAGISel {
161 /// Keep a pointer to the X86Subtarget around so that we can
162 /// make the right decision when generating code for different targets.
163 const X86Subtarget *Subtarget;
164
165 /// If true, selector should try to optimize for minimum code size.
166 bool OptForMinSize;
167
168 /// Disable direct TLS access through segment registers.
169 bool IndirectTlsSegRefs;
170
171 public:
172 static char ID;
173
174 X86DAGToDAGISel() = delete;
175
176 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
177 : SelectionDAGISel(ID, tm, OptLevel), Subtarget(nullptr),
178 OptForMinSize(false), IndirectTlsSegRefs(false) {}
179
180 bool runOnMachineFunction(MachineFunction &MF) override {
181 // Reset the subtarget each time through.
182 Subtarget = &MF.getSubtarget<X86Subtarget>();
183 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
184 "indirect-tls-seg-refs");
185
186 // OptFor[Min]Size are used in pattern predicates that isel is matching.
187 OptForMinSize = MF.getFunction().hasMinSize();
188 assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
189 "OptForMinSize implies OptForSize");
190
191 SelectionDAGISel::runOnMachineFunction(MF);
192 return true;
193 }
194
195 void emitFunctionEntryCode() override;
196
197 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
198
199 void PreprocessISelDAG() override;
200 void PostprocessISelDAG() override;
201
202// Include the pieces autogenerated from the target description.
203#include "X86GenDAGISel.inc"
204
205 private:
206 void Select(SDNode *N) override;
207
208 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
209 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
210 bool AllowSegmentRegForX32 = false);
211 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
212 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
213 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
214 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
215 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
216 unsigned Depth);
217 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218 unsigned Depth);
219 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
220 unsigned Depth);
221 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
222 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
223 SDValue &Scale, SDValue &Index, SDValue &Disp,
224 SDValue &Segment);
225 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
226 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
227 SDValue &Index, SDValue &Disp, SDValue &Segment);
228 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
229 bool selectLEAAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
233 SDValue &Scale, SDValue &Index, SDValue &Disp,
234 SDValue &Segment);
235 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
236 SDValue &Scale, SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238 bool selectRelocImm(SDValue N, SDValue &Op);
239
240 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment);
244
245 // Convenience method where P is also root.
246 bool tryFoldLoad(SDNode *P, SDValue N,
247 SDValue &Base, SDValue &Scale,
248 SDValue &Index, SDValue &Disp,
249 SDValue &Segment) {
250 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
251 }
252
253 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
254 SDValue &Base, SDValue &Scale,
255 SDValue &Index, SDValue &Disp,
256 SDValue &Segment);
257
258 bool isProfitableToFormMaskedOp(SDNode *N) const;
259
260 /// Implement addressing mode selection for inline asm expressions.
261 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
262 InlineAsm::ConstraintCode ConstraintID,
263 std::vector<SDValue> &OutOps) override;
264
265 void emitSpecialCodeForMain();
266
267 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
268 MVT VT, SDValue &Base, SDValue &Scale,
269 SDValue &Index, SDValue &Disp,
270 SDValue &Segment) {
271 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
272 Base = CurDAG->getTargetFrameIndex(
273 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
274 else if (AM.Base_Reg.getNode())
275 Base = AM.Base_Reg;
276 else
277 Base = CurDAG->getRegister(0, VT);
278
279 Scale = getI8Imm(AM.Scale, DL);
280
281#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
282 // Negate the index if needed.
283 if (AM.NegateIndex) {
284 unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
285 : GET_ND_IF_ENABLED(X86::NEG32r);
286 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
287 AM.IndexReg), 0);
288 AM.IndexReg = Neg;
289 }
290
291 if (AM.IndexReg.getNode())
292 Index = AM.IndexReg;
293 else
294 Index = CurDAG->getRegister(0, VT);
295
296 // These are 32-bit even in 64-bit mode since RIP-relative offset
297 // is 32-bit.
298 if (AM.GV)
299 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
300 MVT::i32, AM.Disp,
301 AM.SymbolFlags);
302 else if (AM.CP)
303 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
304 AM.Disp, AM.SymbolFlags);
305 else if (AM.ES) {
306 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
307 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
308 } else if (AM.MCSym) {
309 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
310 assert(AM.SymbolFlags == 0 && "oo");
311 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
312 } else if (AM.JT != -1) {
313 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
314 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
315 } else if (AM.BlockAddr)
316 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
317 AM.SymbolFlags);
318 else
319 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
320
321 if (AM.Segment.getNode())
322 Segment = AM.Segment;
323 else
324 Segment = CurDAG->getRegister(0, MVT::i16);
325 }
326
327 // Utility function to determine whether we should avoid selecting
328 // immediate forms of instructions for better code size or not.
329 // At a high level, we'd like to avoid such instructions when
330 // we have similar constants used within the same basic block
331 // that can be kept in a register.
332 //
333 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
334 uint32_t UseCount = 0;
335
336 // Do not want to hoist if we're not optimizing for size.
337 // TODO: We'd like to remove this restriction.
338 // See the comment in X86InstrInfo.td for more info.
339 if (!CurDAG->shouldOptForSize())
340 return false;
341
342 // Walk all the users of the immediate.
343 for (const SDNode *User : N->uses()) {
344 if (UseCount >= 2)
345 break;
346
347 // This user is already selected. Count it as a legitimate use and
348 // move on.
349 if (User->isMachineOpcode()) {
350 UseCount++;
351 continue;
352 }
353
354 // We want to count stores of immediates as real uses.
355 if (User->getOpcode() == ISD::STORE &&
356 User->getOperand(1).getNode() == N) {
357 UseCount++;
358 continue;
359 }
360
361 // We don't currently match users that have > 2 operands (except
362 // for stores, which are handled above)
363 // Those instructions won't match in ISel, for now, and would
364 // be counted incorrectly.
365 // This may change in the future as we add additional instruction
366 // types.
367 if (User->getNumOperands() != 2)
368 continue;
369
370 // If this is a sign-extended 8-bit integer immediate used in an ALU
371 // instruction, there is probably an opcode encoding to save space.
372 auto *C = dyn_cast<ConstantSDNode>(N);
373 if (C && isInt<8>(C->getSExtValue()))
374 continue;
375
376 // Immediates that are used for offsets as part of stack
377 // manipulation should be left alone. These are typically
378 // used to indicate SP offsets for argument passing and
379 // will get pulled into stores/pushes (implicitly).
380 if (User->getOpcode() == X86ISD::ADD ||
381 User->getOpcode() == ISD::ADD ||
382 User->getOpcode() == X86ISD::SUB ||
383 User->getOpcode() == ISD::SUB) {
384
385 // Find the other operand of the add/sub.
386 SDValue OtherOp = User->getOperand(0);
387 if (OtherOp.getNode() == N)
388 OtherOp = User->getOperand(1);
389
390 // Don't count if the other operand is SP.
391 RegisterSDNode *RegNode;
392 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
393 (RegNode = dyn_cast_or_null<RegisterSDNode>(
394 OtherOp->getOperand(1).getNode())))
395 if ((RegNode->getReg() == X86::ESP) ||
396 (RegNode->getReg() == X86::RSP))
397 continue;
398 }
399
400 // ... otherwise, count this and move on.
401 UseCount++;
402 }
403
404 // If we have more than 1 use, then recommend for hoisting.
405 return (UseCount > 1);
406 }
407
408 /// Return a target constant with the specified value of type i8.
409 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
410 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
411 }
412
413 /// Return a target constant with the specified value, of type i32.
414 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
415 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
416 }
417
418 /// Return a target constant with the specified value, of type i64.
419 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
420 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
421 }
422
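 /// Compute the subvector lane immediate for a VEXTRACT-style node. For
 /// example, extracting elements [4,8) of a v8i32 source with VecWidth == 128
 /// gives (4 * 32) / 128 = 1, i.e. the upper 128-bit lane.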
423 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
424 const SDLoc &DL) {
425 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
426 uint64_t Index = N->getConstantOperandVal(1);
427 MVT VecVT = N->getOperand(0).getSimpleValueType();
428 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
429 }
430
431 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
432 const SDLoc &DL) {
433 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
434 uint64_t Index = N->getConstantOperandVal(2);
435 MVT VecVT = N->getSimpleValueType(0);
436 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
437 }
438
439 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
440 const SDLoc &DL) {
441 assert(VecWidth == 128 && "Unexpected vector width");
442 uint64_t Index = N->getConstantOperandVal(2);
443 MVT VecVT = N->getSimpleValueType(0);
444 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
445 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
446 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
447 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
448 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
449 }
450
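 /// Materialize the incoming carry flag into a register: emit MOV32r0
 /// followed by SBB of that zero against itself, so the result is all-ones
 /// when the carry is set and zero otherwise; for i64 results the zero is
 /// first widened with SUBREG_TO_REG.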
451 SDValue getSBBZero(SDNode *N) {
452 SDLoc dl(N);
453 MVT VT = N->getSimpleValueType(0);
454
455 // Create zero.
456 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
457 SDValue Zero = SDValue(
458 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
459 if (VT == MVT::i64) {
460 Zero = SDValue(
461 CurDAG->getMachineNode(
462 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
463 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
464 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
465 0);
466 }
467
468 // Copy flags to the EFLAGS register and glue it to next node.
469 unsigned Opcode = N->getOpcode();
470 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
471 "Unexpected opcode for SBB materialization");
472 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
473 SDValue EFLAGS =
474 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
475 N->getOperand(FlagOpIndex), SDValue());
476
477 // Create a 64-bit instruction if the result is 64-bits otherwise use the
478 // 32-bit version.
479 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
480 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
481 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
482 return SDValue(
483 CurDAG->getMachineNode(Opc, dl, VTs,
484 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
485 0);
486 }
487
488 // Helper to detect unneeded AND instructions on shift amounts. Called
489 // from PatFrags in tablegen.
490 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
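 // For example, (and %amt, 31) on a 32-bit shift amount (Width == 5) is
 // unneeded: the shift only uses the low 5 bits of the count anyway.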
491 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
492 const APInt &Val = N->getConstantOperandAPInt(1);
493
494 if (Val.countr_one() >= Width)
495 return true;
496
497 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
498 return Mask.countr_one() >= Width;
499 }
500
501 /// Return an SDNode that returns the value of the global base register.
502 /// Output instructions required to initialize the global base register,
503 /// if necessary.
504 SDNode *getGlobalBaseReg();
505
506 /// Return a reference to the TargetMachine, casted to the target-specific
507 /// type.
508 const X86TargetMachine &getTargetMachine() const {
509 return static_cast<const X86TargetMachine &>(TM);
510 }
511
512 /// Return a reference to the TargetInstrInfo, casted to the target-specific
513 /// type.
514 const X86InstrInfo *getInstrInfo() const {
515 return Subtarget->getInstrInfo();
516 }
517
518 /// Return a condition code of the given SDNode
519 X86::CondCode getCondFromNode(SDNode *N) const;
520
521 /// Address-mode matching performs shift-of-and to and-of-shift
522 /// reassociation in order to expose more scaled addressing
523 /// opportunities.
524 bool ComplexPatternFuncMutatesDAG() const override {
525 return true;
526 }
527
528 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
529
530 // Indicates we should prefer to use a non-temporal load for this load.
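 // Non-temporal vector loads map to (V)MOVNTDQA, which needs SSE4.1 for
 // 16-byte, AVX2 for 32-byte and AVX-512 for 64-byte accesses; smaller or
 // under-aligned loads just use a normal load.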
531 bool useNonTemporalLoad(LoadSDNode *N) const {
532 if (!N->isNonTemporal())
533 return false;
534
535 unsigned StoreSize = N->getMemoryVT().getStoreSize();
536
537 if (N->getAlign().value() < StoreSize)
538 return false;
539
540 switch (StoreSize) {
541 default: llvm_unreachable("Unsupported store size");
542 case 4:
543 case 8:
544 return false;
545 case 16:
546 return Subtarget->hasSSE41();
547 case 32:
548 return Subtarget->hasAVX2();
549 case 64:
550 return Subtarget->hasAVX512();
551 }
552 }
553
554 bool foldLoadStoreIntoMemOperand(SDNode *Node);
555 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
556 bool matchBitExtract(SDNode *Node);
557 bool shrinkAndImmediate(SDNode *N);
558 bool isMaskZeroExtended(SDNode *N) const;
559 bool tryShiftAmountMod(SDNode *N);
560 bool tryShrinkShlLogicImm(SDNode *N);
561 bool tryVPTERNLOG(SDNode *N);
562 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
563 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
564 uint8_t Imm);
565 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
566 bool tryMatchBitSelect(SDNode *N);
567
568 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
569 const SDLoc &dl, MVT VT, SDNode *Node);
570 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
571 const SDLoc &dl, MVT VT, SDNode *Node,
572 SDValue &InGlue);
573
574 bool tryOptimizeRem8Extend(SDNode *N);
575
576 bool onlyUsesZeroFlag(SDValue Flags) const;
577 bool hasNoSignFlagUses(SDValue Flags) const;
578 bool hasNoCarryFlagUses(SDValue Flags) const;
579 };
580}
581
582char X86DAGToDAGISel::ID = 0;
583
584INITIALIZE_PASS(X86DAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
585
586// Returns true if this masked compare can be implemented legally with this
587// type.
588static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
589 unsigned Opcode = N->getOpcode();
590 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
591 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
592 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
593 // We can get 256-bit 8 element types here without VLX being enabled. When
594 // this happens we will use 512-bit operations and the mask will not be
595 // zero extended.
596 EVT OpVT = N->getOperand(0).getValueType();
597 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
598 // second operand.
599 if (Opcode == X86ISD::STRICT_CMPM)
600 OpVT = N->getOperand(1).getValueType();
601 if (OpVT.is256BitVector() || OpVT.is128BitVector())
602 return Subtarget->hasVLX();
603
604 return true;
605 }
606 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
607 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
608 Opcode == X86ISD::FSETCCM_SAE)
609 return true;
610
611 return false;
612}
613
614// Returns true if we can assume the writer of the mask has zero extended it
615// for us.
616bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
617 // If this is an AND, check if we have a compare on either side. As long as
618 // one side guarantees the mask is zero extended, the AND will preserve those
619 // zeros.
620 if (N->getOpcode() == ISD::AND)
621 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
622 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
623
624 return isLegalMaskCompare(N, Subtarget);
625}
626
627bool
628X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
629 if (OptLevel == CodeGenOptLevel::None)
630 return false;
631
632 if (!N.hasOneUse())
633 return false;
634
635 if (N.getOpcode() != ISD::LOAD)
636 return true;
637
638 // Don't fold non-temporal loads if we have an instruction for them.
639 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
640 return false;
641
642 // If N is a load, do additional profitability checks.
643 if (U == Root) {
644 switch (U->getOpcode()) {
645 default: break;
646 case X86ISD::ADD:
647 case X86ISD::ADC:
648 case X86ISD::SUB:
649 case X86ISD::SBB:
650 case X86ISD::AND:
651 case X86ISD::XOR:
652 case X86ISD::OR:
653 case ISD::ADD:
654 case ISD::UADDO_CARRY:
655 case ISD::AND:
656 case ISD::OR:
657 case ISD::XOR: {
658 SDValue Op1 = U->getOperand(1);
659
660 // If the other operand is an 8-bit immediate we should fold the immediate
661 // instead. This reduces code size.
662 // e.g.
663 // movl 4(%esp), %eax
664 // addl $4, %eax
665 // vs.
666 // movl $4, %eax
667 // addl 4(%esp), %eax
668 // The former is 2 bytes shorter. In the case where the increment is 1,
669 // the saving can be 4 bytes (by using incl %eax).
670 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
671 if (Imm->getAPIntValue().isSignedIntN(8))
672 return false;
673
674 // If this is a 64-bit AND with an immediate that fits in 32-bits,
675 // prefer using the smaller AND over folding the load. This is needed to
676 // make sure immediates created by shrinkAndImmediate are always folded.
677 // Ideally we would narrow the load during DAG combine and get the
678 // best of both worlds.
679 if (U->getOpcode() == ISD::AND &&
680 Imm->getAPIntValue().getBitWidth() == 64 &&
681 Imm->getAPIntValue().isIntN(32))
682 return false;
683
684 // If this is really a zext_inreg that can be represented with a movzx
685 // instruction, prefer that.
686 // TODO: We could shrink the load and fold if it is non-volatile.
687 if (U->getOpcode() == ISD::AND &&
688 (Imm->getAPIntValue() == UINT8_MAX ||
689 Imm->getAPIntValue() == UINT16_MAX ||
690 Imm->getAPIntValue() == UINT32_MAX))
691 return false;
692
693 // ADD/SUB can negate the immediate and use the opposite operation
694 // to fit 128 into a sign extended 8 bit immediate.
695 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
696 (-Imm->getAPIntValue()).isSignedIntN(8))
697 return false;
698
699 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
700 (-Imm->getAPIntValue()).isSignedIntN(8) &&
701 hasNoCarryFlagUses(SDValue(U, 1)))
702 return false;
703 }
704
705 // If the other operand is a TLS address, we should fold it instead.
706 // This produces
707 // movl %gs:0, %eax
708 // leal i@NTPOFF(%eax), %eax
709 // instead of
710 // movl $i@NTPOFF, %eax
711 // addl %gs:0, %eax
712 // if the block also has an access to a second TLS address this will save
713 // a load.
714 // FIXME: This is probably also true for non-TLS addresses.
715 if (Op1.getOpcode() == X86ISD::Wrapper) {
716 SDValue Val = Op1.getOperand(0);
717 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
718 return false;
719 }
720
721 // Don't fold load if this matches the BTS/BTR/BTC patterns.
722 // BTS: (or X, (shl 1, n))
723 // BTR: (and X, (rotl -2, n))
724 // BTC: (xor X, (shl 1, n))
725 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
726 if (U->getOperand(0).getOpcode() == ISD::SHL &&
727 isOneConstant(U->getOperand(0).getOperand(0)))
728 return false;
729
730 if (U->getOperand(1).getOpcode() == ISD::SHL &&
731 isOneConstant(U->getOperand(1).getOperand(0)))
732 return false;
733 }
734 if (U->getOpcode() == ISD::AND) {
735 SDValue U0 = U->getOperand(0);
736 SDValue U1 = U->getOperand(1);
737 if (U0.getOpcode() == ISD::ROTL) {
738 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
739 if (C && C->getSExtValue() == -2)
740 return false;
741 }
742
743 if (U1.getOpcode() == ISD::ROTL) {
744 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
745 if (C && C->getSExtValue() == -2)
746 return false;
747 }
748 }
749
750 break;
751 }
752 case ISD::SHL:
753 case ISD::SRA:
754 case ISD::SRL:
755 // Don't fold a load into a shift by immediate. The BMI2 instructions
756 // support folding a load, but not an immediate. The legacy instructions
757 // support folding an immediate, but can't fold a load. Folding an
758 // immediate is preferable to folding a load.
759 if (isa<ConstantSDNode>(U->getOperand(1)))
760 return false;
761
762 break;
763 }
764 }
765
766 // Prevent folding a load if this can be implemented with an insert_subreg or
767 // a move that implicitly zeroes.
768 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
769 isNullConstant(Root->getOperand(2)) &&
770 (Root->getOperand(0).isUndef() ||
771 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
772 return false;
773
774 return true;
775}
776
777// Indicates it is profitable to form an AVX512 masked operation. Returning
778 // false will favor a masked register-register move or vblendm and the
779// operation will be selected separately.
780bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
781 assert(
782 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
783 "Unexpected opcode!");
784
785 // If the operation has additional users, the operation will be duplicated.
786 // Check the use count to prevent that.
787 // FIXME: Are there cheap opcodes we might want to duplicate?
788 return N->getOperand(1).hasOneUse();
789}
790
791/// Replace the original chain operand of the call with
792/// load's chain operand and move load below the call's chain operand.
793static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
794 SDValue Call, SDValue OrigChain) {
795 SmallVector<SDValue, 8> Ops;
796 SDValue Chain = OrigChain.getOperand(0);
797 if (Chain.getNode() == Load.getNode())
798 Ops.push_back(Load.getOperand(0));
799 else {
800 assert(Chain.getOpcode() == ISD::TokenFactor &&
801 "Unexpected chain operand");
802 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
803 if (Chain.getOperand(i).getNode() == Load.getNode())
804 Ops.push_back(Load.getOperand(0));
805 else
806 Ops.push_back(Chain.getOperand(i));
807 SDValue NewChain =
808 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
809 Ops.clear();
810 Ops.push_back(NewChain);
811 }
812 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
813 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
814 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
815 Load.getOperand(1), Load.getOperand(2));
816
817 Ops.clear();
818 Ops.push_back(SDValue(Load.getNode(), 1));
819 Ops.append(Call->op_begin() + 1, Call->op_end());
820 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
821}
822
823/// Return true if call address is a load and it can be
824/// moved below CALLSEQ_START and the chains leading up to the call.
825/// Return the CALLSEQ_START by reference as a second output.
826/// In the case of a tail call, there isn't a callseq node between the call
827/// chain and the load.
828static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
829 // The transformation is somewhat dangerous if the call's chain was glued to
830 // the call. After MoveBelowOrigChain the load is moved between the call and
831 // the chain, this can create a cycle if the load is not folded. So it is
832 // *really* important that we are sure the load will be folded.
833 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
834 return false;
835 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
836 if (!LD ||
837 !LD->isSimple() ||
838 LD->getAddressingMode() != ISD::UNINDEXED ||
839 LD->getExtensionType() != ISD::NON_EXTLOAD)
840 return false;
841
842 // Now let's find the callseq_start.
843 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
844 if (!Chain.hasOneUse())
845 return false;
846 Chain = Chain.getOperand(0);
847 }
848
849 if (!Chain.getNumOperands())
850 return false;
851 // Since we are not checking for AA here, conservatively abort if the chain
852 // writes to memory. It's not safe to move the callee (a load) across a store.
853 if (isa<MemSDNode>(Chain.getNode()) &&
854 cast<MemSDNode>(Chain.getNode())->writeMem())
855 return false;
856 if (Chain.getOperand(0).getNode() == Callee.getNode())
857 return true;
858 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
859 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
860 Callee.getValue(1).hasOneUse())
861 return true;
862 return false;
863}
864
865static bool isEndbrImm64(uint64_t Imm) {
866// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
867// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
868 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
869 return false;
870
871 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
872 0x65, 0x66, 0x67, 0xf0, 0xf2};
873 int i = 24; // The low 24 bits (0x0F1EFA) have already been matched.
874 while (i < 64) {
875 uint8_t Byte = (Imm >> i) & 0xFF;
876 if (Byte == 0xF3)
877 return true;
878 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
879 return false;
880 i += 8;
881 }
882
883 return false;
884}
885
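 // v32i16, v32f16 and v64i8 are only legal with AVX512BW; the preprocessing
 // below uses this to decide when a broadcast of these types must be
 // emulated with two half-width operations stitched by INSERT_SUBVECTOR.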
886static bool needBWI(MVT VT) {
887 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
888}
889
890void X86DAGToDAGISel::PreprocessISelDAG() {
891 bool MadeChange = false;
892 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
893 E = CurDAG->allnodes_end(); I != E; ) {
894 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
895
896 // This is for CET enhancement.
897 //
898 // ENDBR32 and ENDBR64 have specific opcodes:
899 // ENDBR32: F3 0F 1E FB
900 // ENDBR64: F3 0F 1E FA
901 // We want to make sure that attackers cannot find unintended ENDBR32/64
902 // opcode matches in the binary.
903 // Here's an example:
904 // If the compiler had to generate asm for the following code:
905 // a = 0xF30F1EFA
906 // it could, for example, generate:
907 // mov 0xF30F1EFA, dword ptr[a]
908 // In such a case, the binary would include a gadget that starts
909 // with a fake ENDBR64 opcode. Therefore, we split such generation
910 // into multiple operations so that it does not show up in the binary.
911 if (N->getOpcode() == ISD::Constant) {
912 MVT VT = N->getSimpleValueType(0);
913 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
914 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
915 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
916 // Check that the cf-protection-branch is enabled.
917 Metadata *CFProtectionBranch =
918 MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
919 if (CFProtectionBranch || IndirectBranchTracking) {
920 SDLoc dl(N);
921 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
922 Complement = CurDAG->getNOT(dl, Complement, VT);
923 --I;
924 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
925 ++I;
926 MadeChange = true;
927 continue;
928 }
929 }
930 }
931
932 // If this is a target specific AND node with no flag usages, turn it back
933 // into ISD::AND to enable test instruction matching.
934 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
935 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
936 N->getOperand(0), N->getOperand(1));
937 --I;
938 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
939 ++I;
940 MadeChange = true;
941 continue;
942 }
943
944 // Convert vector increment or decrement to sub/add with an all-ones
945 // constant:
946 // add X, <1, 1...> --> sub X, <-1, -1...>
947 // sub X, <1, 1...> --> add X, <-1, -1...>
948 // The all-ones vector constant can be materialized using a pcmpeq
949 // instruction that is commonly recognized as an idiom (has no register
950 // dependency), so that's better/smaller than loading a splat 1 constant.
951 //
952 // But don't do this if it would inhibit a potentially profitable load
953 // folding opportunity for the other operand. That only occurs with the
954 // intersection of:
955 // (1) The other operand (op0) is load foldable.
956 // (2) The op is an add (otherwise, we are *creating* an add and can still
957 // load fold the other op).
958 // (3) The target has AVX (otherwise, we have a destructive add and can't
959 // load fold the other op without killing the constant op).
960 // (4) The constant 1 vector has multiple uses (so it is profitable to load
961 // into a register anyway).
962 auto mayPreventLoadFold = [&]() {
963 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
964 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
965 !N->getOperand(1).hasOneUse();
966 };
967 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
968 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
969 APInt SplatVal;
970 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
971 SplatVal.isOne()) {
972 SDLoc DL(N);
973
974 MVT VT = N->getSimpleValueType(0);
975 unsigned NumElts = VT.getSizeInBits() / 32;
976 SDValue AllOnes =
977 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
978 AllOnes = CurDAG->getBitcast(VT, AllOnes);
979
980 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
981 SDValue Res =
982 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
983 --I;
984 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
985 ++I;
986 MadeChange = true;
987 continue;
988 }
989 }
990
991 switch (N->getOpcode()) {
992 case X86ISD::VBROADCAST: {
993 MVT VT = N->getSimpleValueType(0);
994 // Emulate v32i16/v64i8 broadcast without BWI.
995 if (!Subtarget->hasBWI() && needBWI(VT)) {
996 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
997 SDLoc dl(N);
998 SDValue NarrowBCast =
999 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1000 SDValue Res =
1001 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1002 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1003 unsigned Index = NarrowVT.getVectorMinNumElements();
1004 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1005 CurDAG->getIntPtrConstant(Index, dl));
1006
1007 --I;
1008 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1009 ++I;
1010 MadeChange = true;
1011 continue;
1012 }
1013
1014 break;
1015 }
1016 case X86ISD::VBROADCAST_LOAD: {
1017 MVT VT = N->getSimpleValueType(0);
1018 // Emulate v32i16/v64i8 broadcast without BWI.
1019 if (!Subtarget->hasBWI() && needBWI(VT)) {
1020 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1021 auto *MemNode = cast<MemSDNode>(N);
1022 SDLoc dl(N);
1023 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1024 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1025 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1026 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1027 MemNode->getMemOperand());
1028 SDValue Res =
1029 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1030 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1031 unsigned Index = NarrowVT.getVectorMinNumElements();
1032 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1033 CurDAG->getIntPtrConstant(Index, dl));
1034
1035 --I;
1036 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1037 CurDAG->ReplaceAllUsesWith(N, To);
1038 ++I;
1039 MadeChange = true;
1040 continue;
1041 }
1042
1043 break;
1044 }
1045 case ISD::LOAD: {
1046 // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1047 // load, then just extract the lower subvector and avoid the second load.
1048 auto *Ld = cast<LoadSDNode>(N);
1049 MVT VT = N->getSimpleValueType(0);
1050 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1051 !(VT.is128BitVector() || VT.is256BitVector()))
1052 break;
1053
1054 MVT MaxVT = VT;
1055 SDNode *MaxLd = nullptr;
1056 SDValue Ptr = Ld->getBasePtr();
1057 SDValue Chain = Ld->getChain();
1058 for (SDNode *User : Ptr->uses()) {
1059 auto *UserLd = dyn_cast<LoadSDNode>(User);
1060 MVT UserVT = User->getSimpleValueType(0);
1061 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1062 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1063 !User->hasAnyUseOfValue(1) &&
1064 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1065 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1066 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1067 MaxLd = User;
1068 MaxVT = UserVT;
1069 }
1070 }
1071 if (MaxLd) {
1072 SDLoc dl(N);
1073 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1074 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1075 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1076 SDValue(MaxLd, 0),
1077 CurDAG->getIntPtrConstant(0, dl));
1078 SDValue Res = CurDAG->getBitcast(VT, Extract);
1079
1080 --I;
1081 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1082 CurDAG->ReplaceAllUsesWith(N, To);
1083 ++I;
1084 MadeChange = true;
1085 continue;
1086 }
1087 break;
1088 }
1089 case ISD::VSELECT: {
1090 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1091 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1092 if (EleVT == MVT::i1)
1093 break;
1094
1095 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1096 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1097 "We can't replace VSELECT with BLENDV in vXi16!");
1098 SDValue R;
1099 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1100 EleVT.getSizeInBits()) {
1101 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1102 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1103 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1104 } else {
1105 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1106 N->getOperand(0), N->getOperand(1),
1107 N->getOperand(2));
1108 }
1109 --I;
1110 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1111 ++I;
1112 MadeChange = true;
1113 continue;
1114 }
1115 case ISD::FP_ROUND:
1116 case ISD::STRICT_FP_ROUND:
1117 case ISD::FP_TO_SINT:
1118 case ISD::FP_TO_UINT:
1119 case ISD::STRICT_FP_TO_SINT:
1120 case ISD::STRICT_FP_TO_UINT: {
1121 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1122 // don't need 2 sets of patterns.
1123 if (!N->getSimpleValueType(0).isVector())
1124 break;
1125
1126 unsigned NewOpc;
1127 switch (N->getOpcode()) {
1128 default: llvm_unreachable("Unexpected opcode!");
1129 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1130 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1131 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1132 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1133 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1134 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1135 }
1136 SDValue Res;
1137 if (N->isStrictFPOpcode())
1138 Res =
1139 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1140 {N->getOperand(0), N->getOperand(1)});
1141 else
1142 Res =
1143 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1144 N->getOperand(0));
1145 --I;
1146 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1147 ++I;
1148 MadeChange = true;
1149 continue;
1150 }
1151 case ISD::SHL:
1152 case ISD::SRA:
1153 case ISD::SRL: {
1154 // Replace vector shifts with their X86 specific equivalent so we don't
1155 // need 2 sets of patterns.
1156 if (!N->getValueType(0).isVector())
1157 break;
1158
1159 unsigned NewOpc;
1160 switch (N->getOpcode()) {
1161 default: llvm_unreachable("Unexpected opcode!");
1162 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1163 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1164 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1165 }
1166 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1167 N->getOperand(0), N->getOperand(1));
1168 --I;
1169 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1170 ++I;
1171 MadeChange = true;
1172 continue;
1173 }
1174 case ISD::ANY_EXTEND:
1175 case ISD::ANY_EXTEND_VECTOR_INREG: {
1176 // Replace vector any extend with the zero extend equivalents so we don't
1177 // need 2 sets of patterns. Ignore vXi1 extensions.
1178 if (!N->getValueType(0).isVector())
1179 break;
1180
1181 unsigned NewOpc;
1182 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1183 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1184 "Unexpected opcode for mask vector!");
1185 NewOpc = ISD::SIGN_EXTEND;
1186 } else {
1187 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1188 ? ISD::ZERO_EXTEND
1189 : ISD::ZERO_EXTEND_VECTOR_INREG;
1190 }
1191
1192 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1193 N->getOperand(0));
1194 --I;
1195 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1196 ++I;
1197 MadeChange = true;
1198 continue;
1199 }
1200 case ISD::FCEIL:
1201 case ISD::STRICT_FCEIL:
1202 case ISD::FFLOOR:
1203 case ISD::STRICT_FFLOOR:
1204 case ISD::FTRUNC:
1205 case ISD::STRICT_FTRUNC:
1206 case ISD::FROUNDEVEN:
1207 case ISD::STRICT_FROUNDEVEN:
1208 case ISD::FNEARBYINT:
1209 case ISD::STRICT_FNEARBYINT:
1210 case ISD::FRINT:
1211 case ISD::STRICT_FRINT: {
1212 // Replace fp rounding with their X86 specific equivalent so we don't
1213 // need 2 sets of patterns.
1214 unsigned Imm;
1215 switch (N->getOpcode()) {
1216 default: llvm_unreachable("Unexpected opcode!");
1217 case ISD::STRICT_FCEIL:
1218 case ISD::FCEIL: Imm = 0xA; break;
1219 case ISD::STRICT_FFLOOR:
1220 case ISD::FFLOOR: Imm = 0x9; break;
1221 case ISD::STRICT_FTRUNC:
1222 case ISD::FTRUNC: Imm = 0xB; break;
1223 case ISD::STRICT_FROUNDEVEN:
1224 case ISD::FROUNDEVEN: Imm = 0x8; break;
1225 case ISD::STRICT_FNEARBYINT:
1226 case ISD::FNEARBYINT: Imm = 0xC; break;
1227 case ISD::STRICT_FRINT:
1228 case ISD::FRINT: Imm = 0x4; break;
1229 }
1230 SDLoc dl(N);
1231 bool IsStrict = N->isStrictFPOpcode();
1232 SDValue Res;
1233 if (IsStrict)
1234 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1235 {N->getValueType(0), MVT::Other},
1236 {N->getOperand(0), N->getOperand(1),
1237 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1238 else
1239 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1240 N->getOperand(0),
1241 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1242 --I;
1243 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1244 ++I;
1245 MadeChange = true;
1246 continue;
1247 }
1248 case X86ISD::FANDN:
1249 case X86ISD::FAND:
1250 case X86ISD::FOR:
1251 case X86ISD::FXOR: {
1252 // Widen scalar fp logic ops to vector to reduce isel patterns.
1253 // FIXME: Can we do this during lowering/combine.
1254 MVT VT = N->getSimpleValueType(0);
1255 if (VT.isVector() || VT == MVT::f128)
1256 break;
1257
1258 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1259 : VT == MVT::f32 ? MVT::v4f32
1260 : MVT::v8f16;
1261
1262 SDLoc dl(N);
1263 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1264 N->getOperand(0));
1265 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1266 N->getOperand(1));
1267
1268 SDValue Res;
1269 if (Subtarget->hasSSE2()) {
1270 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1271 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1272 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1273 unsigned Opc;
1274 switch (N->getOpcode()) {
1275 default: llvm_unreachable("Unexpected opcode!");
1276 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1277 case X86ISD::FAND: Opc = ISD::AND; break;
1278 case X86ISD::FOR: Opc = ISD::OR; break;
1279 case X86ISD::FXOR: Opc = ISD::XOR; break;
1280 }
1281 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1282 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1283 } else {
1284 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1285 }
1286 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1287 CurDAG->getIntPtrConstant(0, dl));
1288 --I;
1289 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1290 ++I;
1291 MadeChange = true;
1292 continue;
1293 }
1294 }
1295
1296 if (OptLevel != CodeGenOptLevel::None &&
1297 // Only do this when the target can fold the load into the call or
1298 // jmp.
1299 !Subtarget->useIndirectThunkCalls() &&
1300 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1301 (N->getOpcode() == X86ISD::TC_RETURN &&
1302 (Subtarget->is64Bit() ||
1303 !getTargetMachine().isPositionIndependent())))) {
1304 /// Also try moving call address load from outside callseq_start to just
1305 /// before the call to allow it to be folded.
1306 ///
1307 /// [Load chain]
1308 /// ^
1309 /// |
1310 /// [Load]
1311 /// ^ ^
1312 /// | |
1313 /// / \--
1314 /// / |
1315 ///[CALLSEQ_START] |
1316 /// ^ |
1317 /// | |
1318 /// [LOAD/C2Reg] |
1319 /// | |
1320 /// \ /
1321 /// \ /
1322 /// [CALL]
1323 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1324 SDValue Chain = N->getOperand(0);
1325 SDValue Load = N->getOperand(1);
1326 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1327 continue;
1328 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1329 ++NumLoadMoved;
1330 MadeChange = true;
1331 continue;
1332 }
1333
1334 // Lower fpround and fpextend nodes that target the FP stack to be a store and
1335 // load to the stack. This is a gross hack. We would like to simply mark
1336 // these as being illegal, but when we do that, legalize produces these when
1337 // it expands calls, then expands these in the same legalize pass. We would
1338 // like dag combine to be able to hack on these between the call expansion
1339 // and the node legalization. As such this pass basically does "really
1340 // late" legalization of these inline with the X86 isel pass.
1341 // FIXME: This should only happen when not compiled with -O0.
1342 switch (N->getOpcode()) {
1343 default: continue;
1344 case ISD::FP_ROUND:
1345 case ISD::FP_EXTEND:
1346 {
1347 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1348 MVT DstVT = N->getSimpleValueType(0);
1349
1350 // If any of the sources are vectors, no fp stack involved.
1351 if (SrcVT.isVector() || DstVT.isVector())
1352 continue;
1353
1354 // If the source and destination are SSE registers, then this is a legal
1355 // conversion that should not be lowered.
1356 const X86TargetLowering *X86Lowering =
1357 static_cast<const X86TargetLowering *>(TLI);
1358 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1359 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1360 if (SrcIsSSE && DstIsSSE)
1361 continue;
1362
1363 if (!SrcIsSSE && !DstIsSSE) {
1364 // If this is an FPStack extension, it is a noop.
1365 if (N->getOpcode() == ISD::FP_EXTEND)
1366 continue;
1367 // If this is a value-preserving FPStack truncation, it is a noop.
1368 if (N->getConstantOperandVal(1))
1369 continue;
1370 }
1371
1372 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1373 // FPStack has extload and truncstore. SSE can fold direct loads into other
1374 // operations. Based on this, decide what we want to do.
1375 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1376 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1377 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1378 MachinePointerInfo MPI =
1379 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1380 SDLoc dl(N);
1381
1382 // FIXME: optimize the case where the src/dest is a load or store?
1383
1384 SDValue Store = CurDAG->getTruncStore(
1385 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1386 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1387 MemTmp, MPI, MemVT);
1388
1389 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1390 // extload we created. This will cause general havoc on the dag because
1391 // anything below the conversion could be folded into other existing nodes.
1392 // To avoid invalidating 'I', back it up to the convert node.
1393 --I;
1394 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1395 break;
1396 }
1397
1398 //The sequence of events for lowering STRICT_FP versions of these nodes requires
1399 //dealing with the chain differently, as there is already a preexisting chain.
1400 case ISD::STRICT_FP_ROUND:
1401 case ISD::STRICT_FP_EXTEND:
1402 {
1403 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1404 MVT DstVT = N->getSimpleValueType(0);
1405
1406 // If any of the sources are vectors, no fp stack involved.
1407 if (SrcVT.isVector() || DstVT.isVector())
1408 continue;
1409
1410 // If the source and destination are SSE registers, then this is a legal
1411 // conversion that should not be lowered.
1412 const X86TargetLowering *X86Lowering =
1413 static_cast<const X86TargetLowering *>(TLI);
1414 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1415 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1416 if (SrcIsSSE && DstIsSSE)
1417 continue;
1418
1419 if (!SrcIsSSE && !DstIsSSE) {
1420 // If this is an FPStack extension, it is a noop.
1421 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1422 continue;
1423 // If this is a value-preserving FPStack truncation, it is a noop.
1424 if (N->getConstantOperandVal(2))
1425 continue;
1426 }
1427
1428 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1429 // FPStack has extload and truncstore. SSE can fold direct loads into other
1430 // operations. Based on this, decide what we want to do.
1431 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1432 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1433 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1434 MachinePointerInfo MPI =
1435 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1436 SDLoc dl(N);
1437
1438 // FIXME: optimize the case where the src/dest is a load or store?
1439
1440 //Since the operation is StrictFP, use the preexisting chain.
1441 SDValue Store, Result;
1442 if (!SrcIsSSE) {
1443 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1444 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1445 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1446 MPI, /*Align*/ std::nullopt,
1447 MachineMemOperand::MOStore);
1448 if (N->getFlags().hasNoFPExcept()) {
1449 SDNodeFlags Flags = Store->getFlags();
1450 Flags.setNoFPExcept(true);
1451 Store->setFlags(Flags);
1452 }
1453 } else {
1454 assert(SrcVT == MemVT && "Unexpected VT!");
1455 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1456 MPI);
1457 }
1458
1459 if (!DstIsSSE) {
1460 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1461 SDValue Ops[] = {Store, MemTmp};
1462 Result = CurDAG->getMemIntrinsicNode(
1463 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1464 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1465 if (N->getFlags().hasNoFPExcept()) {
1466 SDNodeFlags Flags = Result->getFlags();
1467 Flags.setNoFPExcept(true);
1468 Result->setFlags(Flags);
1469 }
1470 } else {
1471 assert(DstVT == MemVT && "Unexpected VT!");
1472 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1473 }
1474
1475 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1476 // extload we created. This will cause general havoc on the dag because
1477 // anything below the conversion could be folded into other existing nodes.
1478 // To avoid invalidating 'I', back it up to the convert node.
1479 --I;
1480 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1481 break;
1482 }
1483 }
1484
1485
1486 // Now that we did that, the node is dead. Increment the iterator to the
1487 // next node to process, then delete N.
1488 ++I;
1489 MadeChange = true;
1490 }
1491
1492 // Remove any dead nodes that may have been left behind.
1493 if (MadeChange)
1494 CurDAG->RemoveDeadNodes();
1495}
1496
1497// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1498bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1499 unsigned Opc = N->getMachineOpcode();
1500 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1501 Opc != X86::MOVSX64rr8)
1502 return false;
1503
1504 SDValue N0 = N->getOperand(0);
1505
1506 // We need to be extracting the lower 8 bits of an extend.
1507 if (!N0.isMachineOpcode() ||
1508 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1509 N0.getConstantOperandVal(1) != X86::sub_8bit)
1510 return false;
1511
1512 // We're looking for either a movsx or movzx to match the original opcode.
1513 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1514 : X86::MOVSX32rr8_NOREX;
1515 SDValue N00 = N0.getOperand(0);
1516 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1517 return false;
1518
1519 if (Opc == X86::MOVSX64rr8) {
1520 // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1521 // to 64.
1522 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1523 MVT::i64, N00);
1524 ReplaceUses(N, Extend);
1525 } else {
1526 // Ok we can drop this extend and just use the original extend.
1527 ReplaceUses(N, N00.getNode());
1528 }
1529
1530 return true;
1531}
1532
1533void X86DAGToDAGISel::PostprocessISelDAG() {
1534 // Skip peepholes at -O0.
1535 if (TM.getOptLevel() == CodeGenOptLevel::None)
1536 return;
1537
1538 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1539
1540 bool MadeChange = false;
1541 while (Position != CurDAG->allnodes_begin()) {
1542 SDNode *N = &*--Position;
1543 // Skip dead nodes and any non-machine opcodes.
1544 if (N->use_empty() || !N->isMachineOpcode())
1545 continue;
1546
1547 if (tryOptimizeRem8Extend(N)) {
1548 MadeChange = true;
1549 continue;
1550 }
1551
1552 unsigned Opc = N->getMachineOpcode();
1553 switch (Opc) {
1554 default:
1555 continue;
1556 // TESTrr+ANDrr/rm -> TESTrr/TESTmr
1557 case X86::TEST8rr:
1558 case X86::TEST16rr:
1559 case X86::TEST32rr:
1560 case X86::TEST64rr: {
1561 auto &Op0 = N->getOperand(0);
1562 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1563 !Op0.isMachineOpcode())
1564 continue;
1565 SDValue And = N->getOperand(0);
1566#define CASE_ND(OP) \
1567 case X86::OP: \
1568 case X86::OP##_ND:
1569 switch (And.getMachineOpcode()) {
1570 default:
1571 continue;
1572 CASE_ND(AND8rr)
1573 CASE_ND(AND16rr)
1574 CASE_ND(AND32rr)
1575 CASE_ND(AND64rr) {
1576 if (And->hasAnyUseOfValue(1))
1577 continue;
1578 MachineSDNode *Test = CurDAG->getMachineNode(
1579 Opc, SDLoc(N), MVT::i32, And.getOperand(0), And.getOperand(1));
1580 ReplaceUses(N, Test);
1581 MadeChange = true;
1582 continue;
1583 }
1584 CASE_ND(AND8rm)
1585 CASE_ND(AND16rm)
1586 CASE_ND(AND32rm)
1587 CASE_ND(AND64rm) {
1588 if (And->hasAnyUseOfValue(1))
1589 continue;
1590 unsigned NewOpc;
1591#define FROM_TO(A, B) \
1592 CASE_ND(A) NewOpc = X86::B; \
1593 break;
1594 switch (And.getMachineOpcode()) {
1595 FROM_TO(AND8rm, TEST8mr);
1596 FROM_TO(AND16rm, TEST16mr);
1597 FROM_TO(AND32rm, TEST32mr);
1598 FROM_TO(AND64rm, TEST64mr);
1599 }
1600#undef FROM_TO
1601#undef CASE_ND
1602 // Need to swap the memory and register operand.
1603 SDValue Ops[] = {And.getOperand(1), And.getOperand(2),
1604 And.getOperand(3), And.getOperand(4),
1605 And.getOperand(5), And.getOperand(0),
1606 And.getOperand(6) /* Chain */};
1607 MachineSDNode *Test = CurDAG->getMachineNode(
1608 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1609 CurDAG->setNodeMemRefs(
1610 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1611 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1612 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1613 MadeChange = true;
1614 continue;
1615 }
1616 }
1617 }
1618 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1619 // used. We're doing this late so we can prefer to fold the AND into masked
1620 // comparisons. Doing that can be better for the live range of the mask
1621 // register.
1622 case X86::KORTESTBrr:
1623 case X86::KORTESTWrr:
1624 case X86::KORTESTDrr:
1625 case X86::KORTESTQrr: {
1626 SDValue Op0 = N->getOperand(0);
1627 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1628 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1629 continue;
1630#define CASE(A) \
1631 case X86::A: \
1632 break;
1633 switch (Op0.getMachineOpcode()) {
1634 default:
1635 continue;
1636 CASE(KANDBrr)
1637 CASE(KANDWrr)
1638 CASE(KANDDrr)
1639 CASE(KANDQrr)
1640 }
1641 unsigned NewOpc;
1642#define FROM_TO(A, B) \
1643 case X86::A: \
1644 NewOpc = X86::B; \
1645 break;
1646 switch (Opc) {
1647 FROM_TO(KORTESTBrr, KTESTBrr)
1648 FROM_TO(KORTESTWrr, KTESTWrr)
1649 FROM_TO(KORTESTDrr, KTESTDrr)
1650 FROM_TO(KORTESTQrr, KTESTQrr)
1651 }
1652 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1653 // KAND instructions and KTEST use the same ISA feature.
1654 if (NewOpc == X86::KTESTWrr && !Subtarget->hasDQI())
1655 continue;
1656#undef FROM_TO
1657 MachineSDNode *KTest = CurDAG->getMachineNode(
1658 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1659 ReplaceUses(N, KTest);
1660 MadeChange = true;
1661 continue;
1662 }
1663 // Attempt to remove vector moves that were inserted to zero upper bits.
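 // For example:
 //   (SUBREG_TO_REG 0, (VMOVAPSrr (VADDPSrr %x, %y)), sub_xmm)
 // can drop the move and use the VEX-encoded producing instruction directly:
 //   (SUBREG_TO_REG 0, (VADDPSrr %x, %y), sub_xmm)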
1664 case TargetOpcode::SUBREG_TO_REG: {
1665 unsigned SubRegIdx = N->getConstantOperandVal(2);
1666 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1667 continue;
1668
1669 SDValue Move = N->getOperand(1);
1670 if (!Move.isMachineOpcode())
1671 continue;
1672
1673 // Make sure it's one of the move opcodes we recognize.
1674 switch (Move.getMachineOpcode()) {
1675 default:
1676 continue;
1677 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1678 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1679 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1680 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1681 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1682 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1683 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1684 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1685 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1686 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1687 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1688 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1689 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1690 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1691 }
1692#undef CASE
1693
1694 SDValue In = Move.getOperand(0);
1695 if (!In.isMachineOpcode() ||
1696 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1697 continue;
1698
1699 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1700 // the SHA instructions which use a legacy encoding.
1701 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1702 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1703 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1704 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1705 continue;
1706
1707 // The producing instruction is another vector instruction, so we can drop
1708 // the move.
1709 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1710 MadeChange = true;
1711 }
1712 }
1713 }
1714
1715 if (MadeChange)
1716 CurDAG->RemoveDeadNodes();
1717}
1718
1719
1720/// Emit any code that needs to be executed only in the main function.
1721void X86DAGToDAGISel::emitSpecialCodeForMain() {
1722 if (Subtarget->isTargetCygMing()) {
1723 TargetLowering::ArgListTy Args;
1724 auto &DL = CurDAG->getDataLayout();
1725
1726 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1727 CLI.setChain(CurDAG->getRoot())
1728 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1729 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1730 std::move(Args));
1731 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1732 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1733 CurDAG->setRoot(Result.second);
1734 }
1735}
1736
1737void X86DAGToDAGISel::emitFunctionEntryCode() {
1738 // If this is main, emit special code for main.
1739 const Function &F = MF->getFunction();
1740 if (F.hasExternalLinkage() && F.getName() == "main")
1741 emitSpecialCodeForMain();
1742}
1743
1744static bool isDispSafeForFrameIndex(int64_t Val) {
1745 // On 64-bit platforms, we can run into an issue where a frame index
1746 // includes a displacement that, when added to the explicit displacement,
1747 // will overflow the displacement field. Assuming that the frame index
1748 // displacement fits into a 31-bit integer (which is only slightly more
1749 // aggressive than the current fundamental assumption that it fits into
1750 // a 32-bit integer), a 31-bit disp should always be safe.
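 // For example, with both the frame-index displacement and the explicit
 // displacement limited to 31 bits, their sum stays below 2^31 and still
 // fits in the signed 32-bit displacement field.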
1751 return isInt<31>(Val);
1752}
1753
1754bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1755 X86ISelAddressMode &AM) {
1756 // We may have already matched a displacement and the caller just added the
1757 // symbolic displacement. So we still need to do the checks even if Offset
1758 // is zero.
1759
1760 int64_t Val = AM.Disp + Offset;
1761
1762 // Cannot combine ExternalSymbol displacements with integer offsets.
1763 if (Val != 0 && (AM.ES || AM.MCSym))
1764 return true;
1765
1766 CodeModel::Model M = TM.getCodeModel();
1767 if (Subtarget->is64Bit()) {
1768 if (Val != 0 &&
1769 !X86::isOffsetSuitableForCodeModel(Val, M,
1770 AM.hasSymbolicDisplacement()))
1771 return true;
1772 // In addition to the checks required for a register base, check that
1773 // we do not try to use an unsafe Disp with a frame index.
1774 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1775 !isDispSafeForFrameIndex(Val))
1776 return true;
1777 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1778 // 64 bits. Instructions with 32-bit register addresses perform this zero
1779 // extension for us and we can safely ignore the high bits of Offset.
1780 // Instructions with only a 32-bit immediate address do not, though: they
1781 // sign extend instead. This means only the low 2GB of the address space
1782 // is directly addressable; we need indirect addressing for the high 2GB
1783 // of address space.
1784 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1785 // implicit zero extension of instructions would cover up any problem.
1786 // However, we have asserts elsewhere that get triggered if we do, so keep
1787 // the checks for now.
1788 // TODO: We would actually be able to accept these, as well as the same
1789 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1790 // to get an address size override to be emitted. However, this
1791 // pseudo-register is not part of any register class and therefore causes
1792 // MIR verification to fail.
1793 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1794 !AM.hasBaseOrIndexReg())
1795 return true;
1796 }
1797 AM.Disp = Val;
1798 return false;
1799}
1800
1801bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1802 bool AllowSegmentRegForX32) {
1803 SDValue Address = N->getOperand(1);
1804
1805 // load gs:0 -> GS segment register.
1806 // load fs:0 -> FS segment register.
1807 //
1808 // This optimization is generally valid because the GNU TLS model defines that
1809 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1810 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1811 // zero-extended to 64 bits and then added to the base address, which gives
1812 // unwanted results when the register holds a negative value.
1813 // For more information see http://people.redhat.com/drepper/tls.pdf
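 // For example, (load (i64 0)) from address space 256 (GS) is selected as a
 // zero-displacement access through the GS segment register.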
1814 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1815 !IndirectTlsSegRefs &&
1816 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1817 Subtarget->isTargetFuchsia())) {
1818 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1819 return true;
1820 switch (N->getPointerInfo().getAddrSpace()) {
1821 case X86AS::GS:
1822 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1823 return false;
1824 case X86AS::FS:
1825 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1826 return false;
1827 // Address space X86AS::SS is not handled here, because it is not used to
1828 // address TLS areas.
1829 }
1830 }
1831
1832 return true;
1833}
1834
1835/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1836/// mode. These wrap things that will resolve down into a symbol reference.
1837/// If no match is possible, this returns true, otherwise it returns false.
1838bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1839 // If the addressing mode already has a symbol as the displacement, we can
1840 // never match another symbol.
1841 if (AM.hasSymbolicDisplacement())
1842 return true;
1843
1844 bool IsRIPRelTLS = false;
1845 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1846 if (IsRIPRel) {
1847 SDValue Val = N.getOperand(0);
1848 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1849 IsRIPRelTLS = true;
1850 }
1851
1852 // We can't use an addressing mode in the 64-bit large code model.
1853 // Global TLS addressing is an exception. In the medium code model,
1854 // we can use a mode when RIP wrappers are present.
1855 // That signifies access to globals that are known to be "near",
1856 // such as the GOT itself.
1857 CodeModel::Model M = TM.getCodeModel();
1858 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1859 return true;
1860
1861 // Base and index reg must be 0 in order to use %rip as base.
1862 if (IsRIPRel && AM.hasBaseOrIndexReg())
1863 return true;
1864
1865 // Make a local copy in case we can't do this fold.
1866 X86ISelAddressMode Backup = AM;
1867
1868 int64_t Offset = 0;
1869 SDValue N0 = N.getOperand(0);
1870 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1871 AM.GV = G->getGlobal();
1872 AM.SymbolFlags = G->getTargetFlags();
1873 Offset = G->getOffset();
1874 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1875 AM.CP = CP->getConstVal();
1876 AM.Alignment = CP->getAlign();
1877 AM.SymbolFlags = CP->getTargetFlags();
1878 Offset = CP->getOffset();
1879 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1880 AM.ES = S->getSymbol();
1881 AM.SymbolFlags = S->getTargetFlags();
1882 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1883 AM.MCSym = S->getMCSymbol();
1884 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1885 AM.JT = J->getIndex();
1886 AM.SymbolFlags = J->getTargetFlags();
1887 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1888 AM.BlockAddr = BA->getBlockAddress();
1889 AM.SymbolFlags = BA->getTargetFlags();
1890 Offset = BA->getOffset();
1891 } else
1892 llvm_unreachable("Unhandled symbol reference node.");
1893
1894 // Can't use an addressing mode with large globals.
1895 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1896 TM.isLargeGlobalValue(AM.GV)) {
1897 AM = Backup;
1898 return true;
1899 }
1900
1901 if (foldOffsetIntoAddress(Offset, AM)) {
1902 AM = Backup;
1903 return true;
1904 }
1905
1906 if (IsRIPRel)
1907 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1908
1909 // Commit the changes now that we know this fold is safe.
1910 return false;
1911}
1912
1913/// Add the specified node to the specified addressing mode, returning true if
1914/// it cannot be done. This just pattern matches for the addressing mode.
1915bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1916 if (matchAddressRecursively(N, AM, 0))
1917 return true;
1918
1919 // Post-processing: Make a second attempt to fold a load, if we now know
1920 // that there will not be any other register. This is only performed for
1921 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1922 // any foldable load the first time.
1923 if (Subtarget->isTarget64BitILP32() &&
1924 AM.BaseType == X86ISelAddressMode::RegBase &&
1925 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1926 SDValue Save_Base_Reg = AM.Base_Reg;
1927 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1928 AM.Base_Reg = SDValue();
1929 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1930 AM.Base_Reg = Save_Base_Reg;
1931 }
1932 }
1933
1934 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1935 // a smaller encoding and avoids a scaled-index.
1936 if (AM.Scale == 2 &&
1937 AM.BaseType == X86ISelAddressMode::RegBase &&
1938 AM.Base_Reg.getNode() == nullptr) {
1939 AM.Base_Reg = AM.IndexReg;
1940 AM.Scale = 1;
1941 }
1942
1943 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1944 // because it has a smaller encoding.
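 // In 64-bit mode an absolute [disp32] address needs a SIB byte, whereas
 // [%rip + disp32] does not, so the RIP-relative form is one byte shorter.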
1945 if (TM.getCodeModel() != CodeModel::Large &&
1946 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
1947 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
1948 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
1949 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
1950 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1951 }
1952
1953 return false;
1954}
1955
1956bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1957 unsigned Depth) {
1958 // Add an artificial use to this node so that we can keep track of
1959 // it if it gets CSE'd with a different node.
1960 HandleSDNode Handle(N);
1961
1962 X86ISelAddressMode Backup = AM;
1963 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1964 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1965 return false;
1966 AM = Backup;
1967
1968 // Try again after commuting the operands.
1969 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
1970 Depth + 1) &&
1971 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
1972 return false;
1973 AM = Backup;
1974
1975 // If we couldn't fold both operands into the address at the same time,
1976 // see if we can just put each operand into a register and fold at least
1977 // the add.
1978 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1979 !AM.Base_Reg.getNode() &&
1980 !AM.IndexReg.getNode()) {
1981 N = Handle.getValue();
1982 AM.Base_Reg = N.getOperand(0);
1983 AM.IndexReg = N.getOperand(1);
1984 AM.Scale = 1;
1985 return false;
1986 }
1987 N = Handle.getValue();
1988 return true;
1989}
1990
1991// Insert a node into the DAG at least before the Pos node's position. This
1992// will reposition the node as needed, and will assign it a node ID that is <=
1993// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1994// IDs! The selection DAG must no longer depend on their uniqueness when this
1995// is used.
1996static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1997 if (N->getNodeId() == -1 ||
1998 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
1999 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2000 DAG.RepositionNode(Pos->getIterator(), N.getNode());
2001 // Mark Node as invalid for pruning as after this it may be a successor to a
2002 // selected node but otherwise be in the same position of Pos.
2003 // Conservatively mark it with the same -abs(Id) to assure node id
2004 // invariant is preserved.
2005 N->setNodeId(Pos->getNodeId());
2006 SelectionDAGISel::InvalidateNodeId(N.getNode());
2007 }
2008}
2009
2010// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2011 // safe. This allows us to convert the shift and the AND into an h-register
2012// extract and a scaled index. Returns false if the simplification is
2013// performed.
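// For example, with C1 == 2:
//   (X >> 6) & 0x3FC  ->  ((X >> 8) & 0xff) << 2
// so bits [15:8] of X become the index register, used with scale 4.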
2014 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2015 uint64_t Mask,
2016 SDValue Shift, SDValue X,
2017 X86ISelAddressMode &AM) {
2018 if (Shift.getOpcode() != ISD::SRL ||
2019 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2020 !Shift.hasOneUse())
2021 return true;
2022
2023 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2024 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2025 Mask != (0xffu << ScaleLog))
2026 return true;
2027
2028 MVT XVT = X.getSimpleValueType();
2029 MVT VT = N.getSimpleValueType();
2030 SDLoc DL(N);
2031 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2032 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2033 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2034 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2035 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2036 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2037 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2038
2039 // Insert the new nodes into the topological ordering. We must do this in
2040 // a valid topological ordering as nothing is going to go back and re-sort
2041 // these nodes. We continually insert before 'N' in sequence as this is
2042 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2043 // hierarchy left to express.
2044 insertDAGNode(DAG, N, Eight);
2045 insertDAGNode(DAG, N, NewMask);
2046 insertDAGNode(DAG, N, Srl);
2047 insertDAGNode(DAG, N, And);
2048 insertDAGNode(DAG, N, Ext);
2049 insertDAGNode(DAG, N, ShlCount);
2050 insertDAGNode(DAG, N, Shl);
2051 DAG.ReplaceAllUsesWith(N, Shl);
2052 DAG.RemoveDeadNode(N.getNode());
2053 AM.IndexReg = Ext;
2054 AM.Scale = (1 << ScaleLog);
2055 return false;
2056}
2057
2058// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2059// allows us to fold the shift into this addressing mode. Returns false if the
2060// transform succeeded.
2062 X86ISelAddressMode &AM) {
2063 SDValue Shift = N.getOperand(0);
2064
2065 // Use a signed mask so that shifting right will insert sign bits. These
2066 // bits will be removed when we shift the result left so it doesn't matter
2067 // what we use. This might allow a smaller immediate encoding.
2068 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2069
2070 // If we have an any_extend feeding the AND, look through it to see if there
2071 // is a shift behind it. But only if the AND doesn't use the extended bits.
2072 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2073 bool FoundAnyExtend = false;
2074 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2075 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2076 isUInt<32>(Mask)) {
2077 FoundAnyExtend = true;
2078 Shift = Shift.getOperand(0);
2079 }
2080
2081 if (Shift.getOpcode() != ISD::SHL ||
2082 !isa<ConstantSDNode>(Shift.getOperand(1)))
2083 return true;
2084
2085 SDValue X = Shift.getOperand(0);
2086
2087 // Not likely to be profitable if either the AND or SHIFT node has more
2088 // than one use (unless all uses are for address computation). Besides,
2089 // isel mechanism requires their node ids to be reused.
2090 if (!N.hasOneUse() || !Shift.hasOneUse())
2091 return true;
2092
2093 // Verify that the shift amount is something we can fold.
2094 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2095 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2096 return true;
2097
2098 MVT VT = N.getSimpleValueType();
2099 SDLoc DL(N);
2100 if (FoundAnyExtend) {
2101 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2102 insertDAGNode(DAG, N, NewX);
2103 X = NewX;
2104 }
2105
2106 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
2107 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2108 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2109
2110 // Insert the new nodes into the topological ordering. We must do this in
2111 // a valid topological ordering as nothing is going to go back and re-sort
2112 // these nodes. We continually insert before 'N' in sequence as this is
2113 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2114 // hierarchy left to express.
2115 insertDAGNode(DAG, N, NewMask);
2116 insertDAGNode(DAG, N, NewAnd);
2117 insertDAGNode(DAG, N, NewShift);
2118 DAG.ReplaceAllUsesWith(N, NewShift);
2119 DAG.RemoveDeadNode(N.getNode());
2120
2121 AM.Scale = 1 << ShiftAmt;
2122 AM.IndexReg = NewAnd;
2123 return false;
2124}
2125
2126// Implement some heroics to detect shifts of masked values where the mask can
2127// be replaced by extending the shift and undoing that in the addressing mode
2128// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2129// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2130// the addressing mode. This results in code such as:
2131//
2132// int f(short *y, int *lookup_table) {
2133// ...
2134// return *y + lookup_table[*y >> 11];
2135// }
2136//
2137// Turning into:
2138// movzwl (%rdi), %eax
2139// movl %eax, %ecx
2140// shrl $11, %ecx
2141// addl (%rsi,%rcx,4), %eax
2142//
2143// Instead of:
2144// movzwl (%rdi), %eax
2145// movl %eax, %ecx
2146// shrl $9, %ecx
2147 // andl $124, %ecx
2148// addl (%rsi,%rcx), %eax
2149//
2150// Note that this function assumes the mask is provided as a mask *after* the
2151// value is shifted. The input chain may or may not match that, but computing
2152// such a mask is trivial.
2153 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2154 uint64_t Mask,
2155 SDValue Shift, SDValue X,
2156 X86ISelAddressMode &AM) {
2157 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2158 !isa<ConstantSDNode>(Shift.getOperand(1)))
2159 return true;
2160
2161 // We need to ensure that the mask is a contiguous run of bits.
2162 unsigned MaskIdx, MaskLen;
2163 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2164 return true;
2165 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2166
2167 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2168
2169 // The amount of shift we're trying to fit into the addressing mode is taken
2170 // from the shifted mask index (number of trailing zeros of the mask).
2171 unsigned AMShiftAmt = MaskIdx;
2172
2173 // There is nothing we can do here unless the mask is removing some bits.
2174 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2175 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2176
2177 // Scale the leading zero count down based on the actual size of the value.
2178 // Also scale it down based on the size of the shift.
2179 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2180 if (MaskLZ < ScaleDown)
2181 return true;
2182 MaskLZ -= ScaleDown;
2183
2184 // The final check is to ensure that any masked out high bits of X are
2185 // already known to be zero. Otherwise, the mask has a semantic impact
2186 // other than masking out a couple of low bits. Unfortunately, because of
2187 // the mask, zero extensions will be removed from operands in some cases.
2188 // This code works extra hard to look through extensions because we can
2189 // replace them with zero extensions cheaply if necessary.
2190 bool ReplacingAnyExtend = false;
2191 if (X.getOpcode() == ISD::ANY_EXTEND) {
2192 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2193 X.getOperand(0).getSimpleValueType().getSizeInBits();
2194 // Assume that we'll replace the any-extend with a zero-extend, and
2195 // narrow the search to the extended value.
2196 X = X.getOperand(0);
2197 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2198 ReplacingAnyExtend = true;
2199 }
2200 APInt MaskedHighBits =
2201 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2202 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2203 return true;
2204
2205 // We've identified a pattern that can be transformed into a single shift
2206 // and an addressing mode. Make it so.
2207 MVT VT = N.getSimpleValueType();
2208 if (ReplacingAnyExtend) {
2209 assert(X.getValueType() != VT);
2210 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2211 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2212 insertDAGNode(DAG, N, NewX);
2213 X = NewX;
2214 }
2215
2216 MVT XVT = X.getSimpleValueType();
2217 SDLoc DL(N);
2218 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2219 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2220 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2221 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2222 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2223
2224 // Insert the new nodes into the topological ordering. We must do this in
2225 // a valid topological ordering as nothing is going to go back and re-sort
2226 // these nodes. We continually insert before 'N' in sequence as this is
2227 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2228 // hierarchy left to express.
2229 insertDAGNode(DAG, N, NewSRLAmt);
2230 insertDAGNode(DAG, N, NewSRL);
2231 insertDAGNode(DAG, N, NewExt);
2232 insertDAGNode(DAG, N, NewSHLAmt);
2233 insertDAGNode(DAG, N, NewSHL);
2234 DAG.ReplaceAllUsesWith(N, NewSHL);
2235 DAG.RemoveDeadNode(N.getNode());
2236
2237 AM.Scale = 1 << AMShiftAmt;
2238 AM.IndexReg = NewExt;
2239 return false;
2240}
2241
2242// Transform "(X >> SHIFT) & (MASK << C1)" to
2243// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2244// matched to a BEXTR later. Returns false if the simplification is performed.
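// For example, with SHIFT == 4, MASK == 0xFF and C1 == 2:
//   (X >> 4) & 0x3FC  ->  ((X >> 6) & 0xFF) << 2
// where the SRL+AND can later be matched as a BEXTR and the SHL becomes
// scale 4 in the addressing mode.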
2245 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2246 uint64_t Mask,
2247 SDValue Shift, SDValue X,
2248 X86ISelAddressMode &AM,
2249 const X86Subtarget &Subtarget) {
2250 if (Shift.getOpcode() != ISD::SRL ||
2251 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2252 !Shift.hasOneUse() || !N.hasOneUse())
2253 return true;
2254
2255 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2256 if (!Subtarget.hasTBM() &&
2257 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2258 return true;
2259
2260 // We need to ensure that mask is a continuous run of bits.
2261 unsigned MaskIdx, MaskLen;
2262 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2263 return true;
2264
2265 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2266
2267 // The amount of shift we're trying to fit into the addressing mode is taken
2268 // from the shifted mask index (number of trailing zeros of the mask).
2269 unsigned AMShiftAmt = MaskIdx;
2270
2271 // There is nothing we can do here unless the mask is removing some bits.
2272 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2273 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2274
2275 MVT XVT = X.getSimpleValueType();
2276 MVT VT = N.getSimpleValueType();
2277 SDLoc DL(N);
2278 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2279 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2280 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2281 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2282 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2283 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2284 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2285
2286 // Insert the new nodes into the topological ordering. We must do this in
2287 // a valid topological ordering as nothing is going to go back and re-sort
2288 // these nodes. We continually insert before 'N' in sequence as this is
2289 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2290 // hierarchy left to express.
2291 insertDAGNode(DAG, N, NewSRLAmt);
2292 insertDAGNode(DAG, N, NewSRL);
2293 insertDAGNode(DAG, N, NewMask);
2294 insertDAGNode(DAG, N, NewAnd);
2295 insertDAGNode(DAG, N, NewExt);
2296 insertDAGNode(DAG, N, NewSHLAmt);
2297 insertDAGNode(DAG, N, NewSHL);
2298 DAG.ReplaceAllUsesWith(N, NewSHL);
2299 DAG.RemoveDeadNode(N.getNode());
2300
2301 AM.Scale = 1 << AMShiftAmt;
2302 AM.IndexReg = NewExt;
2303 return false;
2304}
2305
2306// Attempt to peek further into a scaled index register, collecting additional
2307 // extensions / offsets / etc. Returns \p N if we can't peek any further.
2308SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2309 X86ISelAddressMode &AM,
2310 unsigned Depth) {
2311 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2312 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2313 "Illegal index scale");
2314
2315 // Limit recursion.
2316 if (Depth >= SelectionDAG::MaxRecursionDepth)
2317 return N;
2318
2319 EVT VT = N.getValueType();
2320 unsigned Opc = N.getOpcode();
2321
2322 // index: add(x,c) -> index: x, disp + c
2323 if (CurDAG->isBaseWithConstantOffset(N)) {
2324 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2325 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2326 if (!foldOffsetIntoAddress(Offset, AM))
2327 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2328 }
2329
2330 // index: add(x,x) -> index: x, scale * 2
2331 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2332 if (AM.Scale <= 4) {
2333 AM.Scale *= 2;
2334 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2335 }
2336 }
2337
2338 // index: shl(x,i) -> index: x, scale * (1 << i)
2339 if (Opc == X86ISD::VSHLI) {
2340 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2341 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2342 if ((AM.Scale * ScaleAmt) <= 8) {
2343 AM.Scale *= ScaleAmt;
2344 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2345 }
2346 }
2347
2348 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2349 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2350 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2351 SDValue Src = N.getOperand(0);
2352 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2353 Src.hasOneUse()) {
2354 if (CurDAG->isBaseWithConstantOffset(Src)) {
2355 SDValue AddSrc = Src.getOperand(0);
2356 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2357 uint64_t Offset = (uint64_t)AddVal->getSExtValue();
2358 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2359 SDLoc DL(N);
2360 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2361 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2362 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2363 insertDAGNode(*CurDAG, N, ExtSrc);
2364 insertDAGNode(*CurDAG, N, ExtVal);
2365 insertDAGNode(*CurDAG, N, ExtAdd);
2366 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2367 CurDAG->RemoveDeadNode(N.getNode());
2368 return ExtSrc;
2369 }
2370 }
2371 }
2372 }
2373
2374 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2375 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2376 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2377 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2378 SDValue Src = N.getOperand(0);
2379 unsigned SrcOpc = Src.getOpcode();
2380 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2381 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2382 Src.hasOneUse()) {
2383 if (CurDAG->isBaseWithConstantOffset(Src)) {
2384 SDValue AddSrc = Src.getOperand(0);
2385 uint64_t Offset = Src.getConstantOperandVal(1);
2386 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2387 SDLoc DL(N);
2388 SDValue Res;
2389 // If we're also scaling, see if we can use that as well.
2390 if (AddSrc.getOpcode() == ISD::SHL &&
2391 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2392 SDValue ShVal = AddSrc.getOperand(0);
2393 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2394 APInt HiBits =
2395 APInt::getHighBitsSet(ShVal.getValueSizeInBits(), ShAmt);
2396 uint64_t ScaleAmt = 1ULL << ShAmt;
2397 if ((AM.Scale * ScaleAmt) <= 8 &&
2398 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2399 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2400 AM.Scale *= ScaleAmt;
2401 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2402 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2403 AddSrc.getOperand(1));
2404 insertDAGNode(*CurDAG, N, ExtShVal);
2405 insertDAGNode(*CurDAG, N, ExtShift);
2406 AddSrc = ExtShift;
2407 Res = ExtShVal;
2408 }
2409 }
2410 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2411 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2412 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2413 insertDAGNode(*CurDAG, N, ExtSrc);
2414 insertDAGNode(*CurDAG, N, ExtVal);
2415 insertDAGNode(*CurDAG, N, ExtAdd);
2416 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2417 CurDAG->RemoveDeadNode(N.getNode());
2418 return Res ? Res : ExtSrc;
2419 }
2420 }
2421 }
2422 }
2423
2424 // TODO: Handle extensions, shifted masks etc.
2425 return N;
2426}
2427
2428bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2429 unsigned Depth) {
2430 SDLoc dl(N);
2431 LLVM_DEBUG({
2432 dbgs() << "MatchAddress: ";
2433 AM.dump(CurDAG);
2434 });
2435 // Limit recursion.
2436 if (Depth >= SelectionDAG::MaxRecursionDepth)
2437 return matchAddressBase(N, AM);
2438
2439 // If this is already a %rip relative address, we can only merge immediates
2440 // into it. Instead of handling this in every case, we handle it here.
2441 // RIP relative addressing: %rip + 32-bit displacement!
2442 if (AM.isRIPRelative()) {
2443 // FIXME: JumpTable and ExternalSymbol address currently don't like
2444 // displacements. It isn't very important, but this should be fixed for
2445 // consistency.
2446 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2447 return true;
2448
2449 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2450 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2451 return false;
2452 return true;
2453 }
2454
2455 switch (N.getOpcode()) {
2456 default: break;
2457 case ISD::LOCAL_RECOVER: {
2458 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2459 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2460 // Use the symbol and don't prefix it.
2461 AM.MCSym = ESNode->getMCSymbol();
2462 return false;
2463 }
2464 break;
2465 }
2466 case ISD::Constant: {
2467 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2468 if (!foldOffsetIntoAddress(Val, AM))
2469 return false;
2470 break;
2471 }
2472
2473 case X86ISD::Wrapper:
2474 case X86ISD::WrapperRIP:
2475 if (!matchWrapper(N, AM))
2476 return false;
2477 break;
2478
2479 case ISD::LOAD:
2480 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2481 return false;
2482 break;
2483
2484 case ISD::FrameIndex:
2485 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2486 AM.Base_Reg.getNode() == nullptr &&
2487 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2488 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2489 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2490 return false;
2491 }
2492 break;
2493
2494 case ISD::SHL:
2495 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2496 break;
2497
2498 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2499 unsigned Val = CN->getZExtValue();
2500 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2501 // that the base operand remains free for further matching. If
2502 // the base doesn't end up getting used, a post-processing step
2503 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2504 if (Val == 1 || Val == 2 || Val == 3) {
2505 SDValue ShVal = N.getOperand(0);
2506 AM.Scale = 1 << Val;
2507 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2508 return false;
2509 }
2510 }
2511 break;
2512
2513 case ISD::SRL: {
2514 // Scale must not be used already.
2515 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2516
2517 // We only handle up to 64-bit values here as those are what matter for
2518 // addressing mode optimizations.
2519 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2520 "Unexpected value size!");
2521
2522 SDValue And = N.getOperand(0);
2523 if (And.getOpcode() != ISD::AND) break;
2524 SDValue X = And.getOperand(0);
2525
2526 // The mask used for the transform is expected to be post-shift, but we
2527 // found the shift first so just apply the shift to the mask before passing
2528 // it down.
2529 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2530 !isa<ConstantSDNode>(And.getOperand(1)))
2531 break;
2532 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2533
2534 // Try to fold the mask and shift into the scale, and return false if we
2535 // succeed.
2536 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2537 return false;
2538 break;
2539 }
2540
2541 case ISD::SMUL_LOHI:
2542 case ISD::UMUL_LOHI:
2543 // A mul_lohi where we need the low part can be folded as a plain multiply.
2544 if (N.getResNo() != 0) break;
2545 [[fallthrough]];
2546 case ISD::MUL:
2547 case X86ISD::MUL_IMM:
2548 // X*[3,5,9] -> X+X*[2,4,8]
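 // e.g. X*5 becomes base = X, index = X, scale = 4, which fits
 // lea (%reg,%reg,4).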
2549 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2550 AM.Base_Reg.getNode() == nullptr &&
2551 AM.IndexReg.getNode() == nullptr) {
2552 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2553 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2554 CN->getZExtValue() == 9) {
2555 AM.Scale = unsigned(CN->getZExtValue())-1;
2556
2557 SDValue MulVal = N.getOperand(0);
2558 SDValue Reg;
2559
2560 // Okay, we know that we have a scale by now. However, if the scaled
2561 // value is an add of something and a constant, we can fold the
2562 // constant into the disp field here.
2563 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2564 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2565 Reg = MulVal.getOperand(0);
2566 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2567 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2568 if (foldOffsetIntoAddress(Disp, AM))
2569 Reg = N.getOperand(0);
2570 } else {
2571 Reg = N.getOperand(0);
2572 }
2573
2574 AM.IndexReg = AM.Base_Reg = Reg;
2575 return false;
2576 }
2577 }
2578 break;
2579
2580 case ISD::SUB: {
2581 // Given A-B, if A can be completely folded into the address, leaving
2582 // the index field unused, use -B as the index.
2583 // This is a win if A has multiple parts that can be folded into
2584 // the address. Also, this saves a mov if the base register has
2585 // other uses, since it avoids a two-address sub instruction, however
2586 // it costs an additional mov if the index register has other uses.
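 // For example, in "(GA + 16) - B" the symbol GA and the constant 16 can
 // fold into the displacement, leaving the index field free to hold the
 // negated B.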
2587
2588 // Add an artificial use to this node so that we can keep track of
2589 // it if it gets CSE'd with a different node.
2590 HandleSDNode Handle(N);
2591
2592 // Test if the LHS of the sub can be folded.
2593 X86ISelAddressMode Backup = AM;
2594 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2595 N = Handle.getValue();
2596 AM = Backup;
2597 break;
2598 }
2599 N = Handle.getValue();
2600 // Test if the index field is free for use.
2601 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2602 AM = Backup;
2603 break;
2604 }
2605
2606 int Cost = 0;
2607 SDValue RHS = N.getOperand(1);
2608 // If the RHS involves a register with multiple uses, this
2609 // transformation incurs an extra mov, due to the neg instruction
2610 // clobbering its operand.
2611 if (!RHS.getNode()->hasOneUse() ||
2612 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2613 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2614 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2615 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2616 RHS.getOperand(0).getValueType() == MVT::i32))
2617 ++Cost;
2618 // If the base is a register with multiple uses, this
2619 // transformation may save a mov.
2620 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2621 !AM.Base_Reg.getNode()->hasOneUse()) ||
2622 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2623 --Cost;
2624 // If the folded LHS was interesting, this transformation saves
2625 // address arithmetic.
2626 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2627 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2628 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2629 --Cost;
2630 // If it doesn't look like it may be an overall win, don't do it.
2631 if (Cost >= 0) {
2632 AM = Backup;
2633 break;
2634 }
2635
2636 // Ok, the transformation is legal and appears profitable. Go for it.
2637 // Negation will be emitted later to avoid creating dangling nodes if this
2638 // was an unprofitable LEA.
2639 AM.IndexReg = RHS;
2640 AM.NegateIndex = true;
2641 AM.Scale = 1;
2642 return false;
2643 }
2644
2645 case ISD::OR:
2646 case ISD::XOR:
2647 // See if we can treat the OR/XOR node as an ADD node.
2648 if (!CurDAG->isADDLike(N))
2649 break;
2650 [[fallthrough]];
2651 case ISD::ADD:
2652 if (!matchAdd(N, AM, Depth))
2653 return false;
2654 break;
2655
2656 case ISD::AND: {
2657 // Perform some heroic transforms on an and of a constant-count shift
2658 // with a constant to enable use of the scaled offset field.
2659
2660 // Scale must not be used already.
2661 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2662
2663 // We only handle up to 64-bit values here as those are what matter for
2664 // addressing mode optimizations.
2665 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2666 "Unexpected value size!");
2667
2668 if (!isa<ConstantSDNode>(N.getOperand(1)))
2669 break;
2670
2671 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2672 SDValue Shift = N.getOperand(0);
2673 SDValue X = Shift.getOperand(0);
2674
2675 uint64_t Mask = N.getConstantOperandVal(1);
2676
2677 // Try to fold the mask and shift into an extract and scale.
2678 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2679 return false;
2680
2681 // Try to fold the mask and shift directly into the scale.
2682 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2683 return false;
2684
2685 // Try to fold the mask and shift into BEXTR and scale.
2686 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2687 return false;
2688 }
2689
2690 // Try to swap the mask and shift to place shifts which can be done as
2691 // a scale on the outside of the mask.
2692 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2693 return false;
2694
2695 break;
2696 }
2697 case ISD::ZERO_EXTEND: {
2698 // Try to widen a zexted shift left to the same size as its use, so we can
2699 // match the shift as a scale factor.
2700 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2701 break;
2702
2703 SDValue Src = N.getOperand(0);
2704
2705 // See if we can match a zext(addlike(x,c)).
2706 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2707 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2708 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2709 if (Index != N) {
2710 AM.IndexReg = Index;
2711 return false;
2712 }
2713
2714 // Peek through mask: zext(and(shl(x,c1),c2))
2715 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2716 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2717 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2718 Mask = MaskC->getAPIntValue();
2719 Src = Src.getOperand(0);
2720 }
2721
2722 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse()) {
2723 // Give up if the shift is not a valid scale factor [1,2,3].
2724 SDValue ShlSrc = Src.getOperand(0);
2725 SDValue ShlAmt = Src.getOperand(1);
2726 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2727 if (!ShAmtC)
2728 break;
2729 unsigned ShAmtV = ShAmtC->getZExtValue();
2730 if (ShAmtV > 3)
2731 break;
2732
2733 // The narrow shift must only shift out zero bits (it must be 'nuw').
2734 // That makes it safe to widen to the destination type.
2735 APInt HighZeros =
2736 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2737 if (!Src->getFlags().hasNoUnsignedWrap() &&
2738 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2739 break;
2740
2741 // zext (shl nuw i8 %x, C1) to i32
2742 // --> shl (zext i8 %x to i32), (zext C1)
2743 // zext (and (shl nuw i8 %x, C1), C2) to i32
2744 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2745 MVT SrcVT = ShlSrc.getSimpleValueType();
2746 MVT VT = N.getSimpleValueType();
2747 SDLoc DL(N);
2748
2749 SDValue Res = ShlSrc;
2750 if (!Mask.isAllOnes()) {
2751 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2752 insertDAGNode(*CurDAG, N, Res);
2753 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2754 insertDAGNode(*CurDAG, N, Res);
2755 }
2756 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2757 insertDAGNode(*CurDAG, N, Zext);
2758 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2759 insertDAGNode(*CurDAG, N, NewShl);
2760 CurDAG->ReplaceAllUsesWith(N, NewShl);
2761 CurDAG->RemoveDeadNode(N.getNode());
2762
2763 // Convert the shift to scale factor.
2764 AM.Scale = 1 << ShAmtV;
2765 // If matchIndexRecursively is not called here, Zext may be replaced by
2766 // other nodes but still be used later when calling a builder
2767 // method.
2768 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2769 return false;
2770 }
2771
2772 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2773 // Try to fold the mask and shift into an extract and scale.
2774 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2775 Src.getOperand(0), AM))
2776 return false;
2777
2778 // Try to fold the mask and shift directly into the scale.
2779 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2780 Src.getOperand(0), AM))
2781 return false;
2782
2783 // Try to fold the mask and shift into BEXTR and scale.
2784 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2785 Src.getOperand(0), AM, *Subtarget))
2786 return false;
2787 }
2788
2789 break;
2790 }
2791 }
2792
2793 return matchAddressBase(N, AM);
2794}
2795
2796/// Helper for MatchAddress. Add the specified node to the
2797/// specified addressing mode without any further recursion.
2798bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2799 // Is the base register already occupied?
2800 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2801 // If so, check to see if the scale index register is set.
2802 if (!AM.IndexReg.getNode()) {
2803 AM.IndexReg = N;
2804 AM.Scale = 1;
2805 return false;
2806 }
2807
2808 // Otherwise, we cannot select it.
2809 return true;
2810 }
2811
2812 // Default, generate it as a register.
2813 AM.BaseType = X86ISelAddressMode::RegBase;
2814 AM.Base_Reg = N;
2815 return false;
2816}
2817
2818bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2819 X86ISelAddressMode &AM,
2820 unsigned Depth) {
2821 SDLoc dl(N);
2822 LLVM_DEBUG({
2823 dbgs() << "MatchVectorAddress: ";
2824 AM.dump(CurDAG);
2825 });
2826 // Limit recursion.
2827 if (Depth >= SelectionDAG::MaxRecursionDepth)
2828 return matchAddressBase(N, AM);
2829
2830 // TODO: Support other operations.
2831 switch (N.getOpcode()) {
2832 case ISD::Constant: {
2833 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2834 if (!foldOffsetIntoAddress(Val, AM))
2835 return false;
2836 break;
2837 }
2838 case X86ISD::Wrapper:
2839 if (!matchWrapper(N, AM))
2840 return false;
2841 break;
2842 case ISD::ADD: {
2843 // Add an artificial use to this node so that we can keep track of
2844 // it if it gets CSE'd with a different node.
2845 HandleSDNode Handle(N);
2846
2847 X86ISelAddressMode Backup = AM;
2848 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2849 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2850 Depth + 1))
2851 return false;
2852 AM = Backup;
2853
2854 // Try again after commuting the operands.
2855 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2856 Depth + 1) &&
2857 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2858 Depth + 1))
2859 return false;
2860 AM = Backup;
2861
2862 N = Handle.getValue();
2863 break;
2864 }
2865 }
2866
2867 return matchAddressBase(N, AM);
2868}
2869
2870/// Helper for selectVectorAddr. Handles things that can be folded into a
2871/// gather/scatter address. The index register and scale should have already
2872/// been handled.
2873bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2874 return matchVectorAddressRecursively(N, AM, 0);
2875}
2876
2877bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2878 SDValue IndexOp, SDValue ScaleOp,
2879 SDValue &Base, SDValue &Scale,
2880 SDValue &Index, SDValue &Disp,
2881 SDValue &Segment) {
2882 X86ISelAddressMode AM;
2883 AM.Scale = ScaleOp->getAsZExtVal();
2884
2885 // Attempt to match index patterns, as long as we're not relying on implicit
2886 // sign-extension, which is performed BEFORE scale.
2887 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2888 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2889 else
2890 AM.IndexReg = IndexOp;
2891
2892 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2893 if (AddrSpace == X86AS::GS)
2894 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2895 if (AddrSpace == X86AS::FS)
2896 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2897 if (AddrSpace == X86AS::SS)
2898 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2899
2900 SDLoc DL(BasePtr);
2901 MVT VT = BasePtr.getSimpleValueType();
2902
2903 // Try to match into the base and displacement fields.
2904 if (matchVectorAddress(BasePtr, AM))
2905 return false;
2906
2907 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2908 return true;
2909}
2910
2911/// Returns true if it is able to pattern match an addressing mode.
2912/// It returns the operands which make up the maximal addressing mode it can
2913/// match by reference.
2914///
2915/// Parent is the parent node of the addr operand that is being matched. It
2916/// is always a load, store, atomic node, or null. It is only null when
2917/// checking memory operands for inline asm nodes.
2918bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2919 SDValue &Scale, SDValue &Index,
2920 SDValue &Disp, SDValue &Segment) {
2921 X86ISelAddressMode AM;
2922
2923 if (Parent &&
2924 // These opcodes are all the nodes that have an "addr:$ptr" operand
2925 // that are not a MemSDNode, and thus don't have proper addrspace info.
2926 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2927 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2928 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2929 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2930 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2931 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2932 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2933 unsigned AddrSpace =
2934 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2935 if (AddrSpace == X86AS::GS)
2936 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2937 if (AddrSpace == X86AS::FS)
2938 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2939 if (AddrSpace == X86AS::SS)
2940 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2941 }
2942
2943 // Save the DL and VT before calling matchAddress, it can invalidate N.
2944 SDLoc DL(N);
2945 MVT VT = N.getSimpleValueType();
2946
2947 if (matchAddress(N, AM))
2948 return false;
2949
2950 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2951 return true;
2952}
2953
2954bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2955 // Cannot use 32 bit constants to reference objects in kernel/large code
2956 // model.
2957 if (TM.getCodeModel() == CodeModel::Kernel ||
2958 TM.getCodeModel() == CodeModel::Large)
2959 return false;
2960
2961 // In static codegen with small code model, we can get the address of a label
2962 // into a register with 'movl'
2963 if (N->getOpcode() != X86ISD::Wrapper)
2964 return false;
2965
2966 N = N.getOperand(0);
2967
2968 // At least GNU as does not accept 'movl' for TPOFF relocations.
2969 // FIXME: We could use 'movl' when we know we are targeting MC.
2970 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2971 return false;
2972
2973 Imm = N;
2974 // Small/medium code model can reference non-TargetGlobalAddress objects with
2975 // 32 bit constants.
2976 if (N->getOpcode() != ISD::TargetGlobalAddress) {
2977 return TM.getCodeModel() == CodeModel::Small ||
2978 TM.getCodeModel() == CodeModel::Medium;
2979 }
2980
2981 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
2982 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
2983 return CR->getUnsignedMax().ult(1ull << 32);
2984
2985 return !TM.isLargeGlobalValue(GV);
2986}
2987
2988bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2989 SDValue &Scale, SDValue &Index,
2990 SDValue &Disp, SDValue &Segment) {
2991 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2992 SDLoc DL(N);
2993
2994 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2995 return false;
2996
2997 auto *RN = dyn_cast<RegisterSDNode>(Base);
2998 if (RN && RN->getReg() == 0)
2999 Base = CurDAG->getRegister(0, MVT::i64);
3000 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
3001 // Base could already be %rip, particularly in the x32 ABI.
3002 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3003 MVT::i64), 0);
3004 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3005 Base);
3006 }
3007
3008 RN = dyn_cast<RegisterSDNode>(Index);
3009 if (RN && RN->getReg() == 0)
3010 Index = CurDAG->getRegister(0, MVT::i64);
3011 else {
3012 assert(Index.getValueType() == MVT::i32 &&
3013 "Expect to be extending 32-bit registers for use in LEA");
3014 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3015 MVT::i64), 0);
3016 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3017 Index);
3018 }
3019
3020 return true;
3021}
3022
3023/// Calls SelectAddr and determines if the maximal addressing
3024/// mode it matches can be cost effectively emitted as an LEA instruction.
3025bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3026 SDValue &Base, SDValue &Scale,
3027 SDValue &Index, SDValue &Disp,
3028 SDValue &Segment) {
3029 X86ISelAddressMode AM;
3030
3031 // Save the DL and VT before calling matchAddress, it can invalidate N.
3032 SDLoc DL(N);
3033 MVT VT = N.getSimpleValueType();
3034
3035 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3036 // segments.
3037 SDValue Copy = AM.Segment;
3038 SDValue T = CurDAG->getRegister(0, MVT::i32);
3039 AM.Segment = T;
3040 if (matchAddress(N, AM))
3041 return false;
3042 assert (T == AM.Segment);
3043 AM.Segment = Copy;
3044
3045 unsigned Complexity = 0;
3046 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3047 Complexity = 1;
3048 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3049 Complexity = 4;
3050
3051 if (AM.IndexReg.getNode())
3052 Complexity++;
3053
3054 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3055 // a simple shift.
3056 if (AM.Scale > 1)
3057 Complexity++;
3058
3059 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3060 // to a LEA. This is determined with some experimentation but is by no means
3061 // optimal (especially for code size consideration). LEA is nice because of
3062 // its three-address nature. Tweak the cost function again when we can run
3063 // convertToThreeAddress() at register allocation time.
3064 if (AM.hasSymbolicDisplacement()) {
3065 // For X86-64, always use LEA to materialize RIP-relative addresses.
3066 if (Subtarget->is64Bit())
3067 Complexity = 4;
3068 else
3069 Complexity += 2;
3070 }
3071
3072 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3073 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3074 // duplicating flag-producing instructions later in the pipeline.
3075 if (N.getOpcode() == ISD::ADD) {
3076 auto isMathWithFlags = [](SDValue V) {
3077 switch (V.getOpcode()) {
3078 case X86ISD::ADD:
3079 case X86ISD::SUB:
3080 case X86ISD::ADC:
3081 case X86ISD::SBB:
3082 case X86ISD::SMUL:
3083 case X86ISD::UMUL:
3084 /* TODO: These opcodes can be added safely, but we may want to justify
3085 their inclusion for different reasons (better for reg-alloc).
3086 case X86ISD::OR:
3087 case X86ISD::XOR:
3088 case X86ISD::AND:
3089 */
3090 // Value 1 is the flag output of the node - verify it's not dead.
3091 return !SDValue(V.getNode(), 1).use_empty();
3092 default:
3093 return false;
3094 }
3095 };
3096 // TODO: We might want to factor in whether there's a load folding
3097 // opportunity for the math op that disappears with LEA.
3098 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3099 Complexity++;
3100 }
3101
3102 if (AM.Disp)
3103 Complexity++;
3104
3105 // If it isn't worth using an LEA, reject it.
3106 if (Complexity <= 2)
3107 return false;
3108
3109 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3110 return true;
3111}
3112
3113/// This is only run on TargetGlobalTLSAddress nodes.
3114bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3115 SDValue &Scale, SDValue &Index,
3116 SDValue &Disp, SDValue &Segment) {
3117 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3118 N.getOpcode() == ISD::TargetExternalSymbol);
3119
3120 X86ISelAddressMode AM;
3121 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3122 AM.GV = GA->getGlobal();
3123 AM.Disp += GA->getOffset();
3124 AM.SymbolFlags = GA->getTargetFlags();
3125 } else {
3126 auto *SA = cast<ExternalSymbolSDNode>(N);
3127 AM.ES = SA->getSymbol();
3128 AM.SymbolFlags = SA->getTargetFlags();
3129 }
3130
3131 if (Subtarget->is32Bit()) {
3132 AM.Scale = 1;
3133 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3134 }
3135
3136 MVT VT = N.getSimpleValueType();
3137 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3138 return true;
3139}
3140
3141bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3142 // Keep track of the original value type and whether this value was
3143 // truncated. If we see a truncation from pointer type to VT that truncates
3144 // bits that are known to be zero, we can use a narrow reference.
3145 EVT VT = N.getValueType();
3146 bool WasTruncated = false;
3147 if (N.getOpcode() == ISD::TRUNCATE) {
3148 WasTruncated = true;
3149 N = N.getOperand(0);
3150 }
3151
3152 if (N.getOpcode() != X86ISD::Wrapper)
3153 return false;
3154
3155 // We can only use non-GlobalValues as immediates if they were not truncated,
3156 // as we do not have any range information. If we have a GlobalValue and the
3157 // address was not truncated, we can select it as an operand directly.
3158 unsigned Opc = N.getOperand(0)->getOpcode();
3159 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3160 Op = N.getOperand(0);
3161 // We can only select the operand directly if we didn't have to look past a
3162 // truncate.
3163 return !WasTruncated;
3164 }
3165
3166 // Check that the global's range fits into VT.
3167 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3168 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3169 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3170 return false;
3171
3172 // Okay, we can use a narrow reference.
3173 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3174 GA->getOffset(), GA->getTargetFlags());
3175 return true;
3176}
3177
3178bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3179 SDValue &Base, SDValue &Scale,
3180 SDValue &Index, SDValue &Disp,
3181 SDValue &Segment) {
3182 assert(Root && P && "Unknown root/parent nodes");
3183 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3184 !IsProfitableToFold(N, P, Root) ||
3185 !IsLegalToFold(N, P, Root, OptLevel))
3186 return false;
3187
3188 return selectAddr(N.getNode(),
3189 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3190}
3191
3192bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3193 SDValue &Base, SDValue &Scale,
3194 SDValue &Index, SDValue &Disp,
3195 SDValue &Segment) {
3196 assert(Root && P && "Unknown root/parent nodes");
3197 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3198 !IsProfitableToFold(N, P, Root) ||
3199 !IsLegalToFold(N, P, Root, OptLevel))
3200 return false;
3201
3202 return selectAddr(N.getNode(),
3203 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3204}
3205
3206/// Return an SDNode that returns the value of the global base register.
3207/// Output instructions required to initialize the global base register,
3208/// if necessary.
3209SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3210 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3211 auto &DL = MF->getDataLayout();
3212 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3213}
3214
3215bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3216 if (N->getOpcode() == ISD::TRUNCATE)
3217 N = N->getOperand(0).getNode();
3218 if (N->getOpcode() != X86ISD::Wrapper)
3219 return false;
3220
3221 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3222 if (!GA)
3223 return false;
3224
3225 auto *GV = GA->getGlobal();
3226 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3227 if (CR)
3228 return CR->getSignedMin().sge(-1ull << Width) &&
3229 CR->getSignedMax().slt(1ull << Width);
3230 // In the kernel code model, globals are in the negative 2GB of the address
3231 // space, so globals can be a sign extended 32-bit immediate.
3232 // In other code models, small globals are in the low 2GB of the address
3233 // space, so sign extending them is equivalent to zero extending them.
3234 return Width == 32 && !TM.isLargeGlobalValue(GV);
3235}
3236
3237X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3238 assert(N->isMachineOpcode() && "Unexpected node");
3239 unsigned Opc = N->getMachineOpcode();
3240 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3241 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3242 if (CondNo < 0)
3243 return X86::COND_INVALID;
3244
3245 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3246}
3247
3248/// Return true if the given X86ISD::CMP node has no users that use a flag
3249/// other than ZF.
3250bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3251 // Examine each user of the node.
3252 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3253 UI != UE; ++UI) {
3254 // Only check things that use the flags.
3255 if (UI.getUse().getResNo() != Flags.getResNo())
3256 continue;
3257 // Only examine CopyToReg uses that copy to EFLAGS.
3258 if (UI->getOpcode() != ISD::CopyToReg ||
3259 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3260 return false;
3261 // Examine each user of the CopyToReg use.
3262 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3263 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3264 // Only examine the Flag result.
3265 if (FlagUI.getUse().getResNo() != 1) continue;
3266 // Anything unusual: assume conservatively.
3267 if (!FlagUI->isMachineOpcode()) return false;
3268 // Examine the condition code of the user.
3269 X86::CondCode CC = getCondFromNode(*FlagUI);
3270
3271 switch (CC) {
3272 // Comparisons which only use the zero flag.
3273 case X86::COND_E: case X86::COND_NE:
3274 continue;
3275 // Anything else: assume conservatively.
3276 default:
3277 return false;
3278 }
3279 }
3280 }
3281 return true;
3282}
3283
3284/// Return true if the given X86ISD::CMP node has no uses which require the SF
3285/// flag to be accurate.
3286bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3287 // Examine each user of the node.
3288 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3289 UI != UE; ++UI) {
3290 // Only check things that use the flags.
3291 if (UI.getUse().getResNo() != Flags.getResNo())
3292 continue;
3293 // Only examine CopyToReg uses that copy to EFLAGS.
3294 if (UI->getOpcode() != ISD::CopyToReg ||
3295 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3296 return false;
3297 // Examine each user of the CopyToReg use.
3298 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3299 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3300 // Only examine the Flag result.
3301 if (FlagUI.getUse().getResNo() != 1) continue;
3302 // Anything unusual: assume conservatively.
3303 if (!FlagUI->isMachineOpcode()) return false;
3304 // Examine the condition code of the user.
3305 X86::CondCode CC = getCondFromNode(*FlagUI);
3306
3307 switch (CC) {
3308 // Comparisons which don't examine the SF flag.
3309 case X86::COND_A: case X86::COND_AE:
3310 case X86::COND_B: case X86::COND_BE:
3311 case X86::COND_E: case X86::COND_NE:
3312 case X86::COND_O: case X86::COND_NO:
3313 case X86::COND_P: case X86::COND_NP:
3314 continue;
3315 // Anything else: assume conservatively.
3316 default:
3317 return false;
3318 }
3319 }
3320 }
3321 return true;
3322}
3323
3324static bool mayUseCarryFlag(X86::CondCode CC) {
3325 switch (CC) {
3326 // Comparisons which don't examine the CF flag.
3327 case X86::COND_O: case X86::COND_NO:
3328 case X86::COND_E: case X86::COND_NE:
3329 case X86::COND_S: case X86::COND_NS:
3330 case X86::COND_P: case X86::COND_NP:
3331 case X86::COND_L: case X86::COND_GE:
3332 case X86::COND_G: case X86::COND_LE:
3333 return false;
3334 // Anything else: assume conservatively.
3335 default:
3336 return true;
3337 }
3338}
3339
3340/// Return true if the given flag-setting node has no uses which require the
3341/// CF flag to be accurate.
3342 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3343 // Examine each user of the node.
3344 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3345 UI != UE; ++UI) {
3346 // Only check things that use the flags.
3347 if (UI.getUse().getResNo() != Flags.getResNo())
3348 continue;
3349
3350 unsigned UIOpc = UI->getOpcode();
3351
3352 if (UIOpc == ISD::CopyToReg) {
3353 // Only examine CopyToReg uses that copy to EFLAGS.
3354 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3355 return false;
3356 // Examine each user of the CopyToReg use.
3357 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
3358 FlagUI != FlagUE; ++FlagUI) {
3359 // Only examine the Flag result.
3360 if (FlagUI.getUse().getResNo() != 1)
3361 continue;
3362 // Anything unusual: assume conservatively.
3363 if (!FlagUI->isMachineOpcode())
3364 return false;
3365 // Examine the condition code of the user.
3366 X86::CondCode CC = getCondFromNode(*FlagUI);
3367
3368 if (mayUseCarryFlag(CC))
3369 return false;
3370 }
3371
3372 // This CopyToReg is ok. Move on to the next user.
3373 continue;
3374 }
3375
3376 // This might be an unselected node. So look for the pre-isel opcodes that
3377 // use flags.
3378 unsigned CCOpNo;
3379 switch (UIOpc) {
3380 default:
3381 // Something unusual. Be conservative.
3382 return false;
3383 case X86ISD::SETCC: CCOpNo = 0; break;
3384 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3385 case X86ISD::CMOV: CCOpNo = 2; break;
3386 case X86ISD::BRCOND: CCOpNo = 2; break;
3387 }
3388
3389 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
3390 if (mayUseCarryFlag(CC))
3391 return false;
3392 }
3393 return true;
3394}
3395
3396/// Check whether or not the chain ending in StoreNode is suitable for the
3397/// {load; op; store} read-modify-write transformation.
3398static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3399 SDValue StoredVal, SelectionDAG *CurDAG,
3400 unsigned LoadOpNo,
3401 LoadSDNode *&LoadNode,
3402 SDValue &InputChain) {
3403 // Is the stored value result 0 of the operation?
3404 if (StoredVal.getResNo() != 0) return false;
3405
3406 // Are there other uses of the operation other than the store?
3407 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3408
3409 // Is the store non-extending and non-indexed?
3410 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3411 return false;
3412
3413 SDValue Load = StoredVal->getOperand(LoadOpNo);
3414 // Is the stored value a non-extending and non-indexed load?
3415 if (!ISD::isNormalLoad(Load.getNode())) return false;
3416
3417 // Return LoadNode by reference.
3418 LoadNode = cast<LoadSDNode>(Load);
3419
3420 // Is store the only read of the loaded value?
3421 if (!Load.hasOneUse())
3422 return false;
3423
3424 // Is the address of the store the same as the load?
3425 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3426 LoadNode->getOffset() != StoreNode->getOffset())
3427 return false;
3428
3429 bool FoundLoad = false;
3430 SmallVector<SDValue, 4> ChainOps;
3431 SmallVector<const SDNode *, 4> LoopWorklist;
3432 SmallPtrSet<const SDNode *, 16> Visited;
3433 const unsigned int Max = 1024;
3434
3435 // Visualization of Load-Op-Store fusion:
3436 // -------------------------
3437 // Legend:
3438 // *-lines = Chain operand dependencies.
3439 // |-lines = Normal operand dependencies.
3440 // Dependencies flow down and right. n-suffix references multiple nodes.
3441 //
3442 // C Xn C
3443 // * * *
3444 // * * *
3445 // Xn A-LD Yn TF Yn
3446 // * * \ | * |
3447 // * * \ | * |
3448 // * * \ | => A--LD_OP_ST
3449 // * * \| \
3450 // TF OP \
3451 // * | \ Zn
3452 // * | \
3453 // A-ST Zn
3454 //
3455
3456 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3457 // #2: Yn -> LD
3458 // #3: ST -> Zn
3459
3460 // Ensure the transform is safe by checking for the dual
3461 // dependencies to make sure we do not induce a loop.
3462
3463 // As LD is a predecessor to both OP and ST we can do this by checking:
3464 // a). if LD is a predecessor to a member of Xn or Yn.
3465 // b). if a Zn is a predecessor to ST.
3466
3467 // However, (b) can only occur through being a chain predecessor to
3468 // ST, which is the same as Zn being a member or predecessor of Xn,
3469 // which is a subset of LD being a predecessor of Xn. So it's
3470 // subsumed by check (a).
3471
3472 SDValue Chain = StoreNode->getChain();
3473
3474 // Gather X elements in ChainOps.
3475 if (Chain == Load.getValue(1)) {
3476 FoundLoad = true;
3477 ChainOps.push_back(Load.getOperand(0));
3478 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3479 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3480 SDValue Op = Chain.getOperand(i);
3481 if (Op == Load.getValue(1)) {
3482 FoundLoad = true;
3483 // Drop Load, but keep its chain. No cycle check necessary.
3484 ChainOps.push_back(Load.getOperand(0));
3485 continue;
3486 }
3487 LoopWorklist.push_back(Op.getNode());
3488 ChainOps.push_back(Op);
3489 }
3490 }
3491
3492 if (!FoundLoad)
3493 return false;
3494
3495 // Worklist is currently Xn. Add Yn to worklist.
3496 for (SDValue Op : StoredVal->ops())
3497 if (Op.getNode() != LoadNode)
3498 LoopWorklist.push_back(Op.getNode());
3499
3500 // Check (a) if Load is a predecessor to Xn + Yn
3501 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3502 true))
3503 return false;
3504
3505 InputChain =
3506 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3507 return true;
3508}
3509
3510// Change a chain of {load; op; store} of the same value into a simple op
3511// through memory of that value, if the uses of the modified value and its
3512// address are suitable.
3513//
3514// The tablegen memory-operand pattern is currently not able to match the
3515// case where the EFLAGS on the original operation are used.
3516//
3517// To move this to tablegen, we'll need to improve tablegen to allow flags to
3518// be transferred from a node in the pattern to the result node, probably with
3519// a new keyword. For example, we have this
3520// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3521// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3522// (implicit EFLAGS)]>;
3523// but maybe need something like this
3524// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3525// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3526// (transferrable EFLAGS)]>;
3527//
3528// Until then, we manually fold these and instruction select the operation
3529// here.
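//
// For instance, a DAG of the rough form
//   t1 = load [addr]
//   t2 = X86ISD::ADD t1, 5
//   store t2, [addr]
// can be selected as a single ADD32mi [addr], 5 (assuming an i32 memory VT and
// that the conditions checked below hold), with the EFLAGS def of the memory
// form standing in for the original node's flag result.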
3530bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3531 auto *StoreNode = cast<StoreSDNode>(Node);
3532 SDValue StoredVal = StoreNode->getOperand(1);
3533 unsigned Opc = StoredVal->getOpcode();
3534
3535 // Before we try to select anything, make sure this is memory operand size
3536 // and opcode we can handle. Note that this must match the code below that
3537 // actually lowers the opcodes.
3538 EVT MemVT = StoreNode->getMemoryVT();
3539 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3540 MemVT != MVT::i8)
3541 return false;
3542
3543 bool IsCommutable = false;
3544 bool IsNegate = false;
3545 switch (Opc) {
3546 default:
3547 return false;
3548 case X86ISD::SUB:
3549 IsNegate = isNullConstant(StoredVal.getOperand(0));
3550 break;
3551 case X86ISD::SBB:
3552 break;
3553 case X86ISD::ADD:
3554 case X86ISD::ADC:
3555 case X86ISD::AND:
3556 case X86ISD::OR:
3557 case X86ISD::XOR:
3558 IsCommutable = true;
3559 break;
3560 }
3561
3562 unsigned LoadOpNo = IsNegate ? 1 : 0;
3563 LoadSDNode *LoadNode = nullptr;
3564 SDValue InputChain;
3565 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3566 LoadNode, InputChain)) {
3567 if (!IsCommutable)
3568 return false;
3569
3570 // This operation is commutable, try the other operand.
3571 LoadOpNo = 1;
3572 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3573 LoadNode, InputChain))
3574 return false;
3575 }
3576
3577 SDValue Base, Scale, Index, Disp, Segment;
3578 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3579 Segment))
3580 return false;
3581
3582 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3583 unsigned Opc8) {
3584 switch (MemVT.getSimpleVT().SimpleTy) {
3585 case MVT::i64:
3586 return Opc64;
3587 case MVT::i32:
3588 return Opc32;
3589 case MVT::i16:
3590 return Opc16;
3591 case MVT::i8:
3592 return Opc8;
3593 default:
3594 llvm_unreachable("Invalid size!");
3595 }
3596 };
3597
3598 MachineSDNode *Result;
3599 switch (Opc) {
3600 case X86ISD::SUB:
3601 // Handle negate.
3602 if (IsNegate) {
3603 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3604 X86::NEG8m);
3605 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3606 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3607 MVT::Other, Ops);
3608 break;
3609 }
3610 [[fallthrough]];
3611 case X86ISD::ADD:
3612 // Try to match inc/dec.
3613 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3614 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3615 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3616 // ADD/SUB by 1/-1 can be selected as inc/dec when the carry flag isn't used.
3617 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3618 unsigned NewOpc =
3619 ((Opc == X86ISD::ADD) == IsOne)
3620 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3621 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3622 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3623 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3624 MVT::Other, Ops);
3625 break;
3626 }
3627 }
3628 [[fallthrough]];
3629 case X86ISD::ADC:
3630 case X86ISD::SBB:
3631 case X86ISD::AND:
3632 case X86ISD::OR:
3633 case X86ISD::XOR: {
3634 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3635 switch (Opc) {
3636 case X86ISD::ADD:
3637 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3638 X86::ADD8mr);
3639 case X86ISD::ADC:
3640 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3641 X86::ADC8mr);
3642 case X86ISD::SUB:
3643 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3644 X86::SUB8mr);
3645 case X86ISD::SBB:
3646 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3647 X86::SBB8mr);
3648 case X86ISD::AND:
3649 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3650 X86::AND8mr);
3651 case X86ISD::OR:
3652 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3653 case X86ISD::XOR:
3654 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3655 X86::XOR8mr);
3656 default:
3657 llvm_unreachable("Invalid opcode!");
3658 }
3659 };
3660 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3661 switch (Opc) {
3662 case X86ISD::ADD:
3663 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3664 X86::ADD8mi);
3665 case X86ISD::ADC:
3666 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3667 X86::ADC8mi);
3668 case X86ISD::SUB:
3669 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3670 X86::SUB8mi);
3671 case X86ISD::SBB:
3672 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3673 X86::SBB8mi);
3674 case X86ISD::AND:
3675 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3676 X86::AND8mi);
3677 case X86ISD::OR:
3678 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3679 X86::OR8mi);
3680 case X86ISD::XOR:
3681 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3682 X86::XOR8mi);
3683 default:
3684 llvm_unreachable("Invalid opcode!");
3685 }
3686 };
3687
3688 unsigned NewOpc = SelectRegOpcode(Opc);
3689 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3690
3691 // See if the operand is a constant that we can fold into an immediate
3692 // operand.
3693 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3694 int64_t OperandV = OperandC->getSExtValue();
3695
3696 // Check if we can shrink the operand enough to fit in an immediate (or
3697 // fit into a smaller immediate) by negating it and switching the
3698 // operation.
3699 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3700 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3701 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3702 isInt<32>(-OperandV))) &&
3703 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3704 OperandV = -OperandV;
3705 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3706 }
3707
3708 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3709 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3710 NewOpc = SelectImmOpcode(Opc);
3711 }
3712 }
3713
3714 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3715 SDValue CopyTo =
3716 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3717 StoredVal.getOperand(2), SDValue());
3718
3719 const SDValue Ops[] = {Base, Scale, Index, Disp,
3720 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3721 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3722 Ops);
3723 } else {
3724 const SDValue Ops[] = {Base, Scale, Index, Disp,
3725 Segment, Operand, InputChain};
3726 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3727 Ops);
3728 }
3729 break;
3730 }
3731 default:
3732 llvm_unreachable("Invalid opcode!");
3733 }
3734
3735 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3736 LoadNode->getMemOperand()};
3737 CurDAG->setNodeMemRefs(Result, MemOps);
3738
3739 // Update Load Chain uses as well.
3740 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3741 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3742 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3743 CurDAG->RemoveDeadNode(Node);
3744 return true;
3745}
3746
3747// See if this is an X & Mask that we can match to BEXTR/BZHI.
3748// Where Mask is one of the following patterns:
3749// a) x & (1 << nbits) - 1
3750// b) x & ~(-1 << nbits)
3751// c) x & (-1 >> (32 - y))
3752// d) x << (32 - y) >> (32 - y)
3753// e) (1 << nbits) - 1
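// For example, with nbits = 5, pattern a) computes x & 0b11111, i.e. it keeps
// the low 5 bits of x; on BMI2 targets this maps onto BZHI x, 5, and on
// BMI-only targets onto BEXTR with a zero start position and a length of 5.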
3754bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3755 assert(
3756 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3757 Node->getOpcode() == ISD::SRL) &&
3758 "Should be either an and-mask, or right-shift after clearing high bits.");
3759
3760 // BEXTR is a BMI instruction and BZHI is a BMI2 instruction. We need at least one.
3761 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3762 return false;
3763
3764 MVT NVT = Node->getSimpleValueType(0);
3765
3766 // Only supported for 32 and 64 bits.
3767 if (NVT != MVT::i32 && NVT != MVT::i64)
3768 return false;
3769
3770 SDValue NBits;
3771 bool NegateNBits;
3772
3773 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3774 // Else, if we only have BMI1's BEXTR, we require one-use.
3775 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3776 auto checkUses = [AllowExtraUsesByDefault](
3777 SDValue Op, unsigned NUses,
3778 std::optional<bool> AllowExtraUses) {
3779 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3780 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3781 };
3782 auto checkOneUse = [checkUses](SDValue Op,
3783 std::optional<bool> AllowExtraUses =
3784 std::nullopt) {
3785 return checkUses(Op, 1, AllowExtraUses);
3786 };
3787 auto checkTwoUse = [checkUses](SDValue Op,
3788 std::optional<bool> AllowExtraUses =
3789 std::nullopt) {
3790 return checkUses(Op, 2, AllowExtraUses);
3791 };
3792
3793 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3794 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3795 assert(V.getSimpleValueType() == MVT::i32 &&
3796 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3797 "Expected i64 -> i32 truncation");
3798 V = V.getOperand(0);
3799 }
3800 return V;
3801 };
3802
3803 // a) x & ((1 << nbits) + (-1))
3804 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3805 &NegateNBits](SDValue Mask) -> bool {
3806 // Match `add`. Must only have one use!
3807 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3808 return false;
3809 // We should be adding an all-ones constant (i.e. subtracting one).
3810 if (!isAllOnesConstant(Mask->getOperand(1)))
3811 return false;
3812 // Match `1 << nbits`. Might be truncated. Must only have one use!
3813 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3814 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3815 return false;
3816 if (!isOneConstant(M0->getOperand(0)))
3817 return false;
3818 NBits = M0->getOperand(1);
3819 NegateNBits = false;
3820 return true;
3821 };
3822
3823 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3824 V = peekThroughOneUseTruncation(V);
3825 return CurDAG->MaskedValueIsAllOnes(
3826 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3827 NVT.getSizeInBits()));
3828 };
3829
3830 // b) x & ~(-1 << nbits)
3831 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3832 &NBits, &NegateNBits](SDValue Mask) -> bool {
3833 // Match `~()`. Must only have one use!
3834 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3835 return false;
3836 // The -1 only has to be all-ones for the final Node's NVT.
3837 if (!isAllOnes(Mask->getOperand(1)))
3838 return false;
3839 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3840 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3841 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3842 return false;
3843 // The -1 only has to be all-ones for the final Node's NVT.
3844 if (!isAllOnes(M0->getOperand(0)))
3845 return false;
3846 NBits = M0->getOperand(1);
3847 NegateNBits = false;
3848 return true;
3849 };
3850
3851 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3852 // or leave the shift amount as-is, but then we'll have to negate it.
3853 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3854 unsigned Bitwidth) {
3855 NBits = ShiftAmt;
3856 NegateNBits = true;
3857 // Skip over a truncate of the shift amount, if any.
3858 if (NBits.getOpcode() == ISD::TRUNCATE)
3859 NBits = NBits.getOperand(0);
3860 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3861 // If it doesn't match, that's fine, we'll just negate it ourselves.
3862 if (NBits.getOpcode() != ISD::SUB)
3863 return;
3864 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3865 if (!V0 || V0->getZExtValue() != Bitwidth)
3866 return;
3867 NBits = NBits.getOperand(1);
3868 NegateNBits = false;
3869 };
3870
3871 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3872 // or
3873 // c) x & (-1 >> (32 - y))
3874 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3875 canonicalizeShiftAmt](SDValue Mask) -> bool {
3876 // The mask itself may be truncated.
3877 Mask = peekThroughOneUseTruncation(Mask);
3878 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3879 // Match `l>>`. Must only have one use!
3880 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3881 return false;
3882 // We should be shifting a truly all-ones constant.
3883 if (!isAllOnesConstant(Mask.getOperand(0)))
3884 return false;
3885 SDValue M1 = Mask.getOperand(1);
3886 // The shift amount should not be used externally.
3887 if (!checkOneUse(M1))
3888 return false;
3889 canonicalizeShiftAmt(M1, Bitwidth);
3890 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3891 // is no extra use of the mask. Clearly, there was one since we are here.
3892 // But at the same time, if we need to negate the shift amount,
3893 // then we don't want the mask to stick around, else it's unprofitable.
3894 return !NegateNBits;
3895 };
3896
3897 SDValue X;
3898
3899 // d) x << z >> z but then we'll have to subtract z from bitwidth
3900 // or
3901 // d) x << (32 - y) >> (32 - y)
3902 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3903 AllowExtraUsesByDefault, &NegateNBits,
3904 &X](SDNode *Node) -> bool {
3905 if (Node->getOpcode() != ISD::SRL)
3906 return false;
3907 SDValue N0 = Node->getOperand(0);
3908 if (N0->getOpcode() != ISD::SHL)
3909 return false;
3910 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3911 SDValue N1 = Node->getOperand(1);
3912 SDValue N01 = N0->getOperand(1);
3913 // Both of the shifts must be by the exact same value.
3914 if (N1 != N01)
3915 return false;
3916 canonicalizeShiftAmt(N1, Bitwidth);
3917 // There should not be any external uses of the inner shift / shift amount.
3918 // Note that while we are generally okay with external uses given BMI2,
3919 // iff we need to negate the shift amount, we are not okay with extra uses.
3920 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3921 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3922 return false;
3923 X = N0->getOperand(0);
3924 return true;
3925 };
3926
3927 auto matchLowBitMask = [matchPatternA, matchPatternB,
3928 matchPatternC](SDValue Mask) -> bool {
3929 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3930 };
3931
3932 if (Node->getOpcode() == ISD::AND) {
3933 X = Node->getOperand(0);
3934 SDValue Mask = Node->getOperand(1);
3935
3936 if (matchLowBitMask(Mask)) {
3937 // Great.
3938 } else {
3939 std::swap(X, Mask);
3940 if (!matchLowBitMask(Mask))
3941 return false;
3942 }
3943 } else if (matchLowBitMask(SDValue(Node, 0))) {
3944 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
3945 } else if (!matchPatternD(Node))
3946 return false;
3947
3948 // If we need to negate the shift amount, require BMI2 BZHI support.
3949 // It's just too unprofitable for BMI1 BEXTR.
3950 if (NegateNBits && !Subtarget->hasBMI2())
3951 return false;
3952
3953 SDLoc DL(Node);
3954
3955 // Truncate the shift amount.
3956 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3957 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3958
3959 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3960 // All the other bits are undefined, we do not care about them.
3961 SDValue ImplDef = SDValue(
3962 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3963 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3964
3965 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3966 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3967 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
3968 MVT::i32, ImplDef, NBits, SRIdxVal),
3969 0);
3970 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3971
3972 // We might have matched the number of high bits to be cleared,
3973 // but we want the number of low bits to be kept, so negate it now.
3974 if (NegateNBits) {
3975 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
3976 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
3977
3978 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
3979 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3980 }
3981
3982 if (Subtarget->hasBMI2()) {
3983 // Great, just emit the BZHI.
3984 if (NVT != MVT::i32) {
3985 // But have to place the bit count into the wide-enough register first.
3986 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3987 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3988 }
3989
3990 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3991 ReplaceNode(Node, Extract.getNode());
3992 SelectCode(Extract.getNode());
3993 return true;
3994 }
3995
3996 // Else, if we do *NOT* have BMI2, let's find out whether 'X' is
3997 // *logically* shifted (potentially with a one-use trunc in between),
3998 // and whether the truncation was the only use of the shift,
3999 // and if so look past the one-use truncation.
4000 {
4001 SDValue RealX = peekThroughOneUseTruncation(X);
4002 // FIXME: only if the shift is one-use?
4003 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4004 X = RealX;
4005 }
4006
4007 MVT XVT = X.getSimpleValueType();
4008
4009 // Else, emitting BEXTR requires one more step.
4010 // The 'control' of BEXTR has the pattern of:
4011 // [15...8 bit][ 7...0 bit] location
4012 // [ bit count][ shift] name
4013 // I.e. 0b00000011'00000001 means (x >> 0b1) & 0b11
4014
4015 // Shift NBits left by 8 bits, thus producing 'control'.
4016 // This makes the low 8 bits to be zero.
4017 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4018 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4019 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4020 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4021
4022 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4023 // FIXME: only if the shift is one-use?
4024 if (X.getOpcode() == ISD::SRL) {
4025 SDValue ShiftAmt = X.getOperand(1);
4026 X = X.getOperand(0);
4027
4028 assert(ShiftAmt.getValueType() == MVT::i8 &&
4029 "Expected shift amount to be i8");
4030
4031 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4032 // We could zext to i16 in some form, but we intentionally don't do that.
4033 SDValue OrigShiftAmt = ShiftAmt;
4034 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4035 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4036
4037 // And now 'or' these low 8 bits of shift amount into the 'control'.
4038 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4039 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4040 }
4041
4042 // But have to place the 'control' into the wide-enough register first.
4043 if (XVT != MVT::i32) {
4044 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4045 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4046 }
4047
4048 // And finally, form the BEXTR itself.
4049 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4050
4051 // The 'X' was originally truncated. Do that now.
4052 if (XVT != NVT) {
4053 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4054 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4055 }
4056
4057 ReplaceNode(Node, Extract.getNode());
4058 SelectCode(Extract.getNode());
4059
4060 return true;
4061}
4062
4063// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
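// For example, (x >> 4) & 0xfff extracts a 12-bit field starting at bit 4,
// which BEXTR/BEXTRI encodes as control 0x0c04 (length 12 in bits 15..8,
// start 4 in bits 7..0).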
4064MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4065 MVT NVT = Node->getSimpleValueType(0);
4066 SDLoc dl(Node);
4067
4068 SDValue N0 = Node->getOperand(0);
4069 SDValue N1 = Node->getOperand(1);
4070
4071 // If we have TBM we can use an immediate for the control. If we have BMI
4072 // we should only do this if the BEXTR instruction is implemented well.
4073 // Otherwise moving the control into a register makes this more costly.
4074 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4075 // hoisting the move immediate would make it worthwhile with a less optimal
4076 // BEXTR?
4077 bool PreferBEXTR =
4078 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4079 if (!PreferBEXTR && !Subtarget->hasBMI2())
4080 return nullptr;
4081
4082 // Must have a shift right.
4083 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4084 return nullptr;
4085
4086 // Shift can't have additional users.
4087 if (!N0->hasOneUse())
4088 return nullptr;
4089
4090 // Only supported for 32 and 64 bits.
4091 if (NVT != MVT::i32 && NVT != MVT::i64)
4092 return nullptr;
4093
4094 // Shift amount and RHS of and must be constant.
4095 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4096 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4097 if (!MaskCst || !ShiftCst)
4098 return nullptr;
4099
4100 // And RHS must be a mask.
4101 uint64_t Mask = MaskCst->getZExtValue();
4102 if (!isMask_64(Mask))
4103 return nullptr;
4104
4105 uint64_t Shift = ShiftCst->getZExtValue();
4106 uint64_t MaskSize = llvm::popcount(Mask);
4107
4108 // Don't interfere with something that can be handled by extracting AH.
4109 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4110 if (Shift == 8 && MaskSize == 8)
4111 return nullptr;
4112
4113 // Make sure we are only using bits that were in the original value, not
4114 // shifted in.
4115 if (Shift + MaskSize > NVT.getSizeInBits())
4116 return nullptr;
4117
4118 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4119 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4120 // does not fit into 32 bits. Load folding is not a sufficient reason.
4121 if (!PreferBEXTR && MaskSize <= 32)
4122 return nullptr;
4123
4124 SDValue Control;
4125 unsigned ROpc, MOpc;
4126
4127#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4128 if (!PreferBEXTR) {
4129 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4130 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4131 // Let's perform the mask first, and apply shift later. Note that we need to
4132 // widen the mask to account for the fact that we'll apply shift afterwards!
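 // For example, with Shift = 4 and MaskSize = 12, BZHI keeps the low
 // 4 + 12 = 16 bits and the SHR emitted below then drops the low 4,
 // leaving exactly the 12-bit field the original shift+mask selected.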
4133 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4134 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4135 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4136 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4137 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4138 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4139 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4140 } else {
4141 // The 'control' of BEXTR has the pattern of:
4142 // [15...8 bit][ 7...0 bit] location
4143 // [ bit count][ shift] name
4144 // I.e. 0b00000011'00000001 means (x >> 0b1) & 0b11
4145 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4146 if (Subtarget->hasTBM()) {
4147 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4148 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4149 } else {
4150 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4151 // BMI requires the immediate to be placed in a register.
4152 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4153 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4154 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4155 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4156 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4157 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4158 }
4159 }
4160
4161 MachineSDNode *NewNode;
4162 SDValue Input = N0->getOperand(0);
4163 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4164 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4165 SDValue Ops[] = {
4166 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4167 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4168 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4169 // Update the chain.
4170 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4171 // Record the mem-refs
4172 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4173 } else {
4174 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4175 }
4176
4177 if (!PreferBEXTR) {
4178 // We still need to apply the shift.
4179 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4180 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4181 : GET_ND_IF_ENABLED(X86::SHR32ri);
4182 NewNode =
4183 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4184 }
4185
4186 return NewNode;
4187}
4188
4189// Emit a PCMPISTR(I/M) instruction.
4190MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4191 bool MayFoldLoad, const SDLoc &dl,
4192 MVT VT, SDNode *Node) {
4193 SDValue N0 = Node->getOperand(0);
4194 SDValue N1 = Node->getOperand(1);
4195 SDValue Imm = Node->getOperand(2);
4196 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4197 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4198
4199 // Try to fold a load. No need to check alignment.
4200 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4201 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4202 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4203 N1.getOperand(0) };
4204 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4205 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4206 // Update the chain.
4207 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4208 // Record the mem-refs
4209 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4210 return CNode;
4211 }
4212
4213 SDValue Ops[] = { N0, N1, Imm };
4214 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4215 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4216 return CNode;
4217}
4218
4219// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4220// to emit a second instruction after this one. This is needed since we have two
4221// copyToReg nodes glued before this and we need to continue that glue through.
4222MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4223 bool MayFoldLoad, const SDLoc &dl,
4224 MVT VT, SDNode *Node,
4225 SDValue &InGlue) {
4226 SDValue N0 = Node->getOperand(0);
4227 SDValue N2 = Node->getOperand(2);
4228 SDValue Imm = Node->getOperand(4);
4229 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4230 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4231
4232 // Try to fold a load. No need to check alignment.
4233 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4234 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4235 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4236 N2.getOperand(0), InGlue };
4237 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4238 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4239 InGlue = SDValue(CNode, 3);
4240 // Update the chain.
4241 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4242 // Record the mem-refs
4243 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4244 return CNode;
4245 }
4246
4247 SDValue Ops[] = { N0, N2, Imm, InGlue };
4248 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4249 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4250 InGlue = SDValue(CNode, 2);
4251 return CNode;
4252}
4253
4254bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4255 EVT VT = N->getValueType(0);
4256
4257 // Only handle scalar shifts.
4258 if (VT.isVector())
4259 return false;
4260
4261 // Narrower shifts only mask to 5 bits in hardware.
4262 unsigned Size = VT == MVT::i64 ? 64 : 32;
4263
4264 SDValue OrigShiftAmt = N->getOperand(1);
4265 SDValue ShiftAmt = OrigShiftAmt;
4266 SDLoc DL(N);
4267
4268 // Skip over a truncate of the shift amount.
4269 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4270 ShiftAmt = ShiftAmt->getOperand(0);
4271
4272 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4273 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4274
4275 SDValue NewShiftAmt;
4276 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4277 ShiftAmt->getOpcode() == ISD::XOR) {
4278 SDValue Add0 = ShiftAmt->getOperand(0);
4279 SDValue Add1 = ShiftAmt->getOperand(1);
4280 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4281 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4282 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4283 // to avoid the ADD/SUB/XOR.
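 // For example, on an i64 shift, (x shl (amt + 64)) selects the same as
 // (x shl amt) because the hardware masks the count to 6 bits anyway.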
4284 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4285 NewShiftAmt = Add0;
4286
4287 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4288 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4289 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4290 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4291 // we can replace it with a NOT. In the XOR case it may save some code
4292 // size, in the SUB case it also may save a move.
4293 assert(Add0C == nullptr || Add1C == nullptr);
4294
4295 // We can only do N-X, not X-N
4296 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4297 return false;
4298
4299 EVT OpVT = ShiftAmt.getValueType();
4300
4301 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4302 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4303 Add0C == nullptr ? Add0 : Add1, AllOnes);
4304 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4305 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4306 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4307 // -X to generate a NEG instead of a SUB of a constant.
4308 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4309 Add0C->getZExtValue() != 0) {
4310 EVT SubVT = ShiftAmt.getValueType();
4311 SDValue X;
4312 if (Add0C->getZExtValue() % Size == 0)
4313 X = Add1;
4314 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4315 Add0C->getZExtValue() % 32 == 0) {
4316 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4317 // This is mainly beneficial if we already compute (x+n*32).
4318 if (Add1.getOpcode() == ISD::TRUNCATE) {
4319 Add1 = Add1.getOperand(0);
4320 SubVT = Add1.getValueType();
4321 }
4322 if (Add0.getValueType() != SubVT) {
4323 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4324 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4325 }
4326
4327 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4328 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4329 } else
4330 return false;
4331 // Insert a negate op.
4332 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4333 // that uses it that's not a shift.
4334 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4335 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4336 NewShiftAmt = Neg;
4337
4338 // Insert these operands into a valid topological order so they can
4339 // get selected independently.
4340 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4341 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4342 } else
4343 return false;
4344 } else
4345 return false;
4346
4347 if (NewShiftAmt.getValueType() != MVT::i8) {
4348 // Need to truncate the shift amount.
4349 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4350 // Add to a correct topological ordering.
4351 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4352 }
4353
4354 // Insert a new mask to keep the shift amount legal. This should be removed
4355 // by isel patterns.
4356 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4357 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4358 // Place in a correct topological ordering.
4359 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4360
4361 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4362 NewShiftAmt);
4363 if (UpdatedNode != N) {
4364 // If we found an existing node, we should replace ourselves with that node
4365 // and wait for it to be selected after its other users.
4366 ReplaceNode(N, UpdatedNode);
4367 return true;
4368 }
4369
4370 // If the original shift amount is now dead, delete it so that we don't run
4371 // it through isel.
4372 if (OrigShiftAmt.getNode()->use_empty())
4373 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4374
4375 // Now that we've optimized the shift amount, defer to normal isel to get
4376 // load folding and legacy vs BMI2 selection without repeating it here.
4377 SelectCode(N);
4378 return true;
4379}
4380
4381bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4382 MVT NVT = N->getSimpleValueType(0);
4383 unsigned Opcode = N->getOpcode();
4384 SDLoc dl(N);
4385
4386 // For operations of the form (x << C1) op C2, check if we can use a smaller
4387 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
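 // For example, (x << 8) | 0x1100 becomes (x | 0x11) << 8, so the constant
 // fits in a sign-extended 8-bit immediate instead of a 32-bit one.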
4388 SDValue Shift = N->getOperand(0);
4389 SDValue N1 = N->getOperand(1);
4390
4391 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4392 if (!Cst)
4393 return false;
4394
4395 int64_t Val = Cst->getSExtValue();
4396
4397 // If we have an any_extend feeding the AND, look through it to see if there
4398 // is a shift behind it. But only if the AND doesn't use the extended bits.
4399 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4400 bool FoundAnyExtend = false;
4401 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4402 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4403 isUInt<32>(Val)) {
4404 FoundAnyExtend = true;
4405 Shift = Shift.getOperand(0);
4406 }
4407
4408 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4409 return false;
4410
4411 // i8 is unshrinkable, i16 should be promoted to i32.
4412 if (NVT != MVT::i32 && NVT != MVT::i64)
4413 return false;
4414
4415 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4416 if (!ShlCst)
4417 return false;
4418
4419 uint64_t ShAmt = ShlCst->getZExtValue();
4420
4421 // Make sure that we don't change the operation by removing bits.
4422 // This only matters for OR and XOR, AND is unaffected.
4423 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4424 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4425 return false;
4426
4427 // Check the minimum bitwidth for the new constant.
4428 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4429 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4430 if (Opcode == ISD::AND) {
4431 // AND32ri is the same as AND64ri32 with zext imm.
4432 // Try this before sign extended immediates below.
4433 ShiftedVal = (uint64_t)Val >> ShAmt;
4434 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4435 return true;
4436 // Also swap order when the AND can become MOVZX.
4437 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4438 return true;
4439 }
4440 ShiftedVal = Val >> ShAmt;
4441 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4442 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4443 return true;
4444 if (Opcode != ISD::AND) {
4445 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4446 ShiftedVal = (uint64_t)Val >> ShAmt;
4447 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4448 return true;
4449 }
4450 return false;
4451 };
4452
4453 int64_t ShiftedVal;
4454 if (!CanShrinkImmediate(ShiftedVal))
4455 return false;
4456
4457 // Ok, we can reorder to get a smaller immediate.
4458
4459 // But it's possible the original immediate allowed an AND to become MOVZX.
4460 // We do this late in order to delay the MaskedValueIsZero call as long as
4461 // possible.
4462 if (Opcode == ISD::AND) {
4463 // Find the smallest zext this could possibly be.
4464 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4465 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4466
4467 // Figure out which bits need to be zero to achieve that mask.
4468 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4469 ZExtWidth);
4470 NeededMask &= ~Cst->getAPIntValue();
4471
4472 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4473 return false;
4474 }
4475
4476 SDValue X = Shift.getOperand(0);
4477 if (FoundAnyExtend) {
4478 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4479 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4480 X = NewX;
4481 }
4482
4483 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
4484 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4485 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4486 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4487 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4488 Shift.getOperand(1));
4489 ReplaceNode(N, NewSHL.getNode());
4490 SelectCode(NewSHL.getNode());
4491 return true;
4492}
4493
4494bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4495 SDNode *ParentB, SDNode *ParentC,
4496 SDValue A, SDValue B, SDValue C,
4497 uint8_t Imm) {
4498 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4499 C.isOperandOf(ParentC) && "Incorrect parent node");
4500
4501 auto tryFoldLoadOrBCast =
4502 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4503 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4504 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4505 return true;
4506
4507 // Not a load, check for broadcast which may be behind a bitcast.
4508 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4509 P = L.getNode();
4510 L = L.getOperand(0);
4511 }
4512
4513 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4514 return false;
4515
4516 // Only 32 and 64 bit broadcasts are supported.
4517 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4518 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4519 if (Size != 32 && Size != 64)
4520 return false;
4521
4522 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4523 };
4524
4525 bool FoldedLoad = false;
4526 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4527 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4528 FoldedLoad = true;
4529 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4530 Tmp4)) {
4531 FoldedLoad = true;
4532 std::swap(A, C);
4533 // Swap bits 1/4 and 3/6.
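 // The immediate is an 8-entry truth table indexed by (A << 2) | (B << 1) | C,
 // so exchanging A and C swaps the table entries whose A and C index bits
 // differ: 0b001 <-> 0b100 and 0b011 <-> 0b110.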
4534 uint8_t OldImm = Imm;
4535 Imm = OldImm & 0xa5;
4536 if (OldImm & 0x02) Imm |= 0x10;
4537 if (OldImm & 0x10) Imm |= 0x02;
4538 if (OldImm & 0x08) Imm |= 0x40;
4539 if (OldImm & 0x40) Imm |= 0x08;
4540 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4541 Tmp4)) {
4542 FoldedLoad = true;
4543 std::swap(B, C);
4544 // Swap bits 1/2 and 5/6.
4545 uint8_t OldImm = Imm;
4546 Imm = OldImm & 0x99;
4547 if (OldImm & 0x02) Imm |= 0x04;
4548 if (OldImm & 0x04) Imm |= 0x02;
4549 if (OldImm & 0x20) Imm |= 0x40;
4550 if (OldImm & 0x40) Imm |= 0x20;
4551 }
4552
4553 SDLoc DL(Root);
4554
4555 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4556
4557 MVT NVT = Root->getSimpleValueType(0);
4558
4559 MachineSDNode *MNode;
4560 if (FoldedLoad) {
4561 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4562
4563 unsigned Opc;
4564 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4565 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4566 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4567 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4568
4569 bool UseD = EltSize == 32;
4570 if (NVT.is128BitVector())
4571 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4572 else if (NVT.is256BitVector())
4573 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4574 else if (NVT.is512BitVector())
4575 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4576 else
4577 llvm_unreachable("Unexpected vector size!");
4578 } else {
4579 bool UseD = NVT.getVectorElementType() == MVT::i32;
4580 if (NVT.is128BitVector())
4581 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4582 else if (NVT.is256BitVector())
4583 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4584 else if (NVT.is512BitVector())
4585 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4586 else
4587 llvm_unreachable("Unexpected vector size!");
4588 }
4589
4590 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4591 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4592
4593 // Update the chain.
4594 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4595 // Record the mem-refs
4596 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4597 } else {
4598 bool UseD = NVT.getVectorElementType() == MVT::i32;
4599 unsigned Opc;
4600 if (NVT.is128BitVector())
4601 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4602 else if (NVT.is256BitVector())
4603 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4604 else if (NVT.is512BitVector())
4605 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4606 else
4607 llvm_unreachable("Unexpected vector size!");
4608
4609 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4610 }
4611
4612 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4613 CurDAG->RemoveDeadNode(Root);
4614 return true;
4615}
4616
4617// Try to match two logic ops to a VPTERNLOG.
4618// FIXME: Handle more complex patterns that use an operand more than once?
4619bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4620 MVT NVT = N->getSimpleValueType(0);
4621
4622 // Make sure we support VPTERNLOG.
4623 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4624 NVT.getVectorElementType() == MVT::i1)
4625 return false;
4626
4627 // We need VLX for 128/256-bit.
4628 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4629 return false;
4630
4631 SDValue N0 = N->getOperand(0);
4632 SDValue N1 = N->getOperand(1);
4633
4634 auto getFoldableLogicOp = [](SDValue Op) {
4635 // Peek through single use bitcast.
4636 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4637 Op = Op.getOperand(0);
4638
4639 if (!Op.hasOneUse())
4640 return SDValue();
4641
4642 unsigned Opc = Op.getOpcode();
4643 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4644 Opc == X86ISD::ANDNP)
4645 return Op;
4646
4647 return SDValue();
4648 };
4649
4650 SDValue A, FoldableOp;
4651 if ((FoldableOp = getFoldableLogicOp(N1))) {
4652 A = N0;
4653 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4654 A = N1;
4655 } else
4656 return false;
4657
4658 SDValue B = FoldableOp.getOperand(0);
4659 SDValue C = FoldableOp.getOperand(1);
4660 SDNode *ParentA = N;
4661 SDNode *ParentB = FoldableOp.getNode();
4662 SDNode *ParentC = FoldableOp.getNode();
4663
4664 // We can build the appropriate control immediate by performing the logic
4665 // operation we're matching using these constants for A, B, and C.
4666 uint8_t TernlogMagicA = 0xf0;
4667 uint8_t TernlogMagicB = 0xcc;
4668 uint8_t TernlogMagicC = 0xaa;
4669
4670 // Some of the inputs may be inverted, peek through them and invert the
4671 // magic values accordingly.
4672 // TODO: There may be a bitcast before the xor that we should peek through.
4673 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4674 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4675 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4676 Magic = ~Magic;
4677 Parent = Op.getNode();
4678 Op = Op.getOperand(0);
4679 }
4680 };
4681
4682 PeekThroughNot(A, ParentA, TernlogMagicA);
4683 PeekThroughNot(B, ParentB, TernlogMagicB);
4684 PeekThroughNot(C, ParentC, TernlogMagicC);
4685
4686 uint8_t Imm;
4687 switch (FoldableOp.getOpcode()) {
4688 default: llvm_unreachable("Unexpected opcode!");
4689 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4690 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4691 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4692 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4693 }
4694
4695 switch (N->getOpcode()) {
4696 default: llvm_unreachable("Unexpected opcode!");
4697 case X86ISD::ANDNP:
4698 if (A == N0)
4699 Imm &= ~TernlogMagicA;
4700 else
4701 Imm = ~(Imm) & TernlogMagicA;
4702 break;
4703 case ISD::AND: Imm &= TernlogMagicA; break;
4704 case ISD::OR: Imm |= TernlogMagicA; break;
4705 case ISD::XOR: Imm ^= TernlogMagicA; break;
4706 }
4707
4708 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4709}
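As a worked example of the immediate computation above (an illustrative sketch with a hypothetical helper name, not code from this file): for a root of the form (xor (and B, C), A), the magic constants combine as (0xcc & 0xaa) ^ 0xf0 = 0x78.

// Sketch only: mirrors how tryVPTERNLOG derives the VPTERNLOG immediate from
// the magic constants when the foldable op is an AND and the root is an XOR.
namespace {
constexpr uint8_t ternlogImmForXorOfAnd(uint8_t MagicA, uint8_t MagicB,
                                        uint8_t MagicC) {
  return static_cast<uint8_t>((MagicB & MagicC) ^ MagicA);
}
static_assert(ternlogImmForXorOfAnd(0xf0, 0xcc, 0xaa) == 0x78,
              "(xor (and B, C), A) selects VPTERNLOG with immediate 0x78");
} // end anonymous namespace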
4710
4711/// If the high bits of an 'and' operand are known zero, try setting the
4712/// high bits of an 'and' constant operand to produce a smaller encoding by
4713/// creating a small, sign-extended negative immediate rather than a large
4714/// positive one. This reverses a transform in SimplifyDemandedBits that
4715/// shrinks mask constants by clearing bits. There is also a possibility that
4716/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4717/// case, just replace the 'and'. Return 'true' if the node is replaced.
4718bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4719 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4720 // have immediate operands.
4721 MVT VT = And->getSimpleValueType(0);
4722 if (VT != MVT::i32 && VT != MVT::i64)
4723 return false;
4724
4725 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4726 if (!And1C)
4727 return false;
4728
4729 // Bail out if the mask constant is already negative. It can't shrink any more.
4730 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4731 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4732 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4733 // are negative too.
4734 APInt MaskVal = And1C->getAPIntValue();
4735 unsigned MaskLZ = MaskVal.countl_zero();
4736 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4737 return false;
4738
4739 // Don't extend into the upper 32 bits of a 64 bit mask.
4740 if (VT == MVT::i64 && MaskLZ >= 32) {
4741 MaskLZ -= 32;
4742 MaskVal = MaskVal.trunc(32);
4743 }
4744
4745 SDValue And0 = And->getOperand(0);
4746 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4747 APInt NegMaskVal = MaskVal | HighZeros;
4748
4749 // If a negative constant would not allow a smaller encoding, there's no need
4750 // to continue. Only change the constant when we know it's a win.
4751 unsigned MinWidth = NegMaskVal.getSignificantBits();
4752 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4753 return false;
4754
4755 // Extend masks if we truncated above.
4756 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4757 NegMaskVal = NegMaskVal.zext(64);
4758 HighZeros = HighZeros.zext(64);
4759 }
4760
4761 // The variable operand must be all zeros in the top bits to allow using the
4762 // new, negative constant as the mask.
4763 if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
4764 return false;
4765
4766 // Check if the mask is -1. In that case, this is an unnecessary instruction
4767 // that escaped earlier analysis.
4768 if (NegMaskVal.isAllOnes()) {
4769 ReplaceNode(And, And0.getNode());
4770 return true;
4771 }
4772
4773 // A negative mask allows a smaller encoding. Create a new 'and' node.
4774 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4775 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4776 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4777 ReplaceNode(And, NewAnd.getNode());
4778 SelectCode(NewAnd.getNode());
4779 return true;
4780}
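A minimal sketch of the mask-widening step in shrinkAndImmediate (the helper name is hypothetical): a 32-bit mask 0x0ffffff0 whose top four bits are known zero in the other operand becomes 0xfffffff0, i.e. -16, which sign-extends from an 8-bit immediate.

// Sketch only: setting the known-zero high bits of the mask yields an
// equivalent mask with a shorter sign-extended encoding. Assumes
// 0 < LeadingZeros < 32, as guaranteed by the early exits above.
namespace {
constexpr uint32_t widenAndMaskSketch(uint32_t Mask, unsigned LeadingZeros) {
  // Equivalent to MaskVal | APInt::getHighBitsSet(32, LeadingZeros).
  return Mask | ~(~uint32_t(0) >> LeadingZeros);
}
static_assert(widenAndMaskSketch(0x0ffffff0u, 4) == 0xfffffff0u,
              "0x0ffffff0 widens to -16, which fits in an imm8");
} // end anonymous namespace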
4781
4782static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4783 bool FoldedBCast, bool Masked) {
4784#define VPTESTM_CASE(VT, SUFFIX) \
4785case MVT::VT: \
4786 if (Masked) \
4787 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4788 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4789
4790
4791#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4792default: llvm_unreachable("Unexpected VT!"); \
4793VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4794VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4795VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4796VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4797VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4798VPTESTM_CASE(v8i64, QZ##SUFFIX)
4799
4800#define VPTESTM_FULL_CASES(SUFFIX) \
4801VPTESTM_BROADCAST_CASES(SUFFIX) \
4802VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4803VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4804VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4805VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4806VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4807VPTESTM_CASE(v32i16, WZ##SUFFIX)
4808
4809 if (FoldedBCast) {
4810 switch (TestVT.SimpleTy) {
4811 VPTESTM_BROADCAST_CASES(rmb)
4812 }
4813 }
4814
4815 if (FoldedLoad) {
4816 switch (TestVT.SimpleTy) {
4817 VPTESTM_FULL_CASES(rm)
4818 }
4819 }
4820
4821 switch (TestVT.SimpleTy) {
4822 VPTESTM_FULL_CASES(rr)
4823 }
4824
4825#undef VPTESTM_FULL_CASES
4826#undef VPTESTM_BROADCAST_CASES
4827#undef VPTESTM_CASE
4828}
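For reference, a sketch of what one of the macro invocations above expands to (shown as a comment; the expansion is mechanical token pasting):

// With SUFFIX = rr, VPTESTM_CASE(v4i32, DZ128rr) inside VPTESTM_FULL_CASES(rr)
// expands to roughly:
//   case MVT::v4i32:
//     if (Masked)
//       return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
//     return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;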
4829
4830// Try to create VPTESTM instruction. If InMask is not null, it will be used
4831// to form a masked operation.
4832bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4833 SDValue InMask) {
4834 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4835 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4836 "Unexpected VT!");
4837
4838 // Look for equal and not equal compares.
4839 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4840 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4841 return false;
4842
4843 SDValue SetccOp0 = Setcc.getOperand(0);
4844 SDValue SetccOp1 = Setcc.getOperand(1);
4845
4846 // Canonicalize the all zero vector to the RHS.
4847 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4848 std::swap(SetccOp0, SetccOp1);
4849
4850 // See if we're comparing against zero.
4851 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4852 return false;
4853
4854 SDValue N0 = SetccOp0;
4855
4856 MVT CmpVT = N0.getSimpleValueType();
4857 MVT CmpSVT = CmpVT.getVectorElementType();
4858
4859 // Start with both operands the same. We'll try to refine this.
4860 SDValue Src0 = N0;
4861 SDValue Src1 = N0;
4862
4863 {
4864 // Look through single use bitcasts.
4865 SDValue N0Temp = N0;
4866 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4867 N0Temp = N0.getOperand(0);
4868
4869 // Look for single use AND.
4870 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4871 Src0 = N0Temp.getOperand(0);
4872 Src1 = N0Temp.getOperand(1);
4873 }
4874 }
4875
4876 // Without VLX we need to widen the operation.
4877 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4878
4879 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4880 SDValue &Base, SDValue &Scale, SDValue &Index,
4881 SDValue &Disp, SDValue &Segment) {
4882 // If we need to widen, we can't fold the load.
4883 if (!Widen)
4884 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4885 return true;
4886
4887 // If we didn't fold a load, try to match a broadcast instead; no widening
4888 // limitation applies, but only 32- and 64-bit element types are supported.
4889 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4890 return false;
4891
4892 // Look through single use bitcasts.
4893 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4894 P = L.getNode();
4895 L = L.getOperand(0);
4896 }
4897
4898 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4899 return false;
4900
4901 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4902 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4903 return false;
4904
4905 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4906 };
4907
4908 // We can only fold loads if the sources are unique.
4909 bool CanFoldLoads = Src0 != Src1;
4910
4911 bool FoldedLoad = false;
4912 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4913 if (CanFoldLoads) {
4914 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4915 Tmp3, Tmp4);
4916 if (!FoldedLoad) {
4917 // And is commutative.
4918 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4919 Tmp2, Tmp3, Tmp4);
4920 if (FoldedLoad)
4921 std::swap(Src0, Src1);
4922 }
4923 }
4924
4925 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4926
4927 bool IsMasked = InMask.getNode() != nullptr;
4928
4929 SDLoc dl(Root);
4930
4931 MVT ResVT = Setcc.getSimpleValueType();
4932 MVT MaskVT = ResVT;
4933 if (Widen) {
4934 // Widen the inputs using insert_subreg or copy_to_regclass.
4935 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4936 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4937 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4938 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4939 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4940 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4941 CmpVT), 0);
4942 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4943
4944 if (!FoldedBCast)
4945 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4946
4947 if (IsMasked) {
4948 // Widen the mask.
4949 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
4950 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4951 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4952 dl, MaskVT, InMask, RC), 0);
4953 }
4954 }
4955
4956 bool IsTestN = CC == ISD::SETEQ;
4957 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4958 IsMasked);
4959
4960 MachineSDNode *CNode;
4961 if (FoldedLoad) {
4962 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4963
4964 if (IsMasked) {
4965 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4966 Src1.getOperand(0) };
4967 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4968 } else {
4969 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4970 Src1.getOperand(0) };
4971 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4972 }
4973
4974 // Update the chain.
4975 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
4976 // Record the mem-refs
4977 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
4978 } else {
4979 if (IsMasked)
4980 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4981 else
4982 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4983 }
4984
4985 // If we widened, we need to shrink the mask VT.
4986 if (Widen) {
4987 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
4988 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4989 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4990 dl, ResVT, SDValue(CNode, 0), RC);
4991 }
4992
4993 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4994 CurDAG->RemoveDeadNode(Root);
4995 return true;
4996}
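To make the widening arithmetic in tryVPTESTM concrete (a sketch with a hypothetical helper; the numbers follow directly from the code above): without VLX a 128-bit compare is scaled by 4 and a 256-bit compare by 2, so both end up as 512-bit operations whose i1 mask is shrunk back afterwards.

// Sketch only: the element-count arithmetic used when widening a VLX-less
// compare up to 512 bits before forming VPTESTM/VPTESTNM.
namespace {
constexpr unsigned widenedEltCountSketch(unsigned NumElts, bool Is128Bit) {
  return NumElts * (Is128Bit ? 4 : 2);
}
static_assert(widenedEltCountSketch(4, /*Is128Bit=*/true) == 16,
              "a v4i32 compare widens to v16i32 (mask v16i1)");
static_assert(widenedEltCountSketch(8, /*Is128Bit=*/false) == 16,
              "a v8i32 compare also widens to v16i32");
} // end anonymous namespace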
4997
4998// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4999// into vpternlog.
5000bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5001 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5002
5003 MVT NVT = N->getSimpleValueType(0);
5004
5005 // Make sure we support VPTERNLOG.
5006 if (!NVT.isVector() || !Subtarget->hasAVX512())
5007 return false;
5008
5009 // We need VLX for 128/256-bit.
5010 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5011 return false;
5012
5013 SDValue N0 = N->getOperand(0);
5014 SDValue N1 = N->getOperand(1);
5015
5016 // Canonicalize AND to LHS.
5017 if (N1.getOpcode() == ISD::AND)
5018 std::swap(N0, N1);
5019
5020 if (N0.getOpcode() != ISD::AND ||
5021 N1.getOpcode() != X86ISD::ANDNP ||
5022 !N0.hasOneUse() || !N1.hasOneUse())
5023 return false;
5024
5025 // ANDN is not commutable, so use it to pin down A and C.
5026 SDValue A = N1.getOperand(0);
5027 SDValue C = N1.getOperand(1);
5028
5029 // AND is commutable, if one operand matches A, the other operand is B.
5030 // Otherwise this isn't a match.
5031 SDValue B;
5032 if (N0.getOperand(0) == A)
5033 B = N0.getOperand(1);
5034 else if (N0.getOperand(1) == A)
5035 B = N0.getOperand(0);
5036 else
5037 return false;
5038
5039 SDLoc dl(N);
5040 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5041 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5042 ReplaceNode(N, Ternlog.getNode());
5043
5044 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5045 Ternlog.getNode(), A, B, C, 0xCA);
5046}
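The 0xCA immediate used above is simply the bitselect expression evaluated on the same magic constants that tryVPTERNLOG uses; a one-line check (illustrative only):

// Sketch only: 0xCA is the truth table of (or (and A, B), (andn A, C)).
static_assert(((0xf0 & 0xcc) | (~0xf0 & 0xaa)) == 0xca,
              "bitselect on the ternlog magic constants gives 0xCA");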
5047
5048void X86DAGToDAGISel::Select(SDNode *Node) {
5049 MVT NVT = Node->getSimpleValueType(0);
5050 unsigned Opcode = Node->getOpcode();
5051 SDLoc dl(Node);
5052
5053 if (Node->isMachineOpcode()) {
5054 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5055 Node->setNodeId(-1);
5056 return; // Already selected.
5057 }
5058
5059 switch (Opcode) {
5060 default: break;
5061 case ISD::INTRINSIC_W_CHAIN: {
5062 unsigned IntNo = Node->getConstantOperandVal(1);
5063 switch (IntNo) {
5064 default: break;
5065 case Intrinsic::x86_encodekey128:
5066 case Intrinsic::x86_encodekey256: {
5067 if (!Subtarget->hasKL())
5068 break;
5069
5070 unsigned Opcode;
5071 switch (IntNo) {
5072 default: llvm_unreachable("Impossible intrinsic");
5073 case Intrinsic::x86_encodekey128:
5074 Opcode = X86::ENCODEKEY128;
5075 break;
5076 case Intrinsic::x86_encodekey256:
5077 Opcode = X86::ENCODEKEY256;
5078 break;
5079 }
5080
5081 SDValue Chain = Node->getOperand(0);
5082 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5083 SDValue());
5084 if (Opcode == X86::ENCODEKEY256)
5085 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5086 Chain.getValue(1));
5087
5088 MachineSDNode *Res = CurDAG->getMachineNode(
5089 Opcode, dl, Node->getVTList(),
5090 {Node->getOperand(2), Chain, Chain.getValue(1)});
5091 ReplaceNode(Node, Res);
5092 return;
5093 }
5094 case Intrinsic::x86_tileloadd64_internal:
5095 case Intrinsic::x86_tileloaddt164_internal: {
5096 if (!Subtarget->hasAMXTILE())
5097 break;
5098 unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
5099 ? X86::PTILELOADDV
5100 : X86::PTILELOADDT1V;
5101 // _tile_loadd_internal(row, col, buf, STRIDE)
5102 SDValue Base = Node->getOperand(4);
5103 SDValue Scale = getI8Imm(1, dl);
5104 SDValue Index = Node->getOperand(5);
5105 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5106 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5107 SDValue Chain = Node->getOperand(0);
5108 MachineSDNode *CNode;
5109 SDValue Ops[] = {Node->getOperand(2),
5110 Node->getOperand(3),
5111 Base,
5112 Scale,
5113 Index,
5114 Disp,
5115 Segment,
5116 Chain};
5117 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5118 ReplaceNode(Node, CNode);
5119 return;
5120 }
5121 }
5122 break;
5123 }
5124 case ISD::INTRINSIC_VOID: {
5125 unsigned IntNo = Node->getConstantOperandVal(1);
5126 switch (IntNo) {
5127 default: break;
5128 case Intrinsic::x86_sse3_monitor:
5129 case Intrinsic::x86_monitorx:
5130 case Intrinsic::x86_clzero: {
5131 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5132
5133 unsigned Opc = 0;
5134 switch (IntNo) {
5135 default: llvm_unreachable("Unexpected intrinsic!");
5136 case Intrinsic::x86_sse3_monitor:
5137 if (!Subtarget->hasSSE3())
5138 break;
5139 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5140 break;
5141 case Intrinsic::x86_monitorx:
5142 if (!Subtarget->hasMWAITX())
5143 break;
5144 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5145 break;
5146 case Intrinsic::x86_clzero:
5147 if (!Subtarget->hasCLZERO())
5148 break;
5149 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5150 break;
5151 }
5152
5153 if (Opc) {
5154 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5155 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5156 Node->getOperand(2), SDValue());
5157 SDValue InGlue = Chain.getValue(1);
5158
5159 if (IntNo == Intrinsic::x86_sse3_monitor ||
5160 IntNo == Intrinsic::x86_monitorx) {
5161 // Copy the other two operands to ECX and EDX.
5162 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5163 InGlue);
5164 InGlue = Chain.getValue(1);
5165 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5166 InGlue);
5167 InGlue = Chain.getValue(1);
5168 }
5169
5170 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5171 { Chain, InGlue});
5172 ReplaceNode(Node, CNode);
5173 return;
5174 }
5175
5176 break;
5177 }
5178 case Intrinsic::x86_tilestored64_internal: {
5179 unsigned Opc = X86::PTILESTOREDV;
5180 // _tile_stored_internal(row, col, buf, STRIDE, c)
5181 SDValue Base = Node->getOperand(4);
5182 SDValue Scale = getI8Imm(1, dl);
5183 SDValue Index = Node->getOperand(5);
5184 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5185 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5186 SDValue Chain = Node->getOperand(0);
5187 MachineSDNode *CNode;
5188 SDValue Ops[] = {Node->getOperand(2),
5189 Node->getOperand(3),
5190 Base,
5191 Scale,
5192 Index,
5193 Disp,
5194 Segment,
5195 Node->getOperand(6),
5196 Chain};
5197 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5198 ReplaceNode(Node, CNode);
5199 return;
5200 }
5201 case Intrinsic::x86_tileloadd64:
5202 case Intrinsic::x86_tileloaddt164:
5203 case Intrinsic::x86_tilestored64: {
5204 if (!Subtarget->hasAMXTILE())
5205 break;
5206 unsigned Opc;
5207 switch (IntNo) {
5208 default: llvm_unreachable("Unexpected intrinsic!");
5209 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5210 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5211 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5212 }
5213 // FIXME: Match displacement and scale.
5214 unsigned TIndex = Node->getConstantOperandVal(2);
5215 SDValue TReg = getI8Imm(TIndex, dl);
5216 SDValue Base = Node->getOperand(3);
5217 SDValue Scale = getI8Imm(1, dl);
5218 SDValue Index = Node->getOperand(4);
5219 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5220 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5221 SDValue Chain = Node->getOperand(0);
5222 MachineSDNode *CNode;
5223 if (Opc == X86::PTILESTORED) {
5224 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5225 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5226 } else {
5227 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5228 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5229 }
5230 ReplaceNode(Node, CNode);
5231 return;
5232 }
5233 }
5234 break;
5235 }
5236 case ISD::BRIND:
5237 case X86ISD::NT_BRIND: {
5238 if (Subtarget->isTargetNaCl())
5239 // NaCl has its own pass where jmp %r32 instructions are converted to
5240 // jmp %r64. We leave the instruction alone.
5241 break;
5242 if (Subtarget->isTarget64BitILP32()) {
5243 // Converts a 32-bit register to a 64-bit, zero-extended version of
5244 // it. This is needed because x86-64 can do many things, but jmp %r32
5245 // ain't one of them.
5246 SDValue Target = Node->getOperand(1);
5247 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5248 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5249 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5250 Node->getOperand(0), ZextTarget);
5251 ReplaceNode(Node, Brind.getNode());
5252 SelectCode(ZextTarget.getNode());
5253 SelectCode(Brind.getNode());
5254 return;
5255 }
5256 break;
5257 }
5258 case X86ISD::GlobalBaseReg:
5259 ReplaceNode(Node, getGlobalBaseReg());
5260 return;
5261
5262 case ISD::BITCAST:
5263 // Just drop all 128/256/512-bit bitcasts.
5264 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5265 NVT == MVT::f128) {
5266 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5267 CurDAG->RemoveDeadNode(Node);
5268 return;
5269 }
5270 break;
5271
5272 case ISD::SRL:
5273 if (matchBitExtract(Node))
5274 return;
5275 [[fallthrough]];
5276 case ISD::SRA:
5277 case ISD::SHL:
5278 if (tryShiftAmountMod(Node))
5279 return;
5280 break;
5281
5282 case X86ISD::VPTERNLOG: {
5283 uint8_t Imm = Node->getConstantOperandVal(3);
5284 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5285 Node->getOperand(1), Node->getOperand(2), Imm))
5286 return;
5287 break;
5288 }
5289
5290 case X86ISD::ANDNP:
5291 if (tryVPTERNLOG(Node))
5292 return;
5293 break;
5294
5295 case ISD::AND:
5296 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5297 // Try to form a masked VPTESTM. Operands can be in either order.
5298 SDValue N0 = Node->getOperand(0);
5299 SDValue N1 = Node->getOperand(1);
5300 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5301 tryVPTESTM(Node, N0, N1))
5302 return;
5303 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5304 tryVPTESTM(Node, N1, N0))
5305 return;
5306 }
5307
5308 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5309 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5310 CurDAG->RemoveDeadNode(Node);
5311 return;
5312 }
5313 if (matchBitExtract(Node))
5314 return;
5315 if (AndImmShrink && shrinkAndImmediate(Node))
5316 return;
5317
5318 [[fallthrough]];
5319 case ISD::OR:
5320 case ISD::XOR:
5321 if (tryShrinkShlLogicImm(Node))
5322 return;
5323 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5324 return;
5325 if (tryVPTERNLOG(Node))
5326 return;
5327
5328 [[fallthrough]];
5329 case ISD::ADD:
5330 if (Opcode == ISD::ADD && matchBitExtract(Node))
5331 return;
5332 [[fallthrough]];
5333 case ISD::SUB: {
5334 // Try to avoid folding immediates with multiple uses for optsize.
5335 // This code tries to select the register form directly to avoid going
5336 // through the isel table, which might fold the immediate. We can't change
5337 // the add/sub/and/or/xor with-immediate patterns in the tablegen files to
5338 // check the immediate's use count without making those patterns
5339 // unavailable to the fast-isel table.
5340 if (!CurDAG->shouldOptForSize())
5341 break;
5342
5343 // Only handle i8/i16/i32/i64.
5344 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5345 break;
5346
5347 SDValue N0 = Node->getOperand(0);
5348 SDValue N1 = Node->getOperand(1);
5349
5350 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5351 if (!Cst)
5352 break;
5353
5354 int64_t Val = Cst->getSExtValue();
5355
5356 // Make sure it's an immediate that is considered foldable.
5357 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5358 if (!isInt<8>(Val) && !isInt<32>(Val))
5359 break;
5360
5361 // If this can match to INC/DEC, let it go.
5362 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5363 break;
5364
5365 // Check if we should avoid folding this immediate.
5366 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5367 break;
5368
5369 // We should not fold the immediate. So we need a register form instead.
5370 unsigned ROpc, MOpc;
5371 switch (NVT.SimpleTy) {
5372 default: llvm_unreachable("Unexpected VT!");
5373 case MVT::i8:
5374 switch (Opcode) {
5375 default: llvm_unreachable("Unexpected opcode!");
5376 case ISD::ADD:
5377 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5378 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5379 break;
5380 case ISD::SUB:
5381 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5382 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5383 break;
5384 case ISD::AND:
5385 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5386 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5387 break;
5388 case ISD::OR:
5389 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5390 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5391 break;
5392 case ISD::XOR:
5393 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5394 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5395 break;
5396 }
5397 break;
5398 case MVT::i16:
5399 switch (Opcode) {
5400 default: llvm_unreachable("Unexpected opcode!");
5401 case ISD::ADD:
5402 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5403 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5404 break;
5405 case ISD::SUB:
5406 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5407 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5408 break;
5409 case ISD::AND:
5410 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5411 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5412 break;
5413 case ISD::OR:
5414 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5415 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5416 break;
5417 case ISD::XOR:
5418 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5419 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5420 break;
5421 }
5422 break;
5423 case MVT::i32:
5424 switch (Opcode) {
5425 default: llvm_unreachable("Unexpected opcode!");
5426 case ISD::ADD:
5427 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5428 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5429 break;
5430 case ISD::SUB:
5431 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5432 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5433 break;
5434 case ISD::AND:
5435 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5436 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5437 break;
5438 case ISD::OR:
5439 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5440 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5441 break;
5442 case ISD::XOR:
5443 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5444 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5445 break;
5446 }
5447 break;
5448 case MVT::i64:
5449 switch (Opcode) {
5450 default: llvm_unreachable("Unexpected opcode!");
5451 case ISD::ADD:
5452 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5453 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5454 break;
5455 case ISD::SUB:
5456 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5457 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5458 break;
5459 case ISD::AND:
5460 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5461 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5462 break;
5463 case ISD::OR:
5464 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5465 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5466 break;
5467 case ISD::XOR:
5468 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5469 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5470 break;
5471 }
5472 break;
5473 }
5474
5475 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5476
5477 // If this is not a subtract, we can still try to fold a load.
5478 if (Opcode != ISD::SUB) {
5479 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5480 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5481 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5482 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5483 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5484 // Update the chain.
5485 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5486 // Record the mem-refs
5487 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5488 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5489 CurDAG->RemoveDeadNode(Node);
5490 return;
5491 }
5492 }
5493
5494 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5495 return;
5496 }
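For example (an illustrative note, not additional selector logic): under optsize, an add of a 32-bit constant whose ConstantSDNode also feeds other instructions is selected here in its register form (e.g. ADD32rr) against the separately materialized constant, so the 4-byte immediate is encoded once rather than at every use.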
5497
5498 case X86ISD::SMUL:
5499 // i16/i32/i64 are handled with isel patterns.
5500 if (NVT != MVT::i8)
5501 break;
5502 [[fallthrough]];
5503 case X86ISD::UMUL: {
5504 SDValue N0 = Node->getOperand(0);
5505 SDValue N1 = Node->getOperand(1);
5506
5507 unsigned LoReg, ROpc, MOpc;
5508 switch (NVT.SimpleTy) {
5509 default: llvm_unreachable("Unsupported VT!");
5510 case MVT::i8:
5511 LoReg = X86::AL;
5512 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5513 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5514 break;
5515 case MVT::i16:
5516 LoReg = X86::AX;
5517 ROpc = X86::MUL16r;
5518 MOpc = X86::MUL16m;
5519 break;
5520 case MVT::i32:
5521 LoReg = X86::EAX;
5522 ROpc = X86::MUL32r;
5523 MOpc = X86::MUL32m;
5524 break;
5525 case MVT::i64:
5526 LoReg = X86::RAX;
5527 ROpc = X86::MUL64r;
5528 MOpc = X86::MUL64m;
5529 break;
5530 }
5531
5532 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5533 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5534 // Multiply is commutative.
5535 if (!FoldedLoad) {
5536 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5537 if (FoldedLoad)
5538 std::swap(N0, N1);
5539 }
5540
5541 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5542 N0, SDValue()).getValue(1);
5543
5544 MachineSDNode *CNode;
5545 if (FoldedLoad) {
5546 // i16/i32/i64 use an instruction that produces a low and high result even
5547 // though only the low result is used.
5548 SDVTList VTs;
5549 if (NVT == MVT::i8)
5550 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5551 else
5552 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5553
5554 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5555 InGlue };
5556 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5557
5558 // Update the chain.
5559 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5560 // Record the mem-refs
5561 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5562 } else {
5563 // i16/i32/i64 use an instruction that produces a low and high result even
5564 // though only the low result is used.
5565 SDVTList VTs;
5566 if (NVT == MVT::i8)
5567 VTs = CurDAG->getVTList(NVT, MVT::i32);
5568 else
5569 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5570
5571 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5572 }
5573
5574 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5575 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5576 CurDAG->RemoveDeadNode(Node);
5577 return;
5578 }
5579
5580 case ISD::SMUL_LOHI:
5581 case ISD::UMUL_LOHI: {
5582 SDValue N0 = Node->getOperand(0);
5583 SDValue N1 = Node->getOperand(1);
5584
5585 unsigned Opc, MOpc;
5586 unsigned LoReg, HiReg;
5587 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5588 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5589 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
5590 switch (NVT.SimpleTy) {
5591 default: llvm_unreachable("Unsupported VT!");
5592 case MVT::i32:
5593 Opc = UseMULXHi ? X86::MULX32Hrr
5594 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5595 : IsSigned ? X86::IMUL32r
5596 : X86::MUL32r;
5597 MOpc = UseMULXHi ? X86::MULX32Hrm
5598 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5599 : IsSigned ? X86::IMUL32m
5600 : X86::MUL32m;
5601 LoReg = UseMULX ? X86::EDX : X86::EAX;
5602 HiReg = X86::EDX;
5603 break;
5604 case MVT::i64:
5605 Opc = UseMULXHi ? X86::MULX64Hrr
5606 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5607 : IsSigned ? X86::IMUL64r
5608 : X86::MUL64r;
5609 MOpc = UseMULXHi ? X86::MULX64Hrm
5610 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5611 : IsSigned ? X86::IMUL64m
5612 : X86::MUL64m;
5613 LoReg = UseMULX ? X86::RDX : X86::RAX;
5614 HiReg = X86::RDX;
5615 break;
5616 }
5617
5618 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5619 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5620 // Multiply is commutative.
5621 if (!foldedLoad) {
5622 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5623 if (foldedLoad)
5624 std::swap(N0, N1);
5625 }
5626
5627 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5628 N0, SDValue()).getValue(1);
5629 SDValue ResHi, ResLo;
5630 if (foldedLoad) {
5631 SDValue Chain;
5632 MachineSDNode *CNode = nullptr;
5633 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5634 InGlue };
5635 if (UseMULXHi) {
5636 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5637 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5638 ResHi = SDValue(CNode, 0);
5639 Chain = SDValue(CNode, 1);
5640 } else if (UseMULX) {
5641 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5642 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5643 ResHi = SDValue(CNode, 0);
5644 ResLo = SDValue(CNode, 1);
5645 Chain = SDValue(CNode, 2);
5646 } else {
5647 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5648 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5649 Chain = SDValue(CNode, 0);
5650 InGlue = SDValue(CNode, 1);
5651 }
5652
5653 // Update the chain.
5654 ReplaceUses(N1.getValue(1), Chain);
5655 // Record the mem-refs
5656 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5657 } else {
5658 SDValue Ops[] = { N1, InGlue };
5659 if (UseMULXHi) {
5660 SDVTList VTs = CurDAG->getVTList(NVT);
5661 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5662 ResHi = SDValue(CNode, 0);
5663 } else if (UseMULX) {
5664 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5665 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5666 ResHi = SDValue(CNode, 0);
5667 ResLo = SDValue(CNode, 1);
5668 } else {
5669 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5670 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5671 InGlue = SDValue(CNode, 0);
5672 }
5673 }
5674
5675 // Copy the low half of the result, if it is needed.
5676 if (!SDValue(Node, 0).use_empty()) {
5677 if (!ResLo) {
5678 assert(LoReg && "Register for low half is not defined!");
5679 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5680 NVT, InGlue);
5681 InGlue = ResLo.getValue(2);
5682 }
5683 ReplaceUses(SDValue(Node, 0), ResLo);
5684 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5685 dbgs() << '\n');
5686 }
5687 // Copy the high half of the result, if it is needed.
5688 if (!SDValue(Node, 1).use_empty()) {
5689 if (!ResHi) {
5690 assert(HiReg && "Register for high half is not defined!");
5691 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5692 NVT, InGlue);
5693 InGlue = ResHi.getValue(2);
5694 }
5695 ReplaceUses(SDValue(Node, 1), ResHi);
5696 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5697 dbgs() << '\n');
5698 }
5699
5700 CurDAG->RemoveDeadNode(Node);
5701 return;
5702 }
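A small worked example of the low/high split selected above (a sketch, block-scope asserts only): 0xffffffff * 0xffffffff = 0xfffffffe00000001, so the low half is 0x00000001 and the high half is 0xfffffffe, routed to EAX/EDX (or to MULX's two destinations).

// Sketch only: the 32x32->64 result that SMUL_LOHI/UMUL_LOHI selection
// splits across the low and high destination registers.
static_assert(((0xffffffffULL * 0xffffffffULL) & 0xffffffffULL) == 1,
              "low half of 0xffffffff * 0xffffffff");
static_assert(((0xffffffffULL * 0xffffffffULL) >> 32) == 0xfffffffeULL,
              "high half of 0xffffffff * 0xffffffff");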
5703
5704 case ISD::SDIVREM:
5705 case ISD::UDIVREM: {
5706 SDValue N0 = Node->getOperand(0);
5707 SDValue N1 = Node->getOperand(1);
5708
5709 unsigned ROpc, MOpc;
5710 bool isSigned = Opcode == ISD::SDIVREM;
5711 if (!isSigned) {
5712 switch (NVT.SimpleTy) {
5713 default: llvm_unreachable("Unsupported VT!");
5714 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5715 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5716 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5717 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5718 }
5719 } else {
5720 switch (NVT.SimpleTy) {
5721 default: llvm_unreachable("Unsupported VT!");
5722 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5723 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5724 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5725 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5726 }
5727 }
5728
5729 unsigned LoReg, HiReg, ClrReg;
5730 unsigned SExtOpcode;
5731 switch (NVT.SimpleTy) {
5732 default: llvm_unreachable("Unsupported VT!");
5733 case MVT::i8:
5734 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5735 SExtOpcode = 0; // Not used.
5736 break;
5737 case MVT::i16:
5738 LoReg = X86::AX; HiReg = X86::DX;
5739 ClrReg = X86::DX;
5740 SExtOpcode = X86::CWD;
5741 break;
5742 case MVT::i32:
5743 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5744 SExtOpcode = X86::CDQ;
5745 break;
5746 case MVT::i64:
5747 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5748 SExtOpcode = X86::CQO;
5749 break;
5750 }
5751
5752 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5753 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5754 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5755
5756 SDValue InGlue;
5757 if (NVT == MVT::i8) {
5758 // Special case for div8: extend the dividend into AX with a sign or zero
5759 // extension as needed, which sets up the upper 8 bits (AH).
5760 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5761 MachineSDNode *Move;
5762 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5763 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5764 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5765 : X86::MOVZX16rm8;
5766 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5767 Chain = SDValue(Move, 1);
5768 ReplaceUses(N0.getValue(1), Chain);
5769 // Record the mem-refs
5770 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5771 } else {
5772 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5773 : X86::MOVZX16rr8;
5774 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5775 Chain = CurDAG->getEntryNode();
5776 }
5777 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5778 SDValue());
5779 InGlue = Chain.getValue(1);
5780 } else {
5781 InGlue =
5782 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5783 LoReg, N0, SDValue()).getValue(1);
5784 if (isSigned && !signBitIsZero) {
5785 // Sign extend the low part into the high part.
5786 InGlue =
5787 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5788 } else {
5789 // Zero out the high part, effectively zero extending the input.
5790 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5791 SDValue ClrNode = SDValue(
5792 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
5793 switch (NVT.SimpleTy) {
5794 case MVT::i16:
5795 ClrNode =
5796 SDValue(CurDAG->getMachineNode(
5797 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5798 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5799 MVT::i32)),
5800 0);
5801 break;
5802 case MVT::i32:
5803 break;
5804 case MVT::i64:
5805 ClrNode =
5806 SDValue(CurDAG->getMachineNode(
5807 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5808 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5809 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5810 MVT::i32)),
5811 0);
5812 break;
5813 default:
5814 llvm_unreachable("Unexpected division source");
5815 }
5816
5817 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5818 ClrNode, InGlue).getValue(1);
5819 }
5820 }
5821
5822 if (foldedLoad) {
5823 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5824 InGlue };
5825 MachineSDNode *CNode =
5826 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5827 InGlue = SDValue(CNode, 1);
5828 // Update the chain.
5829 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5830 // Record the mem-refs
5831 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5832 } else {
5833 InGlue =
5834 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
5835 }
5836
5837 // Prevent use of AH in a REX instruction by explicitly copying it to
5838 // an ABCD_L register.
5839 //
5840 // The current assumption of the register allocator is that isel
5841 // won't generate explicit references to the GR8_ABCD_H registers. If
5842 // the allocator and/or the backend get enhanced to be more robust in
5843 // that regard, this can be, and should be, removed.
5844 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
5845 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
5846 unsigned AHExtOpcode =
5847 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5848
5849 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
5850 MVT::Glue, AHCopy, InGlue);
5851 SDValue Result(RNode, 0);
5852 InGlue = SDValue(RNode, 1);
5853
5854 Result =
5855 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
5856
5857 ReplaceUses(SDValue(Node, 1), Result);
5858 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5859 dbgs() << '\n');
5860 }
5861 // Copy the division (low) result, if it is needed.
5862 if (!SDValue(Node, 0).use_empty()) {
5863 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5864 LoReg, NVT, InGlue);
5865 InGlue = Result.getValue(2);
5866 ReplaceUses(SDValue(Node, 0), Result);
5867 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5868 dbgs() << '\n');
5869 }
5870 // Copy the remainder (high) result, if it is needed.
5871 if (!SDValue(Node, 1).use_empty()) {
5872 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5873 HiReg, NVT, InGlue);
5874 InGlue = Result.getValue(2);
5875 ReplaceUses(SDValue(Node, 1), Result);
5876 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5877 dbgs() << '\n');
5878 }
5879 CurDAG->RemoveDeadNode(Node);
5880 return;
5881 }
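As a concrete instance of the AL/AH pairing in the i8 path above (a sketch): dividing 200 by 7 leaves the quotient 28 in AL and the remainder 4 in AH, which is why the remainder is pulled out through the NOREX sub_8bit copy.

// Sketch only: the quotient/remainder pair DIV8r leaves in AL/AH.
static_assert(200 / 7 == 28 && 200 % 7 == 4,
              "i8 udivrem example: AL holds 28, AH holds 4");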
5882
5883 case X86ISD::FCMP:
5884 case X86ISD::STRICT_FCMP:
5885 case X86ISD::STRICT_FCMPS: {
5886 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
5887 Node->getOpcode() == X86ISD::STRICT_FCMPS;
5888 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
5889 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
5890
5891 // Save the original VT of the compare.
5892 MVT CmpVT = N0.getSimpleValueType();
5893
5894 // Floating point needs special handling if we don't have FCOMI.
5895 if (Subtarget->canUseCMOV())
5896 break;
5897
5898 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
5899
5900 unsigned Opc;
5901 switch (CmpVT.SimpleTy) {
5902 default: llvm_unreachable("Unexpected type!");
5903 case MVT::f32:
5904 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
5905 break;
5906 case MVT::f64:
5907 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
5908 break;
5909 case MVT::f80:
5910 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
5911 break;
5912 }
5913
5914 SDValue Chain =
5915 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
5916 SDValue Glue;
5917 if (IsStrictCmp) {
5918 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5919 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
5920 Glue = Chain.getValue(1);
5921 } else {
5922 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
5923 }
5924
5925 // Move FPSW to AX.
5926 SDValue FNSTSW =
5927 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
5928
5929 // Extract upper 8-bits of AX.
5930 SDValue Extract =
5931 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
5932
5933 // Move AH into flags.
5934 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
5935 assert(Subtarget->canUseLAHFSAHF() &&
5936 "Target doesn't support SAHF or FCOMI?");
5937 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
5938 Chain = AH;
5939 SDValue SAHF = SDValue(
5940 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
5941
5942 if (IsStrictCmp)
5943 ReplaceUses(SDValue(Node, 1), Chain);
5944
5945 ReplaceUses(SDValue(Node, 0), SAHF);
5946 CurDAG->RemoveDeadNode(Node);
5947 return;
5948 }
5949
5950 case X86ISD::CMP: {
5951 SDValue N0 = Node->getOperand(0);
5952 SDValue N1 = Node->getOperand(1);
5953
5954 // Optimizations for TEST compares.
5955 if (!isNullConstant(N1))
5956 break;
5957
5958 // Save the original VT of the compare.
5959 MVT CmpVT = N0.getSimpleValueType();
5960
5961 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5962 // by a test instruction. The test should be removed later by
5963 // analyzeCompare if we are using only the zero flag.
5964 // TODO: Should we check the users and use the BEXTR flags directly?
5965 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
5966 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
5967 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
5968 : X86::TEST32rr;
5969 SDValue BEXTR = SDValue(NewNode, 0);
5970 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
5971 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5972 CurDAG->RemoveDeadNode(Node);
5973 return;
5974 }
5975 }
5976
5977 // We can peek through truncates, but we need to be careful below.
5978 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
5979 N0 = N0.getOperand(0);
5980
5981 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
5982 // use a smaller encoding.
5983 // Look past the truncate if CMP is the only use of it.
5984 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
5985 N0.getValueType() != MVT::i8) {
5986 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5987 if (!MaskC)
5988 break;
5989
5990 // We may have looked through a truncate so mask off any bits that
5991 // shouldn't be part of the compare.
5992 uint64_t Mask = MaskC->getZExtValue();
5993 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
5994
5995 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
5996 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
5997 // zero flag.
5998 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
5999 onlyUsesZeroFlag(SDValue(Node, 0))) {
6000 unsigned ShiftOpcode = ISD::DELETED_NODE;
6001 unsigned ShiftAmt;
6002 unsigned SubRegIdx;
6003 MVT SubRegVT;
6004 unsigned TestOpcode;
6005 unsigned LeadingZeros = llvm::countl_zero(Mask);
6006 unsigned TrailingZeros = llvm::countr_zero(Mask);
6007
6008 // With leading/trailing zeros, the transform is profitable if we can
6009 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6010 // incurring any extra register moves.
6011 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6012 if (LeadingZeros == 0 && SavesBytes) {
6013 // If the mask covers the most significant bit, then we can replace
6014 // TEST+AND with a SHR and check eflags.
6015 // This emits a redundant TEST which is subsequently eliminated.
6016 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6017 ShiftAmt = TrailingZeros;
6018 SubRegIdx = 0;
6019 TestOpcode = X86::TEST64rr;
6020 } else if (TrailingZeros == 0 && SavesBytes) {
6021 // If the mask covers the least significant bit, then we can replace
6022 // TEST+AND with a SHL and check eflags.
6023 // This emits a redundant TEST which is subsequently eliminated.
6024 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6025 ShiftAmt = LeadingZeros;
6026 SubRegIdx = 0;
6027 TestOpcode = X86::TEST64rr;
6028 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6029 // If the shifted mask extends into the high half and is 8/16/32 bits
6030 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6031 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6032 if (PopCount == 8) {
6033 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6034 ShiftAmt = TrailingZeros;
6035 SubRegIdx = X86::sub_8bit;
6036 SubRegVT = MVT::i8;
6037 TestOpcode = X86::TEST8rr;
6038 } else if (PopCount == 16) {
6039 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6040 ShiftAmt = TrailingZeros;
6041 SubRegIdx = X86::sub_16bit;
6042 SubRegVT = MVT::i16;
6043 TestOpcode = X86::TEST16rr;
6044 } else if (PopCount == 32) {
6045 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6046 ShiftAmt = TrailingZeros;
6047 SubRegIdx = X86::sub_32bit;
6048 SubRegVT = MVT::i32;
6049 TestOpcode = X86::TEST32rr;
6050 }
6051 }
6052 if (ShiftOpcode != ISD::DELETED_NODE) {
6053 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6054 SDValue Shift = SDValue(
6055 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6056 N0.getOperand(0), ShiftC),
6057 0);
6058 if (SubRegIdx != 0) {
6059 Shift =
6060 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6061 }
6062 MachineSDNode *Test =
6063 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6064 ReplaceNode(Node, Test);
6065 return;
6066 }
6067 }
6068
6069 MVT VT;
6070 int SubRegOp;
6071 unsigned ROpc, MOpc;
6072
6073 // For each of these checks we need to be careful if the sign flag is
6074 // being used. It is only safe to use the sign flag under two conditions:
6075 // either the sign bit in the shrunken mask is zero or the final test
6076 // size is equal to the original compare size.
6077
6078 if (isUInt<8>(Mask) &&
6079 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6080 hasNoSignFlagUses(SDValue(Node, 0)))) {
6081 // For example, convert "testl %eax, $8" to "testb %al, $8"
6082 VT = MVT::i8;
6083 SubRegOp = X86::sub_8bit;
6084 ROpc = X86::TEST8ri;
6085 MOpc = X86::TEST8mi;
6086 } else if (OptForMinSize && isUInt<16>(Mask) &&
6087 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6088 hasNoSignFlagUses(SDValue(Node, 0)))) {
6089 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6090 // NOTE: We only want to form TESTW instructions if optimizing for
6091 // min size. Otherwise we only save one byte and possibly get a length
6092 // changing prefix penalty in the decoders.
6093 VT = MVT::i16;
6094 SubRegOp = X86::sub_16bit;
6095 ROpc = X86::TEST16ri;
6096 MOpc = X86::TEST16mi;
6097 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6098 ((!(Mask & 0x80000000) &&
6099 // Without minsize 16-bit Cmps can get here so we need to
6100 // be sure we calculate the correct sign flag if needed.
6101 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6102 CmpVT == MVT::i32 ||
6103 hasNoSignFlagUses(SDValue(Node, 0)))) {
6104 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6105 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6106 // Otherwise, we find ourselves in a position where we have to do
6107 // promotion. If previous passes did not promote the and, we assume
6108 // they had a good reason not to and do not promote here.
6109 VT = MVT::i32;
6110 SubRegOp = X86::sub_32bit;
6111 ROpc = X86::TEST32ri;
6112 MOpc = X86::TEST32mi;
6113 } else {
6114 // No eligible transformation was found.
6115 break;
6116 }
6117
6118 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6119 SDValue Reg = N0.getOperand(0);
6120
6121 // Emit a testb, testw, or testl.
6122 MachineSDNode *NewNode;
6123 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6124 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6125 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6126 if (!LoadN->isSimple()) {
6127 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6128 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6129 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6130 (MOpc == X86::TEST32mi && NumVolBits != 32))
6131 break;
6132 }
6133 }
6134 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6135 Reg.getOperand(0) };
6136 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6137 // Update the chain.
6138 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6139 // Record the mem-refs
6140 CurDAG->setNodeMemRefs(NewNode,
6141 {cast<LoadSDNode>(Reg)->getMemOperand()});
6142 } else {
6143 // Extract the subregister if necessary.
6144 if (N0.getValueType() != VT)
6145 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6146
6147 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6148 }
6149 // Replace CMP with TEST.
6150 ReplaceNode(Node, NewNode);
6151 return;
6152 }
6153 break;
6154 }
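A worked example of the shifted-mask narrowing above (a sketch; the numbers follow the code): testing against 0x0000FF0000000000 gives 16 leading and 40 trailing zero bits, an 8-bit run, so the AND+TEST becomes SHR64ri by 40 followed by TEST8rr on the sub_8bit subregister.

// Sketch only: the leading/trailing-zero arithmetic behind the
// AND+TEST -> SHR+TEST rewrite for a shifted mask.
static_assert(64 - 16 - 40 == 8,
              "0x0000FF0000000000 is a contiguous 8-bit run");
static_assert((0x0000FF0000000000ULL >> 40) == 0xffULL,
              "after SHR by 40 the run sits in the low 8 bits");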
6155 case X86ISD::PCMPISTR: {
6156 if (!Subtarget->hasSSE42())
6157 break;
6158
6159 bool NeedIndex = !SDValue(Node, 0).use_empty();
6160 bool NeedMask = !SDValue(Node, 1).use_empty();
6161 // We can't fold a load if we are going to make two instructions.
6162 bool MayFoldLoad = !NeedIndex || !NeedMask;
6163
6164 MachineSDNode *CNode;
6165 if (NeedMask) {
6166 unsigned ROpc =
6167 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6168 unsigned MOpc =
6169 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6170 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6171 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6172 }
6173 if (NeedIndex || !NeedMask) {
6174 unsigned ROpc =
6175 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6176 unsigned MOpc =
6177 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6178 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6179 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6180 }
6181
6182 // Connect the flag usage to the last instruction created.
6183 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6184 CurDAG->RemoveDeadNode(Node);
6185 return;
6186 }
6187 case X86ISD::PCMPESTR: {
6188 if (!Subtarget->hasSSE42())
6189 break;
6190
6191 // Copy the two implicit register inputs.
6192 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6193 Node->getOperand(1),
6194 SDValue()).getValue(1);
6195 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6196 Node->getOperand(3), InGlue).getValue(1);
6197
6198 bool NeedIndex = !SDValue(Node, 0).use_empty();
6199 bool NeedMask = !SDValue(Node, 1).use_empty();
6200 // We can't fold a load if we are going to make two instructions.
6201 bool MayFoldLoad = !NeedIndex || !NeedMask;
6202
6203 MachineSDNode *CNode;
6204 if (NeedMask) {
6205 unsigned ROpc =
6206 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6207 unsigned MOpc =
6208 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6209 CNode =
6210 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6211 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6212 }
6213 if (NeedIndex || !NeedMask) {
6214 unsigned ROpc =
6215 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6216 unsigned MOpc =
6217 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6218 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6219 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6220 }
6221 // Connect the flag usage to the last instruction created.
6222 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6223 CurDAG->RemoveDeadNode(Node);
6224 return;
6225 }
6226
6227 case ISD::SETCC: {
6228 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6229 return;
6230
6231 break;
6232 }
6233
6234 case ISD::STORE:
6235 if (foldLoadStoreIntoMemOperand(Node))
6236 return;
6237 break;
6238
6239 case X86ISD::SETCC_CARRY: {
6240 MVT VT = Node->getSimpleValueType(0);
6241 SDValue Result;
6242 if (Subtarget->hasSBBDepBreaking()) {
6243 // We have to do this manually because tblgen will put the eflags copy in
6244 // the wrong place if we use an extract_subreg in the pattern.
6245 // Copy flags to the EFLAGS register and glue it to next node.
6246 SDValue EFLAGS =
6247 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6248 Node->getOperand(1), SDValue());
6249
6250 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6251 // 32-bit version.
6252 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6253 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6254 Result = SDValue(
6255 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6256 0);
6257 } else {
6258 // The target does not recognize sbb with the same reg operand as a
6259 // no-source idiom, so we explicitly zero the input values.
6260 Result = getSBBZero(Node);
6261 }
6262
6263 // For less than 32-bits we need to extract from the 32-bit node.
6264 if (VT == MVT::i8 || VT == MVT::i16) {
6265 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6266 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6267 }
6268
6269 ReplaceUses(SDValue(Node, 0), Result);
6270 CurDAG->RemoveDeadNode(Node);
6271 return;
6272 }
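To make the carry materialization concrete (a sketch): SETB_C32r expands to an sbb of a register with itself, which computes 0 - CF, i.e. all zeros when the carry flag is clear and all ones when it is set; the code above then truncates that to i8/i16 when needed.

// Sketch only: sbb %reg, %reg materializes -CF in every bit.
static_assert(uint32_t(0) - uint32_t(1) == 0xffffffffu,
              "a borrow of one produces the all-ones pattern");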
6273 case X86ISD::SBB: {
6274 if (isNullConstant(Node->getOperand(0)) &&
6275 isNullConstant(Node->getOperand(1))) {
6276 SDValue Result = getSBBZero(Node);
6277
6278 // Replace the flag use.
6279 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6280
6281 // Replace the result use.
6282 if (!SDValue(Node, 0).use_empty()) {
6283 // For less than 32-bits we need to extract from the 32-bit node.
6284 MVT VT = Node->getSimpleValueType(0);
6285 if (VT == MVT::i8 || VT == MVT::i16) {
6286 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6287 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6288 }
6289 ReplaceUses(SDValue(Node, 0), Result);
6290 }
6291
6292 CurDAG->RemoveDeadNode(Node);
6293 return;
6294 }
6295 break;
6296 }
6297 case X86ISD::MGATHER: {
6298 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6299 SDValue IndexOp = Mgt->getIndex();
6300 SDValue Mask = Mgt->getMask();
6301 MVT IndexVT = IndexOp.getSimpleValueType();
6302 MVT ValueVT = Node->getSimpleValueType(0);
6303 MVT MaskVT = Mask.getSimpleValueType();
6304
6305 // This is just to prevent crashes if the nodes are malformed somehow. We
6306 // otherwise only do loose type checking here, based on what a type
6307 // constraint would say, just like table-based isel.
6308 if (!ValueVT.isVector() || !MaskVT.isVector())
6309 break;
6310
6311 unsigned NumElts = ValueVT.getVectorNumElements();
6312 MVT ValueSVT = ValueVT.getVectorElementType();
6313
6314 bool IsFP = ValueSVT.isFloatingPoint();
6315 unsigned EltSize = ValueSVT.getSizeInBits();
6316
6317 unsigned Opc = 0;
6318 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
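 // Opcode naming below: the leading D/Q selects 32- vs 64-bit indices, the
 // PS/PD or trailing D/Q part encodes the element type and width, and the
 // Z128/Z256/Z suffixes are the EVEX (AVX-512) forms; the unsuffixed
 // variants in the else-branch are the VEX-encoded AVX2 gathers.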
6319 if (AVX512Gather) {
6320 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6321 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6322 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6323 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6324 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6325 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6326 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6327 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6328 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6329 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6330 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6331 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6332 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6333 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6334 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6335 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6336 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6337 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6338 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6339 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6340 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6341 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6342 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6343 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6344 } else {
6345 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6346 "Unexpected mask VT!");
6347 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6348 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6349 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6350 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6351 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6352 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6353 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6354 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6355 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6356 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6357 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6358 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6359 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6360 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6361 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6362 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6363 }
6364
6365 if (!Opc)
6366 break;
6367
6368 SDValue Base, Scale, Index, Disp, Segment;
6369 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6370 Base, Scale, Index, Disp, Segment))
6371 break;
6372
6373 SDValue PassThru = Mgt->getPassThru();
6374 SDValue Chain = Mgt->getChain();
6375 // Gather instructions have a mask output not in the ISD node.
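 // (The hardware zeroes mask elements as lanes complete, so the machine node
 // carries that updated mask as an extra result that the ISD node lacks.)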
6376 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6377
6378 MachineSDNode *NewNode;
6379 if (AVX512Gather) {
6380 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6381 Index, Disp, Segment, Chain};
6382 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6383 } else {
6384 SDValue Ops[] = {PassThru, Base, Scale, Index,
6385 Disp, Segment, Mask, Chain};
6386 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6387 }
6388 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6389 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6390 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6391 CurDAG->RemoveDeadNode(Node);
6392 return;
6393 }
6394 case X86ISD::MSCATTER: {
6395 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6396 SDValue Value = Sc->getValue();
6397 SDValue IndexOp = Sc->getIndex();
6398 MVT IndexVT = IndexOp.getSimpleValueType();
6399 MVT ValueVT = Value.getSimpleValueType();
6400
6401 // This is just to prevent crashes if the nodes are malformed somehow. We
6402 // otherwise only do loose type checking here, matching what a type
6403 // constraint would require, just like table-based isel.
6404 if (!ValueVT.isVector())
6405 break;
6406
6407 unsigned NumElts = ValueVT.getVectorNumElements();
6408 MVT ValueSVT = ValueVT.getVectorElementType();
6409
6410 bool IsFP = ValueSVT.isFloatingPoint();
6411 unsigned EltSize = ValueSVT.getSizeInBits();
6412
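 // Scatter is AVX-512 only, so every opcode below is an EVEX form and the
 // mask is always a v*i1 k-register; there is no VEX/AVX2 fallback table
 // like the gather case above.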
6413 unsigned Opc;
6414 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6415 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6416 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6417 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6418 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6419 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6420 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6421 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6422 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6423 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6424 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6425 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6426 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6427 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6428 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6429 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6430 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6431 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6432 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6433 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6434 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6435 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6436 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6437 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6438 else
6439 break;
6440
6441 SDValue Base, Scale, Index, Disp, Segment;
6442 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6443 Base, Scale, Index, Disp, Segment))
6444 break;
6445
6446 SDValue Mask = Sc->getMask();
6447 SDValue Chain = Sc->getChain();
6448 // Scatter instructions have a mask output not in the ISD node.
6449 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6450 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6451
6452 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6453 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6454 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6455 CurDAG->RemoveDeadNode(Node);
6456 return;
6457 }
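 // PREALLOCATED_SETUP / PREALLOCATED_ARG correspond to the
 // llvm.call.preallocated.setup/arg intrinsics: the call-site token is mapped
 // to a per-function id recorded in X86MachineFunctionInfo, and the target
 // pseudos carry that id (plus the argument index for ARG) so later lowering
 // can locate the preallocated stack area for that call site.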
6458 case ISD::PREALLOCATED_SETUP: {
6459 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6460 auto CallId = MFI->getPreallocatedIdForCallSite(
6461 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6462 SDValue Chain = Node->getOperand(0);
6463 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6464 MachineSDNode *New = CurDAG->getMachineNode(
6465 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6466 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6467 CurDAG->RemoveDeadNode(Node);
6468 return;
6469 }
6470 case ISD::PREALLOCATED_ARG: {
6471 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6472 auto CallId = MFI->getPreallocatedIdForCallSite(
6473 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6474 SDValue Chain = Node->getOperand(0);
6475 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6476 SDValue ArgIndex = Node->getOperand(2);
6477 SDValue Ops[3];
6478 Ops[0] = CallIdValue;
6479 Ops[1] = ArgIndex;
6480 Ops[2] = Chain;
6481 MachineSDNode *New = CurDAG->getMachineNode(
6482 TargetOpcode::PREALLOCATED_ARG, dl,
6483 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6484 MVT::Other),
6485 Ops);
6486 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6487 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6488 CurDAG->RemoveDeadNode(Node);
6489 return;
6490 }
6491 case X86ISD::AESENCWIDE128KL:
6492 case X86ISD::AESDECWIDE128KL:
6493 case X86ISD::AESENCWIDE256KL:
6494 case X86ISD::AESDECWIDE256KL: {
6495 if (!Subtarget->hasWIDEKL())
6496 break;
6497
6498 unsigned Opcode;
6499 switch (Node->getOpcode()) {
6500 default:
6501 llvm_unreachable("Unexpected opcode!");
6502 case X86ISD::AESENCWIDE128KL:
6503 Opcode = X86::AESENCWIDE128KL;
6504 break;
6505 case X86ISD::AESDECWIDE128KL:
6506 Opcode = X86::AESDECWIDE128KL;
6507 break;
6508 case X86ISD::AESENCWIDE256KL:
6509 Opcode = X86::AESENCWIDE256KL;
6510 break;
6511 case X86ISD::AESDECWIDE256KL:
6512 Opcode = X86::AESDECWIDE256KL;
6513 break;
6514 }
6515
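 // The WIDE Key Locker forms encrypt/decrypt eight 128-bit blocks at once,
 // taken implicitly in XMM0-XMM7, with the key handle supplied as a memory
 // operand; hence the chain of CopyToReg nodes below.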
6516 SDValue Chain = Node->getOperand(0);
6517 SDValue Addr = Node->getOperand(1);
6518
6519 SDValue Base, Scale, Index, Disp, Segment;
6520 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6521 break;
6522
6523 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6524 SDValue());
6525 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6526 Chain.getValue(1));
6527 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6528 Chain.getValue(1));
6529 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6530 Chain.getValue(1));
6531 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6532 Chain.getValue(1));
6533 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6534 Chain.getValue(1));
6535 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6536 Chain.getValue(1));
6537 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6538 Chain.getValue(1));
6539
6540 MachineSDNode *Res = CurDAG->getMachineNode(
6541 Opcode, dl, Node->getVTList(),
6542 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6543 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6544 ReplaceNode(Node, Res);
6545 return;
6546 }
6547 }
6548
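 // Anything not handled above falls through to the TableGen-generated
 // matcher.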
6549 SelectCode(Node);
6550}
6551
6552bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6553 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6554 std::vector<SDValue> &OutOps) {
6555 SDValue Op0, Op1, Op2, Op3, Op4;
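 // Op0..Op4 form the standard x86 memory-operand 5-tuple produced by
 // selectAddr: base, scale, index, displacement, and segment.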
6556 switch (ConstraintID) {
6557 default:
6558 llvm_unreachable("Unexpected asm memory constraint");
6559 case InlineAsm::ConstraintCode::o: // offsetable ??
6560 case InlineAsm::ConstraintCode::v: // not offsetable ??
6561 case InlineAsm::ConstraintCode::m: // memory
6562 case InlineAsm::ConstraintCode::X:
6563 case InlineAsm::ConstraintCode::p: // address
6564 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6565 return true;
6566 break;
6567 }
6568
6569 OutOps.push_back(Op0);
6570 OutOps.push_back(Op1);
6571 OutOps.push_back(Op2);
6572 OutOps.push_back(Op3);
6573 OutOps.push_back(Op4);
6574 return false;
6575}
6576
6577/// This pass converts a legalized DAG into an X86-specific DAG,
6578/// ready for instruction scheduling.
6579FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6580 CodeGenOptLevel OptLevel) {
6581 return new X86DAGToDAGISel(TM, OptLevel);
6582}
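
// Usage sketch (not part of this file, shown for context): the X86 target's
// pass configuration is expected to add this pass when wiring up instruction
// selection, roughly along these lines; see X86TargetMachine.cpp for the
// authoritative version.
//
//   bool X86PassConfig::addInstSelector() {
//     addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
//     return false;
//   }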