1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to an X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86.h"
15#include "X86MachineFunctionInfo.h"
16#include "X86RegisterInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/CodeGen/MachineModuleInfo.h"
21#include "llvm/CodeGen/SelectionDAGISel.h"
22#include "llvm/Config/llvm-config.h"
23#include "llvm/IR/ConstantRange.h"
24#include "llvm/IR/Function.h"
25#include "llvm/IR/Instructions.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Type.h"
29#include "llvm/Support/Debug.h"
30#include "llvm/Support/ErrorHandling.h"
31#include "llvm/Support/KnownBits.h"
32#include "llvm/Support/MathExtras.h"
33#include <cstdint>
34
35using namespace llvm;
36
37#define DEBUG_TYPE "x86-isel"
38#define PASS_NAME "X86 DAG->DAG Instruction Selection"
39
40STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
41
42static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
43 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
44 cl::Hidden);
45
47 "x86-promote-anyext-load", cl::init(true),
48 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
49
50 extern cl::opt<bool> IndirectBranchTracking;
51
52//===----------------------------------------------------------------------===//
53// Pattern Matcher Implementation
54//===----------------------------------------------------------------------===//
55
56namespace {
57 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
58 /// numbers for the leaves of the matched tree.
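/// For example, an address such as "movl 8(%rbx,%rcx,4), %eax" would
/// typically be matched with BaseType == RegBase, Base_Reg == RBX,
/// IndexReg == RCX, Scale == 4 and Disp == 8, with the symbolic fields
/// (GV, CP, ES, JT, ...) left unset.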
59 struct X86ISelAddressMode {
60 enum {
61 RegBase,
62 FrameIndexBase
63 } BaseType = RegBase;
64
65 // This is really a union, discriminated by BaseType!
66 SDValue Base_Reg;
67 int Base_FrameIndex = 0;
68
69 unsigned Scale = 1;
70 SDValue IndexReg;
71 int32_t Disp = 0;
72 SDValue Segment;
73 const GlobalValue *GV = nullptr;
74 const Constant *CP = nullptr;
75 const BlockAddress *BlockAddr = nullptr;
76 const char *ES = nullptr;
77 MCSymbol *MCSym = nullptr;
78 int JT = -1;
79 Align Alignment; // CP alignment.
80 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
81 bool NegateIndex = false;
82
83 X86ISelAddressMode() = default;
84
85 bool hasSymbolicDisplacement() const {
86 return GV != nullptr || CP != nullptr || ES != nullptr ||
87 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
88 }
89
90 bool hasBaseOrIndexReg() const {
91 return BaseType == FrameIndexBase ||
92 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
93 }
94
95 /// Return true if this addressing mode is already RIP-relative.
96 bool isRIPRelative() const {
97 if (BaseType != RegBase) return false;
98 if (RegisterSDNode *RegNode =
99 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
100 return RegNode->getReg() == X86::RIP;
101 return false;
102 }
103
104 void setBaseReg(SDValue Reg) {
105 BaseType = RegBase;
106 Base_Reg = Reg;
107 }
108
109#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
110 void dump(SelectionDAG *DAG = nullptr) {
111 dbgs() << "X86ISelAddressMode " << this << '\n';
112 dbgs() << "Base_Reg ";
113 if (Base_Reg.getNode())
114 Base_Reg.getNode()->dump(DAG);
115 else
116 dbgs() << "nul\n";
117 if (BaseType == FrameIndexBase)
118 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
119 dbgs() << " Scale " << Scale << '\n'
120 << "IndexReg ";
121 if (NegateIndex)
122 dbgs() << "negate ";
123 if (IndexReg.getNode())
124 IndexReg.getNode()->dump(DAG);
125 else
126 dbgs() << "nul\n";
127 dbgs() << " Disp " << Disp << '\n'
128 << "GV ";
129 if (GV)
130 GV->dump();
131 else
132 dbgs() << "nul";
133 dbgs() << " CP ";
134 if (CP)
135 CP->dump();
136 else
137 dbgs() << "nul";
138 dbgs() << '\n'
139 << "ES ";
140 if (ES)
141 dbgs() << ES;
142 else
143 dbgs() << "nul";
144 dbgs() << " MCSym ";
145 if (MCSym)
146 dbgs() << MCSym;
147 else
148 dbgs() << "nul";
149 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
150 }
151#endif
152 };
153}
154
155namespace {
156 //===--------------------------------------------------------------------===//
157 /// ISel - X86-specific code to select X86 machine instructions for
158 /// SelectionDAG operations.
159 ///
160 class X86DAGToDAGISel final : public SelectionDAGISel {
161 /// Keep a pointer to the X86Subtarget around so that we can
162 /// make the right decision when generating code for different targets.
163 const X86Subtarget *Subtarget;
164
165 /// If true, selector should try to optimize for minimum code size.
166 bool OptForMinSize;
167
168 /// Disable direct TLS access through segment registers.
169 bool IndirectTlsSegRefs;
170
171 public:
172 static char ID;
173
174 X86DAGToDAGISel() = delete;
175
176 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
177 : SelectionDAGISel(ID, tm, OptLevel), Subtarget(nullptr),
178 OptForMinSize(false), IndirectTlsSegRefs(false) {}
179
180 bool runOnMachineFunction(MachineFunction &MF) override {
181 // Reset the subtarget each time through.
182 Subtarget = &MF.getSubtarget<X86Subtarget>();
183 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
184 "indirect-tls-seg-refs");
185
186 // OptFor[Min]Size are used in pattern predicates that isel is matching.
187 OptForMinSize = MF.getFunction().hasMinSize();
188 assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
189 "OptForMinSize implies OptForSize");
190
191 SelectionDAGISel::runOnMachineFunction(MF);
192 return true;
193 }
194
195 void emitFunctionEntryCode() override;
196
197 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
198
199 void PreprocessISelDAG() override;
200 void PostprocessISelDAG() override;
201
202// Include the pieces autogenerated from the target description.
203#include "X86GenDAGISel.inc"
204
205 private:
206 void Select(SDNode *N) override;
207
208 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
209 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
210 bool AllowSegmentRegForX32 = false);
211 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
212 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
213 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
214 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
215 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
216 unsigned Depth);
217 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218 unsigned Depth);
219 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
220 unsigned Depth);
221 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
222 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
223 SDValue &Scale, SDValue &Index, SDValue &Disp,
224 SDValue &Segment);
225 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
226 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
227 SDValue &Index, SDValue &Disp, SDValue &Segment);
228 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
229 bool selectLEAAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
233 SDValue &Scale, SDValue &Index, SDValue &Disp,
234 SDValue &Segment);
235 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
236 SDValue &Scale, SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238 bool selectRelocImm(SDValue N, SDValue &Op);
239
240 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment);
244
245 // Convenience method where P is also root.
246 bool tryFoldLoad(SDNode *P, SDValue N,
247 SDValue &Base, SDValue &Scale,
248 SDValue &Index, SDValue &Disp,
249 SDValue &Segment) {
250 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
251 }
252
253 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
254 SDValue &Base, SDValue &Scale,
255 SDValue &Index, SDValue &Disp,
256 SDValue &Segment);
257
258 bool isProfitableToFormMaskedOp(SDNode *N) const;
259
260 /// Implement addressing mode selection for inline asm expressions.
261 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
262 InlineAsm::ConstraintCode ConstraintID,
263 std::vector<SDValue> &OutOps) override;
264
265 void emitSpecialCodeForMain();
266
267 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
268 MVT VT, SDValue &Base, SDValue &Scale,
269 SDValue &Index, SDValue &Disp,
270 SDValue &Segment) {
271 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
272 Base = CurDAG->getTargetFrameIndex(
273 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
274 else if (AM.Base_Reg.getNode())
275 Base = AM.Base_Reg;
276 else
277 Base = CurDAG->getRegister(0, VT);
278
279 Scale = getI8Imm(AM.Scale, DL);
280
281#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
282 // Negate the index if needed.
283 if (AM.NegateIndex) {
284 unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
285 : GET_ND_IF_ENABLED(X86::NEG32r);
286 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
287 AM.IndexReg), 0);
288 AM.IndexReg = Neg;
289 }
290
291 if (AM.IndexReg.getNode())
292 Index = AM.IndexReg;
293 else
294 Index = CurDAG->getRegister(0, VT);
295
296 // These are 32-bit even in 64-bit mode since RIP-relative offset
297 // is 32-bit.
298 if (AM.GV)
299 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
300 MVT::i32, AM.Disp,
301 AM.SymbolFlags);
302 else if (AM.CP)
303 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
304 AM.Disp, AM.SymbolFlags);
305 else if (AM.ES) {
306 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
307 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
308 } else if (AM.MCSym) {
309 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
310 assert(AM.SymbolFlags == 0 && "oo");
311 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
312 } else if (AM.JT != -1) {
313 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
314 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
315 } else if (AM.BlockAddr)
316 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
317 AM.SymbolFlags);
318 else
319 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
320
321 if (AM.Segment.getNode())
322 Segment = AM.Segment;
323 else
324 Segment = CurDAG->getRegister(0, MVT::i16);
325 }
326
327 // Utility function to determine whether we should avoid selecting
328 // immediate forms of instructions for better code size or not.
329 // At a high level, we'd like to avoid such instructions when
330 // we have similar constants used within the same basic block
331 // that can be kept in a register.
332 //
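// For example, when optimizing for size, a 32-bit constant such as
// 0x12345678 that feeds two different ALU instructions in the block is
// better materialized once in a register and reused than encoded twice as
// an immediate, so this returns true for it.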
333 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
334 uint32_t UseCount = 0;
335
336 // Do not want to hoist if we're not optimizing for size.
337 // TODO: We'd like to remove this restriction.
338 // See the comment in X86InstrInfo.td for more info.
339 if (!CurDAG->shouldOptForSize())
340 return false;
341
342 // Walk all the users of the immediate.
343 for (const SDNode *User : N->uses()) {
344 if (UseCount >= 2)
345 break;
346
347 // This user is already selected. Count it as a legitimate use and
348 // move on.
349 if (User->isMachineOpcode()) {
350 UseCount++;
351 continue;
352 }
353
354 // We want to count stores of immediates as real uses.
355 if (User->getOpcode() == ISD::STORE &&
356 User->getOperand(1).getNode() == N) {
357 UseCount++;
358 continue;
359 }
360
361 // We don't currently match users that have > 2 operands (except
362 // for stores, which are handled above)
363 // Those instructions won't match in ISEL, for now, and would
364 // be counted incorrectly.
365 // This may change in the future as we add additional instruction
366 // types.
367 if (User->getNumOperands() != 2)
368 continue;
369
370 // If this is a sign-extended 8-bit integer immediate used in an ALU
371 // instruction, there is probably an opcode encoding to save space.
372 auto *C = dyn_cast<ConstantSDNode>(N);
373 if (C && isInt<8>(C->getSExtValue()))
374 continue;
375
376 // Immediates that are used for offsets as part of stack
377 // manipulation should be left alone. These are typically
378 // used to indicate SP offsets for argument passing and
379 // will get pulled into stores/pushes (implicitly).
380 if (User->getOpcode() == X86ISD::ADD ||
381 User->getOpcode() == ISD::ADD ||
382 User->getOpcode() == X86ISD::SUB ||
383 User->getOpcode() == ISD::SUB) {
384
385 // Find the other operand of the add/sub.
386 SDValue OtherOp = User->getOperand(0);
387 if (OtherOp.getNode() == N)
388 OtherOp = User->getOperand(1);
389
390 // Don't count if the other operand is SP.
391 RegisterSDNode *RegNode;
392 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
393 (RegNode = dyn_cast_or_null<RegisterSDNode>(
394 OtherOp->getOperand(1).getNode())))
395 if ((RegNode->getReg() == X86::ESP) ||
396 (RegNode->getReg() == X86::RSP))
397 continue;
398 }
399
400 // ... otherwise, count this and move on.
401 UseCount++;
402 }
403
404 // If we have more than 1 use, then recommend for hoisting.
405 return (UseCount > 1);
406 }
407
408 /// Return a target constant with the specified value of type i8.
409 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
410 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
411 }
412
413 /// Return a target constant with the specified value, of type i32.
414 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
415 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
416 }
417
418 /// Return a target constant with the specified value, of type i64.
419 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
420 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
421 }
422
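/// Compute the immediate for a VEXTRACT-style subvector extract: the
/// starting element index of the EXTRACT_SUBVECTOR converted into a count
/// of VecWidth-bit lanes. For example, extracting elements [8,16) of a
/// v16i32 source with VecWidth == 256 yields immediate 1 (the upper
/// 256-bit half).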
423 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
424 const SDLoc &DL) {
425 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
426 uint64_t Index = N->getConstantOperandVal(1);
427 MVT VecVT = N->getOperand(0).getSimpleValueType();
428 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
429 }
430
431 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
432 const SDLoc &DL) {
433 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
434 uint64_t Index = N->getConstantOperandVal(2);
435 MVT VecVT = N->getSimpleValueType(0);
436 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
437 }
438
439 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
440 const SDLoc &DL) {
441 assert(VecWidth == 128 && "Unexpected vector width");
442 uint64_t Index = N->getConstantOperandVal(2);
443 MVT VecVT = N->getSimpleValueType(0);
444 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
445 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
446 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
447 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
448 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
449 }
450
451 SDValue getSBBZero(SDNode *N) {
452 SDLoc dl(N);
453 MVT VT = N->getSimpleValueType(0);
454
455 // Create zero.
456 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
457 SDValue Zero = SDValue(
458 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
459 if (VT == MVT::i64) {
460 Zero = SDValue(
461 CurDAG->getMachineNode(
462 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
463 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
464 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
465 0);
466 }
467
468 // Copy flags to the EFLAGS register and glue it to next node.
469 unsigned Opcode = N->getOpcode();
470 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
471 "Unexpected opcode for SBB materialization");
472 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
473 SDValue EFLAGS =
474 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
475 N->getOperand(FlagOpIndex), SDValue());
476
477 // Create a 64-bit instruction if the result is 64-bits otherwise use the
478 // 32-bit version.
479 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
480 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
481 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
482 return SDValue(
483 CurDAG->getMachineNode(Opc, dl, VTs,
484 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
485 0);
486 }
487
488 // Helper to detect unneeded AND instructions on shift amounts. Called
489 // from PatFrags in tablegen.
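// For example, with Width == 5 (a 32-bit shift only uses the low 5 bits of
// its amount), the mask in (srl X, (and Y, 31)) has five trailing ones, so
// the AND is unneeded and can be dropped.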
490 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
491 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
492 const APInt &Val = N->getConstantOperandAPInt(1);
493
494 if (Val.countr_one() >= Width)
495 return true;
496
497 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
498 return Mask.countr_one() >= Width;
499 }
500
501 /// Return an SDNode that returns the value of the global base register.
502 /// Output instructions required to initialize the global base register,
503 /// if necessary.
504 SDNode *getGlobalBaseReg();
505
506 /// Return a reference to the TargetMachine, casted to the target-specific
507 /// type.
508 const X86TargetMachine &getTargetMachine() const {
509 return static_cast<const X86TargetMachine &>(TM);
510 }
511
512 /// Return a reference to the TargetInstrInfo, casted to the target-specific
513 /// type.
514 const X86InstrInfo *getInstrInfo() const {
515 return Subtarget->getInstrInfo();
516 }
517
518 /// Return a condition code of the given SDNode
519 X86::CondCode getCondFromNode(SDNode *N) const;
520
521 /// Address-mode matching performs shift-of-and to and-of-shift
522 /// reassociation in order to expose more scaled addressing
523 /// opportunities.
524 bool ComplexPatternFuncMutatesDAG() const override {
525 return true;
526 }
527
528 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
529
530 // Indicates we should prefer to use a non-temporal load for this load.
531 bool useNonTemporalLoad(LoadSDNode *N) const {
532 if (!N->isNonTemporal())
533 return false;
534
535 unsigned StoreSize = N->getMemoryVT().getStoreSize();
536
537 if (N->getAlign().value() < StoreSize)
538 return false;
539
540 switch (StoreSize) {
541 default: llvm_unreachable("Unsupported store size");
542 case 4:
543 case 8:
544 return false;
545 case 16:
546 return Subtarget->hasSSE41();
547 case 32:
548 return Subtarget->hasAVX2();
549 case 64:
550 return Subtarget->hasAVX512();
551 }
552 }
553
554 bool foldLoadStoreIntoMemOperand(SDNode *Node);
555 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
556 bool matchBitExtract(SDNode *Node);
557 bool shrinkAndImmediate(SDNode *N);
558 bool isMaskZeroExtended(SDNode *N) const;
559 bool tryShiftAmountMod(SDNode *N);
560 bool tryShrinkShlLogicImm(SDNode *N);
561 bool tryVPTERNLOG(SDNode *N);
562 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
563 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
564 uint8_t Imm);
565 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
566 bool tryMatchBitSelect(SDNode *N);
567
568 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
569 const SDLoc &dl, MVT VT, SDNode *Node);
570 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
571 const SDLoc &dl, MVT VT, SDNode *Node,
572 SDValue &InGlue);
573
574 bool tryOptimizeRem8Extend(SDNode *N);
575
576 bool onlyUsesZeroFlag(SDValue Flags) const;
577 bool hasNoSignFlagUses(SDValue Flags) const;
578 bool hasNoCarryFlagUses(SDValue Flags) const;
579 };
580}
581
582char X86DAGToDAGISel::ID = 0;
583
584INITIALIZE_PASS(X86DAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
585
586// Returns true if this masked compare can be implemented legally with this
587// type.
588static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
589 unsigned Opcode = N->getOpcode();
590 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
591 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
592 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
593 // We can get 256-bit 8 element types here without VLX being enabled. When
594 // this happens we will use 512-bit operations and the mask will not be
595 // zero extended.
596 EVT OpVT = N->getOperand(0).getValueType();
597 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
598 // second operand.
599 if (Opcode == X86ISD::STRICT_CMPM)
600 OpVT = N->getOperand(1).getValueType();
601 if (OpVT.is256BitVector() || OpVT.is128BitVector())
602 return Subtarget->hasVLX();
603
604 return true;
605 }
606 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
607 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
608 Opcode == X86ISD::FSETCCM_SAE)
609 return true;
610
611 return false;
612}
613
614// Returns true if we can assume the writer of the mask has zero extended it
615// for us.
616bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
617 // If this is an AND, check if we have a compare on either side. As long as
618 // one side guarantees the mask is zero extended, the AND will preserve those
619 // zeros.
620 if (N->getOpcode() == ISD::AND)
621 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
622 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
623
624 return isLegalMaskCompare(N, Subtarget);
625}
626
627bool
628X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
629 if (OptLevel == CodeGenOptLevel::None)
630 return false;
631
632 if (!N.hasOneUse())
633 return false;
634
635 if (N.getOpcode() != ISD::LOAD)
636 return true;
637
638 // Don't fold non-temporal loads if we have an instruction for them.
639 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
640 return false;
641
642 // If N is a load, do additional profitability checks.
643 if (U == Root) {
644 switch (U->getOpcode()) {
645 default: break;
646 case X86ISD::ADD:
647 case X86ISD::ADC:
648 case X86ISD::SUB:
649 case X86ISD::SBB:
650 case X86ISD::AND:
651 case X86ISD::XOR:
652 case X86ISD::OR:
653 case ISD::ADD:
654 case ISD::UADDO_CARRY:
655 case ISD::AND:
656 case ISD::OR:
657 case ISD::XOR: {
658 SDValue Op1 = U->getOperand(1);
659
660 // If the other operand is an 8-bit immediate we should fold the immediate
661 // instead. This reduces code size.
662 // e.g.
663 // movl 4(%esp), %eax
664 // addl $4, %eax
665 // vs.
666 // movl $4, %eax
667 // addl 4(%esp), %eax
668 // The former is 2 bytes shorter. In the case where the increment is 1,
669 // the saving can be 4 bytes (by using incl %eax).
670 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
671 if (Imm->getAPIntValue().isSignedIntN(8))
672 return false;
673
674 // If this is a 64-bit AND with an immediate that fits in 32-bits,
675 // prefer using the smaller and over folding the load. This is needed to
676 // make sure immediates created by shrinkAndImmediate are always folded.
677 // Ideally we would narrow the load during DAG combine and get the
678 // best of both worlds.
679 if (U->getOpcode() == ISD::AND &&
680 Imm->getAPIntValue().getBitWidth() == 64 &&
681 Imm->getAPIntValue().isIntN(32))
682 return false;
683
684 // If this is really a zext_inreg that can be represented with a movzx
685 // instruction, prefer that.
686 // TODO: We could shrink the load and fold if it is non-volatile.
687 if (U->getOpcode() == ISD::AND &&
688 (Imm->getAPIntValue() == UINT8_MAX ||
689 Imm->getAPIntValue() == UINT16_MAX ||
690 Imm->getAPIntValue() == UINT32_MAX))
691 return false;
692
693 // ADD/SUB can negate the immediate and use the opposite operation
694 // to fit 128 into a sign extended 8 bit immediate.
695 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
696 (-Imm->getAPIntValue()).isSignedIntN(8))
697 return false;
698
699 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
700 (-Imm->getAPIntValue()).isSignedIntN(8) &&
701 hasNoCarryFlagUses(SDValue(U, 1)))
702 return false;
703 }
704
705 // If the other operand is a TLS address, we should fold it instead.
706 // This produces
707 // movl %gs:0, %eax
708 // leal i@NTPOFF(%eax), %eax
709 // instead of
710 // movl $i@NTPOFF, %eax
711 // addl %gs:0, %eax
712 // if the block also has an access to a second TLS address this will save
713 // a load.
714 // FIXME: This is probably also true for non-TLS addresses.
715 if (Op1.getOpcode() == X86ISD::Wrapper) {
716 SDValue Val = Op1.getOperand(0);
717 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
718 return false;
719 }
720
721 // Don't fold load if this matches the BTS/BTR/BTC patterns.
722 // BTS: (or X, (shl 1, n))
723 // BTR: (and X, (rotl -2, n))
724 // BTC: (xor X, (shl 1, n))
725 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
726 if (U->getOperand(0).getOpcode() == ISD::SHL &&
727 isOneConstant(U->getOperand(0).getOperand(0)))
728 return false;
729
730 if (U->getOperand(1).getOpcode() == ISD::SHL &&
731 isOneConstant(U->getOperand(1).getOperand(0)))
732 return false;
733 }
734 if (U->getOpcode() == ISD::AND) {
735 SDValue U0 = U->getOperand(0);
736 SDValue U1 = U->getOperand(1);
737 if (U0.getOpcode() == ISD::ROTL) {
738 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
739 if (C && C->getSExtValue() == -2)
740 return false;
741 }
742
743 if (U1.getOpcode() == ISD::ROTL) {
744 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
745 if (C && C->getSExtValue() == -2)
746 return false;
747 }
748 }
749
750 break;
751 }
752 case ISD::SHL:
753 case ISD::SRA:
754 case ISD::SRL:
755 // Don't fold a load into a shift by immediate. The BMI2 instructions
756 // support folding a load, but not an immediate. The legacy instructions
757 // support folding an immediate, but can't fold a load. Folding an
758 // immediate is preferable to folding a load.
759 if (isa<ConstantSDNode>(U->getOperand(1)))
760 return false;
761
762 break;
763 }
764 }
765
766 // Prevent folding a load if this can be implemented with an insert_subreg or
767 // a move that implicitly zeroes.
768 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
769 isNullConstant(Root->getOperand(2)) &&
770 (Root->getOperand(0).isUndef() ||
771 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
772 return false;
773
774 return true;
775}
776
777// Indicates it is profitable to form an AVX512 masked operation. Returning
778 // false will favor a register-to-register masked move or vblendm and the
779// operation will be selected separately.
780bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
781 assert(
782 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
783 "Unexpected opcode!");
784
785 // If the operation has additional users, the operation will be duplicated.
786 // Check the use count to prevent that.
787 // FIXME: Are there cheap opcodes we might want to duplicate?
788 return N->getOperand(1).hasOneUse();
789}
790
791/// Replace the original chain operand of the call with
792/// the load's chain operand and move the load below the call's chain operand.
793static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
794 SDValue Call, SDValue OrigChain) {
795 SmallVector<SDValue, 8> Ops;
796 SDValue Chain = OrigChain.getOperand(0);
797 if (Chain.getNode() == Load.getNode())
798 Ops.push_back(Load.getOperand(0));
799 else {
800 assert(Chain.getOpcode() == ISD::TokenFactor &&
801 "Unexpected chain operand");
802 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
803 if (Chain.getOperand(i).getNode() == Load.getNode())
804 Ops.push_back(Load.getOperand(0));
805 else
806 Ops.push_back(Chain.getOperand(i));
807 SDValue NewChain =
808 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
809 Ops.clear();
810 Ops.push_back(NewChain);
811 }
812 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
813 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
814 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
815 Load.getOperand(1), Load.getOperand(2));
816
817 Ops.clear();
818 Ops.push_back(SDValue(Load.getNode(), 1));
819 Ops.append(Call->op_begin() + 1, Call->op_end());
820 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
821}
822
823/// Return true if the call address is a load and it can be
824/// moved below CALLSEQ_START and the chains leading up to the call.
825/// Return the CALLSEQ_START by reference as a second output.
826/// In the case of a tail call, there isn't a callseq node between the call
827/// chain and the load.
828static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
829 // The transformation is somewhat dangerous if the call's chain was glued to
830 // the call. After MoveBelowOrigChain the load is moved between the call and
831 // the chain; this can create a cycle if the load is not folded. So it is
832 // *really* important that we are sure the load will be folded.
833 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
834 return false;
835 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
836 if (!LD ||
837 !LD->isSimple() ||
838 LD->getAddressingMode() != ISD::UNINDEXED ||
839 LD->getExtensionType() != ISD::NON_EXTLOAD)
840 return false;
841
842 // Now let's find the callseq_start.
843 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
844 if (!Chain.hasOneUse())
845 return false;
846 Chain = Chain.getOperand(0);
847 }
848
849 if (!Chain.getNumOperands())
850 return false;
851 // Since we are not checking for AA here, conservatively abort if the chain
852 // writes to memory. It's not safe to move the callee (a load) across a store.
853 if (isa<MemSDNode>(Chain.getNode()) &&
854 cast<MemSDNode>(Chain.getNode())->writeMem())
855 return false;
856 if (Chain.getOperand(0).getNode() == Callee.getNode())
857 return true;
858 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
859 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
860 Callee.getValue(1).hasOneUse())
861 return true;
862 return false;
863}
864
865static bool isEndbrImm64(uint64_t Imm) {
866// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
867// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
868 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
869 return false;
870
871 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
872 0x65, 0x66, 0x67, 0xf0, 0xf2};
873 int i = 24; // the low 24 bits (0x0F1EFA) have already been matched
874 while (i < 64) {
875 uint8_t Byte = (Imm >> i) & 0xFF;
876 if (Byte == 0xF3)
877 return true;
878 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
879 return false;
880 i += 8;
881 }
882
883 return false;
884}
885
886static bool needBWI(MVT VT) {
887 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
888}
889
890void X86DAGToDAGISel::PreprocessISelDAG() {
891 bool MadeChange = false;
892 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
893 E = CurDAG->allnodes_end(); I != E; ) {
894 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
895
896 // This is for CET enhancement.
897 //
898 // ENDBR32 and ENDBR64 have specific opcodes:
899 // ENDBR32: F3 0F 1E FB
900 // ENDBR64: F3 0F 1E FA
901 // We do not want attackers to find unintended ENDBR32/64
902 // opcode matches in the binary.
903 // Here’s an example:
904 // If the compiler had to generate asm for the following code:
905 // a = 0xF30F1EFA
906 // it could, for example, generate:
907 // mov 0xF30F1EFA, dword ptr[a]
908 // In such a case, the binary would include a gadget that starts
909 // with a fake ENDBR64 opcode. Therefore, we split such generation
910 // into multiple operations so that the pattern does not show up in the binary.
911 if (N->getOpcode() == ISD::Constant) {
912 MVT VT = N->getSimpleValueType(0);
913 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
914 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
915 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
916 // Check that the cf-protection-branch is enabled.
917 Metadata *CFProtectionBranch =
918 MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
919 if (CFProtectionBranch || IndirectBranchTracking) {
920 SDLoc dl(N);
921 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
922 Complement = CurDAG->getNOT(dl, Complement, VT);
923 --I;
924 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
925 ++I;
926 MadeChange = true;
927 continue;
928 }
929 }
930 }
931
932 // If this is a target specific AND node with no flag usages, turn it back
933 // into ISD::AND to enable test instruction matching.
934 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
935 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
936 N->getOperand(0), N->getOperand(1));
937 --I;
938 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
939 ++I;
940 MadeChange = true;
941 continue;
942 }
943
944 // Convert vector increment or decrement to sub/add with an all-ones
945 // constant:
946 // add X, <1, 1...> --> sub X, <-1, -1...>
947 // sub X, <1, 1...> --> add X, <-1, -1...>
948 // The all-ones vector constant can be materialized using a pcmpeq
949 // instruction that is commonly recognized as an idiom (has no register
950 // dependency), so that's better/smaller than loading a splat 1 constant.
951 //
952 // But don't do this if it would inhibit a potentially profitable load
953 // folding opportunity for the other operand. That only occurs with the
954 // intersection of:
955 // (1) The other operand (op0) is load foldable.
956 // (2) The op is an add (otherwise, we are *creating* an add and can still
957 // load fold the other op).
958 // (3) The target has AVX (otherwise, we have a destructive add and can't
959 // load fold the other op without killing the constant op).
960 // (4) The constant 1 vector has multiple uses (so it is profitable to load
961 // into a register anyway).
962 auto mayPreventLoadFold = [&]() {
963 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
964 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
965 !N->getOperand(1).hasOneUse();
966 };
967 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
968 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
969 APInt SplatVal;
970 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
971 SplatVal.isOne()) {
972 SDLoc DL(N);
973
974 MVT VT = N->getSimpleValueType(0);
975 unsigned NumElts = VT.getSizeInBits() / 32;
976 SDValue AllOnes =
977 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
978 AllOnes = CurDAG->getBitcast(VT, AllOnes);
979
980 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
981 SDValue Res =
982 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
983 --I;
984 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
985 ++I;
986 MadeChange = true;
987 continue;
988 }
989 }
990
991 switch (N->getOpcode()) {
992 case X86ISD::VBROADCAST: {
993 MVT VT = N->getSimpleValueType(0);
994 // Emulate v32i16/v64i8 broadcast without BWI.
995 if (!Subtarget->hasBWI() && needBWI(VT)) {
996 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
997 SDLoc dl(N);
998 SDValue NarrowBCast =
999 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1000 SDValue Res =
1001 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1002 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1003 unsigned Index = NarrowVT.getVectorMinNumElements();
1004 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1005 CurDAG->getIntPtrConstant(Index, dl));
1006
1007 --I;
1008 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1009 ++I;
1010 MadeChange = true;
1011 continue;
1012 }
1013
1014 break;
1015 }
1016 case X86ISD::VBROADCAST_LOAD: {
1017 MVT VT = N->getSimpleValueType(0);
1018 // Emulate v32i16/v64i8 broadcast without BWI.
1019 if (!Subtarget->hasBWI() && needBWI(VT)) {
1020 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1021 auto *MemNode = cast<MemSDNode>(N);
1022 SDLoc dl(N);
1023 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1024 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1025 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1026 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1027 MemNode->getMemOperand());
1028 SDValue Res =
1029 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1030 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1031 unsigned Index = NarrowVT.getVectorMinNumElements();
1032 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1033 CurDAG->getIntPtrConstant(Index, dl));
1034
1035 --I;
1036 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1037 CurDAG->ReplaceAllUsesWith(N, To);
1038 ++I;
1039 MadeChange = true;
1040 continue;
1041 }
1042
1043 break;
1044 }
1045 case ISD::LOAD: {
1046 // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1047 // load, then just extract the lower subvector and avoid the second load.
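// For example, a v4f32 load and a v8f32 load of the same pointer on the
// same chain can share the wider load: the v4f32 value is rebuilt as a
// bitcast of an EXTRACT_SUBVECTOR of the wider load's low elements.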
1048 auto *Ld = cast<LoadSDNode>(N);
1049 MVT VT = N->getSimpleValueType(0);
1050 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1051 !(VT.is128BitVector() || VT.is256BitVector()))
1052 break;
1053
1054 MVT MaxVT = VT;
1055 SDNode *MaxLd = nullptr;
1056 SDValue Ptr = Ld->getBasePtr();
1057 SDValue Chain = Ld->getChain();
1058 for (SDNode *User : Ptr->uses()) {
1059 auto *UserLd = dyn_cast<LoadSDNode>(User);
1060 MVT UserVT = User->getSimpleValueType(0);
1061 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1062 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1063 !User->hasAnyUseOfValue(1) &&
1064 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1065 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1066 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1067 MaxLd = User;
1068 MaxVT = UserVT;
1069 }
1070 }
1071 if (MaxLd) {
1072 SDLoc dl(N);
1073 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1074 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1075 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1076 SDValue(MaxLd, 0),
1077 CurDAG->getIntPtrConstant(0, dl));
1078 SDValue Res = CurDAG->getBitcast(VT, Extract);
1079
1080 --I;
1081 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1082 CurDAG->ReplaceAllUsesWith(N, To);
1083 ++I;
1084 MadeChange = true;
1085 continue;
1086 }
1087 break;
1088 }
1089 case ISD::VSELECT: {
1090 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1091 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1092 if (EleVT == MVT::i1)
1093 break;
1094
1095 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1096 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1097 "We can't replace VSELECT with BLENDV in vXi16!");
1098 SDValue R;
1099 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1100 EleVT.getSizeInBits()) {
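// Immediate 0xCA is the VPTERNLOG truth table for a bitwise select:
// per bit, (A & B) | (~A & C), i.e. "cond ? tval : fval".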
1101 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1102 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1103 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1104 } else {
1105 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1106 N->getOperand(0), N->getOperand(1),
1107 N->getOperand(2));
1108 }
1109 --I;
1110 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1111 ++I;
1112 MadeChange = true;
1113 continue;
1114 }
1115 case ISD::FP_ROUND:
1116 case ISD::STRICT_FP_ROUND:
1117 case ISD::FP_TO_SINT:
1118 case ISD::FP_TO_UINT:
1119 case ISD::STRICT_FP_TO_SINT:
1120 case ISD::STRICT_FP_TO_UINT: {
1121 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1122 // don't need 2 sets of patterns.
1123 if (!N->getSimpleValueType(0).isVector())
1124 break;
1125
1126 unsigned NewOpc;
1127 switch (N->getOpcode()) {
1128 default: llvm_unreachable("Unexpected opcode!");
1129 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1130 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1131 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1132 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1133 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1134 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1135 }
1136 SDValue Res;
1137 if (N->isStrictFPOpcode())
1138 Res =
1139 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1140 {N->getOperand(0), N->getOperand(1)});
1141 else
1142 Res =
1143 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1144 N->getOperand(0));
1145 --I;
1146 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1147 ++I;
1148 MadeChange = true;
1149 continue;
1150 }
1151 case ISD::SHL:
1152 case ISD::SRA:
1153 case ISD::SRL: {
1154 // Replace vector shifts with their X86 specific equivalent so we don't
1155 // need 2 sets of patterns.
1156 if (!N->getValueType(0).isVector())
1157 break;
1158
1159 unsigned NewOpc;
1160 switch (N->getOpcode()) {
1161 default: llvm_unreachable("Unexpected opcode!");
1162 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1163 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1164 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1165 }
1166 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1167 N->getOperand(0), N->getOperand(1));
1168 --I;
1169 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1170 ++I;
1171 MadeChange = true;
1172 continue;
1173 }
1174 case ISD::ANY_EXTEND:
1175 case ISD::ANY_EXTEND_VECTOR_INREG: {
1176 // Replace vector any extend with the zero extend equivalents so we don't
1177 // need 2 sets of patterns. Ignore vXi1 extensions.
1178 if (!N->getValueType(0).isVector())
1179 break;
1180
1181 unsigned NewOpc;
1182 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1183 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1184 "Unexpected opcode for mask vector!");
1185 NewOpc = ISD::SIGN_EXTEND;
1186 } else {
1187 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1188 ? ISD::ZERO_EXTEND
1189 : ISD::ZERO_EXTEND_VECTOR_INREG;
1190 }
1191
1192 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1193 N->getOperand(0));
1194 --I;
1195 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1196 ++I;
1197 MadeChange = true;
1198 continue;
1199 }
1200 case ISD::FCEIL:
1201 case ISD::STRICT_FCEIL:
1202 case ISD::FFLOOR:
1203 case ISD::STRICT_FFLOOR:
1204 case ISD::FTRUNC:
1205 case ISD::STRICT_FTRUNC:
1206 case ISD::FROUNDEVEN:
1207 case ISD::STRICT_FROUNDEVEN:
1208 case ISD::FNEARBYINT:
1209 case ISD::STRICT_FNEARBYINT:
1210 case ISD::FRINT:
1211 case ISD::STRICT_FRINT: {
1212 // Replace fp rounding with their X86 specific equivalent so we don't
1213 // need 2 sets of patterns.
1214 unsigned Imm;
1215 switch (N->getOpcode()) {
1216 default: llvm_unreachable("Unexpected opcode!");
1217 case ISD::STRICT_FCEIL:
1218 case ISD::FCEIL: Imm = 0xA; break;
1219 case ISD::STRICT_FFLOOR:
1220 case ISD::FFLOOR: Imm = 0x9; break;
1221 case ISD::STRICT_FTRUNC:
1222 case ISD::FTRUNC: Imm = 0xB; break;
1223 case ISD::STRICT_FROUNDEVEN:
1224 case ISD::FROUNDEVEN: Imm = 0x8; break;
1225 case ISD::STRICT_FNEARBYINT:
1226 case ISD::FNEARBYINT: Imm = 0xC; break;
1227 case ISD::STRICT_FRINT:
1228 case ISD::FRINT: Imm = 0x4; break;
1229 }
1230 SDLoc dl(N);
1231 bool IsStrict = N->isStrictFPOpcode();
1232 SDValue Res;
1233 if (IsStrict)
1234 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1235 {N->getValueType(0), MVT::Other},
1236 {N->getOperand(0), N->getOperand(1),
1237 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1238 else
1239 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1240 N->getOperand(0),
1241 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1242 --I;
1243 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1244 ++I;
1245 MadeChange = true;
1246 continue;
1247 }
1248 case X86ISD::FANDN:
1249 case X86ISD::FAND:
1250 case X86ISD::FOR:
1251 case X86ISD::FXOR: {
1252 // Widen scalar fp logic ops to vector to reduce isel patterns.
1253 // FIXME: Can we do this during lowering/combine.
1254 MVT VT = N->getSimpleValueType(0);
1255 if (VT.isVector() || VT == MVT::f128)
1256 break;
1257
1258 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1259 : VT == MVT::f32 ? MVT::v4f32
1260 : MVT::v8f16;
1261
1262 SDLoc dl(N);
1263 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1264 N->getOperand(0));
1265 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1266 N->getOperand(1));
1267
1268 SDValue Res;
1269 if (Subtarget->hasSSE2()) {
1270 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1271 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1272 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1273 unsigned Opc;
1274 switch (N->getOpcode()) {
1275 default: llvm_unreachable("Unexpected opcode!");
1276 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1277 case X86ISD::FAND: Opc = ISD::AND; break;
1278 case X86ISD::FOR: Opc = ISD::OR; break;
1279 case X86ISD::FXOR: Opc = ISD::XOR; break;
1280 }
1281 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1282 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1283 } else {
1284 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1285 }
1286 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1287 CurDAG->getIntPtrConstant(0, dl));
1288 --I;
1289 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1290 ++I;
1291 MadeChange = true;
1292 continue;
1293 }
1294 }
1295
1296 if (OptLevel != CodeGenOptLevel::None &&
1297 // Only do this when the target can fold the load into the call or
1298 // jmp.
1299 !Subtarget->useIndirectThunkCalls() &&
1300 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1301 (N->getOpcode() == X86ISD::TC_RETURN &&
1302 (Subtarget->is64Bit() ||
1303 !getTargetMachine().isPositionIndependent())))) {
1304 /// Also try moving call address load from outside callseq_start to just
1305 /// before the call to allow it to be folded.
1306 ///
1307 /// [Load chain]
1308 /// ^
1309 /// |
1310 /// [Load]
1311 /// ^ ^
1312 /// | |
1313 /// / \--
1314 /// / |
1315 ///[CALLSEQ_START] |
1316 /// ^ |
1317 /// | |
1318 /// [LOAD/C2Reg] |
1319 /// | |
1320 /// \ /
1321 /// \ /
1322 /// [CALL]
1323 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1324 SDValue Chain = N->getOperand(0);
1325 SDValue Load = N->getOperand(1);
1326 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1327 continue;
1328 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1329 ++NumLoadMoved;
1330 MadeChange = true;
1331 continue;
1332 }
1333
1334 // Lower fpround and fpextend nodes that target the FP stack to be a store
1335 // and load through the stack. This is a gross hack. We would like to simply mark
1336 // these as being illegal, but when we do that, legalize produces these when
1337 // it expands calls, then expands these in the same legalize pass. We would
1338 // like dag combine to be able to hack on these between the call expansion
1339 // and the node legalization. As such this pass basically does "really
1340 // late" legalization of these inline with the X86 isel pass.
1341 // FIXME: This should only happen when not compiled with -O0.
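// For example, with no SSE an f64 -> f32 FP_ROUND that is not marked
// value-preserving is rewritten here as an f32 truncating store to a stack
// temporary followed by a load of the same slot.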
1342 switch (N->getOpcode()) {
1343 default: continue;
1344 case ISD::FP_ROUND:
1345 case ISD::FP_EXTEND:
1346 {
1347 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1348 MVT DstVT = N->getSimpleValueType(0);
1349
1350 // If any of the sources are vectors, no fp stack involved.
1351 if (SrcVT.isVector() || DstVT.isVector())
1352 continue;
1353
1354 // If the source and destination are SSE registers, then this is a legal
1355 // conversion that should not be lowered.
1356 const X86TargetLowering *X86Lowering =
1357 static_cast<const X86TargetLowering *>(TLI);
1358 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1359 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1360 if (SrcIsSSE && DstIsSSE)
1361 continue;
1362
1363 if (!SrcIsSSE && !DstIsSSE) {
1364 // If this is an FPStack extension, it is a noop.
1365 if (N->getOpcode() == ISD::FP_EXTEND)
1366 continue;
1367 // If this is a value-preserving FPStack truncation, it is a noop.
1368 if (N->getConstantOperandVal(1))
1369 continue;
1370 }
1371
1372 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1373 // FPStack has extload and truncstore. SSE can fold direct loads into other
1374 // operations. Based on this, decide what we want to do.
1375 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1376 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1377 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1378 MachinePointerInfo MPI =
1379 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1380 SDLoc dl(N);
1381
1382 // FIXME: optimize the case where the src/dest is a load or store?
1383
1384 SDValue Store = CurDAG->getTruncStore(
1385 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1386 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1387 MemTmp, MPI, MemVT);
1388
1389 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1390 // extload we created. This will cause general havoc on the dag because
1391 // anything below the conversion could be folded into other existing nodes.
1392 // To avoid invalidating 'I', back it up to the convert node.
1393 --I;
1394 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1395 break;
1396 }
1397
1398 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1399 // dealing with the chain differently, as there is already a preexisting chain.
1400 case ISD::STRICT_FP_ROUND:
1401 case ISD::STRICT_FP_EXTEND:
1402 {
1403 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1404 MVT DstVT = N->getSimpleValueType(0);
1405
1406 // If any of the sources are vectors, no fp stack involved.
1407 if (SrcVT.isVector() || DstVT.isVector())
1408 continue;
1409
1410 // If the source and destination are SSE registers, then this is a legal
1411 // conversion that should not be lowered.
1412 const X86TargetLowering *X86Lowering =
1413 static_cast<const X86TargetLowering *>(TLI);
1414 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1415 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1416 if (SrcIsSSE && DstIsSSE)
1417 continue;
1418
1419 if (!SrcIsSSE && !DstIsSSE) {
1420 // If this is an FPStack extension, it is a noop.
1421 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1422 continue;
1423 // If this is a value-preserving FPStack truncation, it is a noop.
1424 if (N->getConstantOperandVal(2))
1425 continue;
1426 }
1427
1428 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1429 // FPStack has extload and truncstore. SSE can fold direct loads into other
1430 // operations. Based on this, decide what we want to do.
1431 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1432 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1433 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1434 MachinePointerInfo MPI =
1435 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1436 SDLoc dl(N);
1437
1438 // FIXME: optimize the case where the src/dest is a load or store?
1439
1440 // Since the operation is StrictFP, use the preexisting chain.
1441 SDValue Store, Result;
1442 if (!SrcIsSSE) {
1443 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1444 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1445 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1446 MPI, /*Align*/ std::nullopt,
1447 MachineMemOperand::MOStore);
1448 if (N->getFlags().hasNoFPExcept()) {
1449 SDNodeFlags Flags = Store->getFlags();
1450 Flags.setNoFPExcept(true);
1451 Store->setFlags(Flags);
1452 }
1453 } else {
1454 assert(SrcVT == MemVT && "Unexpected VT!");
1455 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1456 MPI);
1457 }
1458
1459 if (!DstIsSSE) {
1460 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1461 SDValue Ops[] = {Store, MemTmp};
1462 Result = CurDAG->getMemIntrinsicNode(
1463 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1464 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1465 if (N->getFlags().hasNoFPExcept()) {
1466 SDNodeFlags Flags = Result->getFlags();
1467 Flags.setNoFPExcept(true);
1468 Result->setFlags(Flags);
1469 }
1470 } else {
1471 assert(DstVT == MemVT && "Unexpected VT!");
1472 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1473 }
1474
1475 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1476 // extload we created. This will cause general havoc on the dag because
1477 // anything below the conversion could be folded into other existing nodes.
1478 // To avoid invalidating 'I', back it up to the convert node.
1479 --I;
1480 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1481 break;
1482 }
1483 }
1484
1485
1486 // Now that we did that, the node is dead. Increment the iterator to the
1487 // next node to process, then delete N.
1488 ++I;
1489 MadeChange = true;
1490 }
1491
1492 // Remove any dead nodes that may have been left behind.
1493 if (MadeChange)
1494 CurDAG->RemoveDeadNodes();
1495}
1496
1497// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
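// For example, after an 8-bit divrem the remainder lives in AH and is copied
// out through a MOVZX32rr8_NOREX/MOVSX32rr8_NOREX; if that value is then
// truncated back to 8 bits and extended again, the second extend is redundant
// and can reuse the first one (adding only a MOVSX64rr32 for the 64-bit case).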
1498bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1499 unsigned Opc = N->getMachineOpcode();
1500 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1501 Opc != X86::MOVSX64rr8)
1502 return false;
1503
1504 SDValue N0 = N->getOperand(0);
1505
1506 // We need to be extracting the low 8-bit subreg of an extend.
1507 if (!N0.isMachineOpcode() ||
1508 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1509 N0.getConstantOperandVal(1) != X86::sub_8bit)
1510 return false;
1511
1512 // We're looking for either a movsx or movzx to match the original opcode.
1513 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1514 : X86::MOVSX32rr8_NOREX;
1515 SDValue N00 = N0.getOperand(0);
1516 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1517 return false;
1518
1519 if (Opc == X86::MOVSX64rr8) {
1520 // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1521 // to 64.
1522 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1523 MVT::i64, N00);
1524 ReplaceUses(N, Extend);
1525 } else {
1526 // Ok we can drop this extend and just use the original extend.
1527 ReplaceUses(N, N00.getNode());
1528 }
1529
1530 return true;
1531}
1532
1533void X86DAGToDAGISel::PostprocessISelDAG() {
1534 // Skip peepholes at -O0.
1535 if (TM.getOptLevel() == CodeGenOptLevel::None)
1536 return;
1537
1538 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1539
1540 bool MadeChange = false;
1541 while (Position != CurDAG->allnodes_begin()) {
1542 SDNode *N = &*--Position;
1543 // Skip dead nodes and any non-machine opcodes.
1544 if (N->use_empty() || !N->isMachineOpcode())
1545 continue;
1546
1547 if (tryOptimizeRem8Extend(N)) {
1548 MadeChange = true;
1549 continue;
1550 }
1551
1552 // Look for a TESTrr+ANDrr pattern where both operands of the test are
1553 // the same. Rewrite to remove the AND.
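// For example:
//   %and = AND32rr %a, %b
//   TEST32rr %and, %and
// becomes "TEST32rr %a, %b" when %and has no other uses and the AND's
// EFLAGS result is unused.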
1554 unsigned Opc = N->getMachineOpcode();
1555 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
1556 Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
1557 N->getOperand(0) == N->getOperand(1) &&
1558 N->getOperand(0)->hasNUsesOfValue(2, N->getOperand(0).getResNo()) &&
1559 N->getOperand(0).isMachineOpcode()) {
1560 SDValue And = N->getOperand(0);
1561 unsigned N0Opc = And.getMachineOpcode();
1562 if ((N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
1563 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) &&
1564 !And->hasAnyUseOfValue(1)) {
1565 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
1566 MVT::i32,
1567 And.getOperand(0),
1568 And.getOperand(1));
1569 ReplaceUses(N, Test);
1570 MadeChange = true;
1571 continue;
1572 }
1573 if ((N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
1574 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) &&
1575 !And->hasAnyUseOfValue(1)) {
1576 unsigned NewOpc;
1577 switch (N0Opc) {
1578 case X86::AND8rm: NewOpc = X86::TEST8mr; break;
1579 case X86::AND16rm: NewOpc = X86::TEST16mr; break;
1580 case X86::AND32rm: NewOpc = X86::TEST32mr; break;
1581 case X86::AND64rm: NewOpc = X86::TEST64mr; break;
1582 }
1583
1584 // Need to swap the memory and register operand.
1585 SDValue Ops[] = { And.getOperand(1),
1586 And.getOperand(2),
1587 And.getOperand(3),
1588 And.getOperand(4),
1589 And.getOperand(5),
1590 And.getOperand(0),
1591 And.getOperand(6) /* Chain */ };
1592 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1593 MVT::i32, MVT::Other, Ops);
1594 CurDAG->setNodeMemRefs(
1595 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1596 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1597 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1598 MadeChange = true;
1599 continue;
1600 }
1601 }
1602
1603 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1604 // used. We're doing this late so we can prefer to fold the AND into masked
1605 // comparisons. Doing that can be better for the live range of the mask
1606 // register.
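// For example, (KORTESTWrr (KANDWrr k1, k2), (KANDWrr k1, k2)) whose result
// is only used for its zero flag becomes (KTESTWrr k1, k2); KTESTW requires
// AVX512DQ, hence the hasDQI() check below.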
1607 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
1608 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
1609 N->getOperand(0) == N->getOperand(1) &&
1610 N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1611 N->getOperand(0).isMachineOpcode() &&
1612 onlyUsesZeroFlag(SDValue(N, 0))) {
1613 SDValue And = N->getOperand(0);
1614 unsigned N0Opc = And.getMachineOpcode();
1615 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1616 // KAND instructions and KTEST use the same ISA feature.
1617 if (N0Opc == X86::KANDBrr ||
1618 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
1619 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
1620 unsigned NewOpc;
1621 switch (Opc) {
1622 default: llvm_unreachable("Unexpected opcode!");
1623 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
1624 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
1625 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
1626 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
1627 }
1628 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1629 MVT::i32,
1630 And.getOperand(0),
1631 And.getOperand(1));
1632 ReplaceUses(N, KTest);
1633 MadeChange = true;
1634 continue;
1635 }
1636 }
1637
1638 // Attempt to remove vector moves that were inserted to zero the upper bits.
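// For example (illustrative):
//   (SUBREG_TO_REG 0, (VMOVAPSrr (VEX/EVEX-encoded producer)), sub_xmm)
// The VEX/EVEX producer already zeroed the upper bits of the wider register,
// so the intermediate move is redundant.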
1639 if (Opc != TargetOpcode::SUBREG_TO_REG)
1640 continue;
1641
1642 unsigned SubRegIdx = N->getConstantOperandVal(2);
1643 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1644 continue;
1645
1646 SDValue Move = N->getOperand(1);
1647 if (!Move.isMachineOpcode())
1648 continue;
1649
1650 // Make sure it's one of the move opcodes we recognize.
1651 switch (Move.getMachineOpcode()) {
1652 default:
1653 continue;
1654 case X86::VMOVAPDrr: case X86::VMOVUPDrr:
1655 case X86::VMOVAPSrr: case X86::VMOVUPSrr:
1656 case X86::VMOVDQArr: case X86::VMOVDQUrr:
1657 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
1658 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
1659 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
1660 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
1661 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
1662 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1663 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1664 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
1665 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
1666 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1667 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1668 break;
1669 }
1670
1671 SDValue In = Move.getOperand(0);
1672 if (!In.isMachineOpcode() ||
1673 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1674 continue;
1675
1676 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This check
1677 // excludes the legacy-encoded SHA instructions, which don't zero upper bits.
1678 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1679 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1680 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1681 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1682 continue;
1683
1684 // The producing instruction is another vector instruction, so we can drop
1685 // the move.
1686 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1687 MadeChange = true;
1688 }
1689
1690 if (MadeChange)
1691 CurDAG->RemoveDeadNodes();
1692}
1693
1694
1695/// Emit any code that needs to be executed only in the main function.
1696void X86DAGToDAGISel::emitSpecialCodeForMain() {
1697 if (Subtarget->isTargetCygMing()) {
1698 TargetLowering::ArgListTy Args;
1699 auto &DL = CurDAG->getDataLayout();
1700
1701 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1702 CLI.setChain(CurDAG->getRoot())
1703 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1704 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1705 std::move(Args));
1706 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1707 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1708 CurDAG->setRoot(Result.second);
1709 }
1710}
1711
1712void X86DAGToDAGISel::emitFunctionEntryCode() {
1713 // If this is main, emit special code for main.
1714 const Function &F = MF->getFunction();
1715 if (F.hasExternalLinkage() && F.getName() == "main")
1716 emitSpecialCodeForMain();
1717}
1718
1719static bool isDispSafeForFrameIndex(int64_t Val) {
1720 // On 64-bit platforms, we can run into an issue where a frame index
1721 // includes a displacement that, when added to the explicit displacement,
1722 // will overflow the displacement field. Assuming that the frame index
1723 // displacement fits into a 31-bit integer (which is only slightly more
1724 // aggressive than the current fundamental assumption that it fits into
1725 // a 32-bit integer), a 31-bit disp should always be safe.
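// For example (illustrative): if both the frame object's offset and the
// explicit displacement fit in a signed 31-bit integer, their sum is
// guaranteed to fit in the signed 32-bit displacement field.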
1726 return isInt<31>(Val);
1727}
1728
1729bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1730 X86ISelAddressMode &AM) {
1731 // We may have already matched a displacement and the caller just added the
1732 // symbolic displacement. So we still need to do the checks even if Offset
1733 // is zero.
1734
1735 int64_t Val = AM.Disp + Offset;
1736
1737 // Cannot combine ExternalSymbol displacements with integer offsets.
1738 if (Val != 0 && (AM.ES || AM.MCSym))
1739 return true;
1740
1741 CodeModel::Model M = TM.getCodeModel();
1742 if (Subtarget->is64Bit()) {
1743 if (Val != 0 &&
1744 !X86::isOffsetSuitableForCodeModel(Val, M,
1745 AM.hasSymbolicDisplacement()))
1746 return true;
1747 // In addition to the checks required for a register base, check that
1748 // we do not try to use an unsafe Disp with a frame index.
1749 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1750 !isDispSafeForFrameIndex(Val))
1751 return true;
1752 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1753 // 64 bits. Instructions with 32-bit register addresses perform this zero
1754 // extension for us and we can safely ignore the high bits of Offset.
1755 // Instructions with only a 32-bit immediate address do not, though: they
1756 // sign extend instead. This means only the low 2GB of the address space
1757 // is directly addressable; we need indirect addressing for the high 2GB of
1758 // address space.
1759 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1760 // implicit zero extension of instructions would cover up any problem.
1761 // However, we have asserts elsewhere that get triggered if we do, so keep
1762 // the checks for now.
1763 // TODO: We would actually be able to accept these, as well as the same
1764 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1765 // to get an address size override to be emitted. However, this
1766 // pseudo-register is not part of any register class and therefore causes
1767 // MIR verification to fail.
1768 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1769 !AM.hasBaseOrIndexReg())
1770 return true;
1771 }
1772 AM.Disp = Val;
1773 return false;
1774}
1775
1776bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1777 bool AllowSegmentRegForX32) {
1778 SDValue Address = N->getOperand(1);
1779
1780 // load gs:0 -> GS segment register.
1781 // load fs:0 -> FS segment register.
1782 //
1783 // This optimization is generally valid because the GNU TLS model defines that
1784 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1785 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1786 // zero-extended to 64 bits and then added to the base address, which gives
1787 // unwanted results when the register holds a negative value.
1788 // For more information see http://people.redhat.com/drepper/tls.pdf
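// For example (illustrative IR):
//   %self = load ptr, ptr addrspace(256) null ; address space 256 is GS
// can be selected as a zero-displacement load with a GS segment override.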
1789 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1790 !IndirectTlsSegRefs &&
1791 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1792 Subtarget->isTargetFuchsia())) {
1793 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1794 return true;
1795 switch (N->getPointerInfo().getAddrSpace()) {
1796 case X86AS::GS:
1797 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1798 return false;
1799 case X86AS::FS:
1800 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1801 return false;
1802 // Address space X86AS::SS is not handled here, because it is not used to
1803 // address TLS areas.
1804 }
1805 }
1806
1807 return true;
1808}
1809
1810/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1811/// mode. These wrap things that will resolve down into a symbol reference.
1812/// If no match is possible, this returns true, otherwise it returns false.
1813bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1814 // If the addressing mode already has a symbol as the displacement, we can
1815 // never match another symbol.
1816 if (AM.hasSymbolicDisplacement())
1817 return true;
1818
1819 bool IsRIPRelTLS = false;
1820 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1821 if (IsRIPRel) {
1822 SDValue Val = N.getOperand(0);
1823 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1824 IsRIPRelTLS = true;
1825 }
1826
1827 // We can't use an addressing mode in the 64-bit large code model.
1828 // Global TLS addressing is an exception. In the medium code model,
1829 // we can use such a mode when RIP wrappers are present.
1830 // That signifies access to globals that are known to be "near",
1831 // such as the GOT itself.
1832 CodeModel::Model M = TM.getCodeModel();
1833 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1834 return true;
1835
1836 // Base and index reg must be 0 in order to use %rip as base.
1837 if (IsRIPRel && AM.hasBaseOrIndexReg())
1838 return true;
1839
1840 // Make a local copy in case we can't do this fold.
1841 X86ISelAddressMode Backup = AM;
1842
1843 int64_t Offset = 0;
1844 SDValue N0 = N.getOperand(0);
1845 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1846 AM.GV = G->getGlobal();
1847 AM.SymbolFlags = G->getTargetFlags();
1848 Offset = G->getOffset();
1849 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1850 AM.CP = CP->getConstVal();
1851 AM.Alignment = CP->getAlign();
1852 AM.SymbolFlags = CP->getTargetFlags();
1853 Offset = CP->getOffset();
1854 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1855 AM.ES = S->getSymbol();
1856 AM.SymbolFlags = S->getTargetFlags();
1857 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1858 AM.MCSym = S->getMCSymbol();
1859 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1860 AM.JT = J->getIndex();
1861 AM.SymbolFlags = J->getTargetFlags();
1862 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1863 AM.BlockAddr = BA->getBlockAddress();
1864 AM.SymbolFlags = BA->getTargetFlags();
1865 Offset = BA->getOffset();
1866 } else
1867 llvm_unreachable("Unhandled symbol reference node.");
1868
1869 // Can't use an addressing mode with large globals.
1870 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1871 TM.isLargeGlobalValue(AM.GV)) {
1872 AM = Backup;
1873 return true;
1874 }
1875
1876 if (foldOffsetIntoAddress(Offset, AM)) {
1877 AM = Backup;
1878 return true;
1879 }
1880
1881 if (IsRIPRel)
1882 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1883
1884 // Commit the changes now that we know this fold is safe.
1885 return false;
1886}
1887
1888/// Add the specified node to the specified addressing mode, returning true if
1889/// it cannot be done. This just pattern matches for the addressing mode.
1890bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1891 if (matchAddressRecursively(N, AM, 0))
1892 return true;
1893
1894 // Post-processing: Make a second attempt to fold a load, if we now know
1895 // that there will not be any other register. This is only performed for
1896 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1897 // any foldable load the first time.
1898 if (Subtarget->isTarget64BitILP32() &&
1899 AM.BaseType == X86ISelAddressMode::RegBase &&
1900 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1901 SDValue Save_Base_Reg = AM.Base_Reg;
1902 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1903 AM.Base_Reg = SDValue();
1904 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1905 AM.Base_Reg = Save_Base_Reg;
1906 }
1907 }
1908
1909 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1910 // a smaller encoding and avoids a scaled-index.
1911 if (AM.Scale == 2 &&
1912 AM.BaseType == X86ISelAddressMode::RegBase &&
1913 AM.Base_Reg.getNode() == nullptr) {
1914 AM.Base_Reg = AM.IndexReg;
1915 AM.Scale = 1;
1916 }
1917
1918 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1919 // because it has a smaller encoding.
1920 if (TM.getCodeModel() != CodeModel::Large &&
1921 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
1922 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
1923 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
1924 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
1925 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1926 }
1927
1928 return false;
1929}
1930
1931bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1932 unsigned Depth) {
1933 // Add an artificial use to this node so that we can keep track of
1934 // it if it gets CSE'd with a different node.
1935 HandleSDNode Handle(N);
1936
1937 X86ISelAddressMode Backup = AM;
1938 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1939 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1940 return false;
1941 AM = Backup;
1942
1943 // Try again after commuting the operands.
1944 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
1945 Depth + 1) &&
1946 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
1947 return false;
1948 AM = Backup;
1949
1950 // If we couldn't fold both operands into the address at the same time,
1951 // see if we can just put each operand into a register and fold at least
1952 // the add.
1953 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1954 !AM.Base_Reg.getNode() &&
1955 !AM.IndexReg.getNode()) {
1956 N = Handle.getValue();
1957 AM.Base_Reg = N.getOperand(0);
1958 AM.IndexReg = N.getOperand(1);
1959 AM.Scale = 1;
1960 return false;
1961 }
1962 N = Handle.getValue();
1963 return true;
1964}
1965
1966// Insert a node into the DAG at least before the Pos node's position. This
1967// will reposition the node as needed, and will assign it a node ID that is <=
1968// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1969// IDs! The selection DAG must no longer depend on their uniqueness when this
1970// is used.
1971static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1972 if (N->getNodeId() == -1 ||
1973 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
1974 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
1975 DAG.RepositionNode(Pos->getIterator(), N.getNode());
1976 // Mark Node as invalid for pruning as after this it may be a successor to a
1977 // selected node but otherwise be in the same position of Pos.
1978 // Conservatively mark it with the same -abs(Id) to assure node id
1979 // invariant is preserved.
1980 N->setNodeId(Pos->getNodeId());
1981 SelectionDAGISel::InvalidateNodeId(N.getNode());
1982 }
1983}
1984
1985// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1986// safe. This allows us to convert the shift and AND into an h-register
1987// extract and a scaled index. Returns false if the simplification is
1988// performed.
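// For example (illustrative, with C1 == 2):
//   (and (srl x, 6), 0x3fc) --> (shl (and (srl x, 8), 0xff), 2)
// where (and (srl x, 8), 0xff) becomes an h-register extract (e.g. from %ah)
// and the shl becomes a scale-4 index.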
1989static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
1990 uint64_t Mask,
1991 SDValue Shift, SDValue X,
1992 X86ISelAddressMode &AM) {
1993 if (Shift.getOpcode() != ISD::SRL ||
1994 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1995 !Shift.hasOneUse())
1996 return true;
1997
1998 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
1999 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2000 Mask != (0xffu << ScaleLog))
2001 return true;
2002
2003 MVT XVT = X.getSimpleValueType();
2004 MVT VT = N.getSimpleValueType();
2005 SDLoc DL(N);
2006 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2007 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2008 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2009 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2010 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2011 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2012 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2013
2014 // Insert the new nodes into the topological ordering. We must do this in
2015 // a valid topological ordering as nothing is going to go back and re-sort
2016 // these nodes. We continually insert before 'N' in sequence as this is
2017 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2018 // hierarchy left to express.
2019 insertDAGNode(DAG, N, Eight);
2020 insertDAGNode(DAG, N, NewMask);
2021 insertDAGNode(DAG, N, Srl);
2022 insertDAGNode(DAG, N, And);
2023 insertDAGNode(DAG, N, Ext);
2024 insertDAGNode(DAG, N, ShlCount);
2025 insertDAGNode(DAG, N, Shl);
2026 DAG.ReplaceAllUsesWith(N, Shl);
2027 DAG.RemoveDeadNode(N.getNode());
2028 AM.IndexReg = Ext;
2029 AM.Scale = (1 << ScaleLog);
2030 return false;
2031}
2032
2033// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2034// allows us to fold the shift into this addressing mode. Returns false if the
2035// transform succeeded.
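// For example (illustrative, with C1 == 2, C2 == 0x3fc):
//   (and (shl x, 2), 0x3fc) --> (shl (and x, 0xff), 2)
// letting the shl be absorbed as a scale-4 index.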
2036static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2037 X86ISelAddressMode &AM) {
2038 SDValue Shift = N.getOperand(0);
2039
2040 // Use a signed mask so that shifting right will insert sign bits. These
2041 // bits will be removed when we shift the result left so it doesn't matter
2042 // what we use. This might allow a smaller immediate encoding.
2043 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2044
2045 // If we have an any_extend feeding the AND, look through it to see if there
2046 // is a shift behind it. But only if the AND doesn't use the extended bits.
2047 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2048 bool FoundAnyExtend = false;
2049 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2050 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2051 isUInt<32>(Mask)) {
2052 FoundAnyExtend = true;
2053 Shift = Shift.getOperand(0);
2054 }
2055
2056 if (Shift.getOpcode() != ISD::SHL ||
2057 !isa<ConstantSDNode>(Shift.getOperand(1)))
2058 return true;
2059
2060 SDValue X = Shift.getOperand(0);
2061
2062 // Not likely to be profitable if either the AND or SHIFT node has more
2063 // than one use (unless all uses are for address computation). Besides,
2064 // the isel mechanism requires their node ids to be reused.
2065 if (!N.hasOneUse() || !Shift.hasOneUse())
2066 return true;
2067
2068 // Verify that the shift amount is something we can fold.
2069 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2070 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2071 return true;
2072
2073 MVT VT = N.getSimpleValueType();
2074 SDLoc DL(N);
2075 if (FoundAnyExtend) {
2076 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2077 insertDAGNode(DAG, N, NewX);
2078 X = NewX;
2079 }
2080
2081 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
2082 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2083 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2084
2085 // Insert the new nodes into the topological ordering. We must do this in
2086 // a valid topological ordering as nothing is going to go back and re-sort
2087 // these nodes. We continually insert before 'N' in sequence as this is
2088 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2089 // hierarchy left to express.
2090 insertDAGNode(DAG, N, NewMask);
2091 insertDAGNode(DAG, N, NewAnd);
2092 insertDAGNode(DAG, N, NewShift);
2093 DAG.ReplaceAllUsesWith(N, NewShift);
2094 DAG.RemoveDeadNode(N.getNode());
2095
2096 AM.Scale = 1 << ShiftAmt;
2097 AM.IndexReg = NewAnd;
2098 return false;
2099}
2100
2101// Implement some heroics to detect shifts of masked values where the mask can
2102// be replaced by extending the shift and undoing that in the addressing mode
2103// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2104// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2105// the addressing mode. This results in code such as:
2106//
2107// int f(short *y, int *lookup_table) {
2108// ...
2109// return *y + lookup_table[*y >> 11];
2110// }
2111//
2112// Turning into:
2113// movzwl (%rdi), %eax
2114// movl %eax, %ecx
2115// shrl $11, %ecx
2116// addl (%rsi,%rcx,4), %eax
2117//
2118// Instead of:
2119// movzwl (%rdi), %eax
2120// movl %eax, %ecx
2121// shrl $9, %ecx
2122// andl $124, %rcx
2123// addl (%rsi,%rcx), %eax
2124//
2125// Note that this function assumes the mask is provided as a mask *after* the
2126// value is shifted. The input chain may or may not match that, but computing
2127// such a mask is trivial.
2128static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2129 uint64_t Mask,
2130 SDValue Shift, SDValue X,
2131 X86ISelAddressMode &AM) {
2132 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2133 !isa<ConstantSDNode>(Shift.getOperand(1)))
2134 return true;
2135
2136 // We need to ensure that the mask is a contiguous run of bits.
2137 unsigned MaskIdx, MaskLen;
2138 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2139 return true;
2140 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2141
2142 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2143
2144 // The amount of shift we're trying to fit into the addressing mode is taken
2145 // from the shifted mask index (number of trailing zeros of the mask).
2146 unsigned AMShiftAmt = MaskIdx;
2147
2148 // There is nothing we can do here unless the mask is removing some bits.
2149 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2150 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2151
2152 // Scale the leading zero count down based on the actual size of the value.
2153 // Also scale it down based on the size of the shift.
2154 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2155 if (MaskLZ < ScaleDown)
2156 return true;
2157 MaskLZ -= ScaleDown;
2158
2159 // The final check is to ensure that any masked out high bits of X are
2160 // already known to be zero. Otherwise, the mask has a semantic impact
2161 // other than masking out a couple of low bits. Unfortunately, because of
2162 // the mask, zero extensions will be removed from operands in some cases.
2163 // This code works extra hard to look through extensions because we can
2164 // replace them with zero extensions cheaply if necessary.
2165 bool ReplacingAnyExtend = false;
2166 if (X.getOpcode() == ISD::ANY_EXTEND) {
2167 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2168 X.getOperand(0).getSimpleValueType().getSizeInBits();
2169 // Assume that we'll replace the any-extend with a zero-extend, and
2170 // narrow the search to the extended value.
2171 X = X.getOperand(0);
2172 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2173 ReplacingAnyExtend = true;
2174 }
2175 APInt MaskedHighBits =
2176 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2177 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2178 return true;
2179
2180 // We've identified a pattern that can be transformed into a single shift
2181 // and an addressing mode. Make it so.
2182 MVT VT = N.getSimpleValueType();
2183 if (ReplacingAnyExtend) {
2184 assert(X.getValueType() != VT);
2185 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2186 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2187 insertDAGNode(DAG, N, NewX);
2188 X = NewX;
2189 }
2190
2191 MVT XVT = X.getSimpleValueType();
2192 SDLoc DL(N);
2193 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2194 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2195 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2196 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2197 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2198
2199 // Insert the new nodes into the topological ordering. We must do this in
2200 // a valid topological ordering as nothing is going to go back and re-sort
2201 // these nodes. We continually insert before 'N' in sequence as this is
2202 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2203 // hierarchy left to express.
2204 insertDAGNode(DAG, N, NewSRLAmt);
2205 insertDAGNode(DAG, N, NewSRL);
2206 insertDAGNode(DAG, N, NewExt);
2207 insertDAGNode(DAG, N, NewSHLAmt);
2208 insertDAGNode(DAG, N, NewSHL);
2209 DAG.ReplaceAllUsesWith(N, NewSHL);
2210 DAG.RemoveDeadNode(N.getNode());
2211
2212 AM.Scale = 1 << AMShiftAmt;
2213 AM.IndexReg = NewExt;
2214 return false;
2215}
2216
2217// Transform "(X >> SHIFT) & (MASK << C1)" to
2218// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2219// matched to a BEXTR later. Returns false if the simplification is performed.
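// For example (illustrative, with SHIFT == 4, MASK == 0xff, C1 == 2):
//   (and (srl x, 4), 0x3fc) --> (shl (and (srl x, 6), 0xff), 2)
// where the inner srl+and can later match BEXTR and the shl becomes scale 4.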
2220static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2221 uint64_t Mask,
2222 SDValue Shift, SDValue X,
2223 X86ISelAddressMode &AM,
2224 const X86Subtarget &Subtarget) {
2225 if (Shift.getOpcode() != ISD::SRL ||
2226 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2227 !Shift.hasOneUse() || !N.hasOneUse())
2228 return true;
2229
2230 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2231 if (!Subtarget.hasTBM() &&
2232 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2233 return true;
2234
2235 // We need to ensure that the mask is a contiguous run of bits.
2236 unsigned MaskIdx, MaskLen;
2237 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2238 return true;
2239
2240 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2241
2242 // The amount of shift we're trying to fit into the addressing mode is taken
2243 // from the shifted mask index (number of trailing zeros of the mask).
2244 unsigned AMShiftAmt = MaskIdx;
2245
2246 // There is nothing we can do here unless the mask is removing some bits.
2247 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2248 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2249
2250 MVT XVT = X.getSimpleValueType();
2251 MVT VT = N.getSimpleValueType();
2252 SDLoc DL(N);
2253 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2254 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2255 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2256 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2257 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2258 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2259 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2260
2261 // Insert the new nodes into the topological ordering. We must do this in
2262 // a valid topological ordering as nothing is going to go back and re-sort
2263 // these nodes. We continually insert before 'N' in sequence as this is
2264 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2265 // hierarchy left to express.
2266 insertDAGNode(DAG, N, NewSRLAmt);
2267 insertDAGNode(DAG, N, NewSRL);
2268 insertDAGNode(DAG, N, NewMask);
2269 insertDAGNode(DAG, N, NewAnd);
2270 insertDAGNode(DAG, N, NewExt);
2271 insertDAGNode(DAG, N, NewSHLAmt);
2272 insertDAGNode(DAG, N, NewSHL);
2273 DAG.ReplaceAllUsesWith(N, NewSHL);
2274 DAG.RemoveDeadNode(N.getNode());
2275
2276 AM.Scale = 1 << AMShiftAmt;
2277 AM.IndexReg = NewExt;
2278 return false;
2279}
2280
2281// Attempt to peek further into a scaled index register, collecting additional
2282// extensions / offsets / etc. Returns \p N if we can't peek any further.
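// For example (illustrative): with AM.Scale already set to 4, an index of
// (add x, 3) can be folded as index x with 12 added to the displacement.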
2283SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2284 X86ISelAddressMode &AM,
2285 unsigned Depth) {
2286 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2287 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2288 "Illegal index scale");
2289
2290 // Limit recursion.
2291 if (Depth >= SelectionDAG::MaxRecursionDepth)
2292 return N;
2293
2294 EVT VT = N.getValueType();
2295 unsigned Opc = N.getOpcode();
2296
2297 // index: add(x,c) -> index: x, disp + c
2298 if (CurDAG->isBaseWithConstantOffset(N)) {
2299 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2300 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2301 if (!foldOffsetIntoAddress(Offset, AM))
2302 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2303 }
2304
2305 // index: add(x,x) -> index: x, scale * 2
2306 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2307 if (AM.Scale <= 4) {
2308 AM.Scale *= 2;
2309 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2310 }
2311 }
2312
2313 // index: shl(x,i) -> index: x, scale * (1 << i)
2314 if (Opc == X86ISD::VSHLI) {
2315 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2316 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2317 if ((AM.Scale * ScaleAmt) <= 8) {
2318 AM.Scale *= ScaleAmt;
2319 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2320 }
2321 }
2322
2323 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2324 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2325 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2326 SDValue Src = N.getOperand(0);
2327 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2328 Src.hasOneUse()) {
2329 if (CurDAG->isBaseWithConstantOffset(Src)) {
2330 SDValue AddSrc = Src.getOperand(0);
2331 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2332 uint64_t Offset = (uint64_t)AddVal->getSExtValue();
2333 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2334 SDLoc DL(N);
2335 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2336 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2337 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2338 insertDAGNode(*CurDAG, N, ExtSrc);
2339 insertDAGNode(*CurDAG, N, ExtVal);
2340 insertDAGNode(*CurDAG, N, ExtAdd);
2341 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2342 CurDAG->RemoveDeadNode(N.getNode());
2343 return ExtSrc;
2344 }
2345 }
2346 }
2347 }
2348
2349 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2350 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2351 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2352 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2353 SDValue Src = N.getOperand(0);
2354 unsigned SrcOpc = Src.getOpcode();
2355 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2356 CurDAG->isADDLike(Src)) &&
2357 Src.hasOneUse()) {
2358 if (CurDAG->isBaseWithConstantOffset(Src)) {
2359 SDValue AddSrc = Src.getOperand(0);
2360 uint64_t Offset = Src.getConstantOperandVal(1);
2361 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2362 SDLoc DL(N);
2363 SDValue Res;
2364 // If we're also scaling, see if we can use that as well.
2365 if (AddSrc.getOpcode() == ISD::SHL &&
2366 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2367 SDValue ShVal = AddSrc.getOperand(0);
2368 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2369 APInt HiBits =
2370 APInt::getHighBitsSet(AddSrc.getValueSizeInBits(), ShAmt);
2371 uint64_t ScaleAmt = 1ULL << ShAmt;
2372 if ((AM.Scale * ScaleAmt) <= 8 &&
2373 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2374 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2375 AM.Scale *= ScaleAmt;
2376 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2377 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2378 AddSrc.getOperand(1));
2379 insertDAGNode(*CurDAG, N, ExtShVal);
2380 insertDAGNode(*CurDAG, N, ExtShift);
2381 AddSrc = ExtShift;
2382 Res = ExtShVal;
2383 }
2384 }
2385 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2386 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2387 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2388 insertDAGNode(*CurDAG, N, ExtSrc);
2389 insertDAGNode(*CurDAG, N, ExtVal);
2390 insertDAGNode(*CurDAG, N, ExtAdd);
2391 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2392 CurDAG->RemoveDeadNode(N.getNode());
2393 return Res ? Res : ExtSrc;
2394 }
2395 }
2396 }
2397 }
2398
2399 // TODO: Handle extensions, shifted masks etc.
2400 return N;
2401}
2402
2403bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2404 unsigned Depth) {
2405 SDLoc dl(N);
2406 LLVM_DEBUG({
2407 dbgs() << "MatchAddress: ";
2408 AM.dump(CurDAG);
2409 });
2410 // Limit recursion.
2411 if (Depth >= SelectionDAG::MaxRecursionDepth)
2412 return matchAddressBase(N, AM);
2413
2414 // If this is already a %rip relative address, we can only merge immediates
2415 // into it. Instead of handling this in every case, we handle it here.
2416 // RIP relative addressing: %rip + 32-bit displacement!
2417 if (AM.isRIPRelative()) {
2418 // FIXME: JumpTable and ExternalSymbol address currently don't like
2419 // displacements. It isn't very important, but this should be fixed for
2420 // consistency.
2421 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2422 return true;
2423
2424 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2425 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2426 return false;
2427 return true;
2428 }
2429
2430 switch (N.getOpcode()) {
2431 default: break;
2432 case ISD::LOCAL_RECOVER: {
2433 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2434 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2435 // Use the symbol and don't prefix it.
2436 AM.MCSym = ESNode->getMCSymbol();
2437 return false;
2438 }
2439 break;
2440 }
2441 case ISD::Constant: {
2442 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2443 if (!foldOffsetIntoAddress(Val, AM))
2444 return false;
2445 break;
2446 }
2447
2448 case X86ISD::Wrapper:
2449 case X86ISD::WrapperRIP:
2450 if (!matchWrapper(N, AM))
2451 return false;
2452 break;
2453
2454 case ISD::LOAD:
2455 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2456 return false;
2457 break;
2458
2459 case ISD::FrameIndex:
2460 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2461 AM.Base_Reg.getNode() == nullptr &&
2462 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2463 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2464 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2465 return false;
2466 }
2467 break;
2468
2469 case ISD::SHL:
2470 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2471 break;
2472
2473 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2474 unsigned Val = CN->getZExtValue();
2475 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2476 // that the base operand remains free for further matching. If
2477 // the base doesn't end up getting used, a post-processing step
2478 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2479 if (Val == 1 || Val == 2 || Val == 3) {
2480 SDValue ShVal = N.getOperand(0);
2481 AM.Scale = 1 << Val;
2482 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2483 return false;
2484 }
2485 }
2486 break;
2487
2488 case ISD::SRL: {
2489 // Scale must not be used already.
2490 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2491
2492 // We only handle up to 64-bit values here as those are what matter for
2493 // addressing mode optimizations.
2494 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2495 "Unexpected value size!");
2496
2497 SDValue And = N.getOperand(0);
2498 if (And.getOpcode() != ISD::AND) break;
2499 SDValue X = And.getOperand(0);
2500
2501 // The mask used for the transform is expected to be post-shift, but we
2502 // found the shift first so just apply the shift to the mask before passing
2503 // it down.
2504 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2505 !isa<ConstantSDNode>(And.getOperand(1)))
2506 break;
2507 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2508
2509 // Try to fold the mask and shift into the scale, and return false if we
2510 // succeed.
2511 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2512 return false;
2513 break;
2514 }
2515
2516 case ISD::SMUL_LOHI:
2517 case ISD::UMUL_LOHI:
2518 // A mul_lohi where we need the low part can be folded as a plain multiply.
2519 if (N.getResNo() != 0) break;
2520 [[fallthrough]];
2521 case ISD::MUL:
2522 case X86ISD::MUL_IMM:
2523 // X*[3,5,9] -> X+X*[2,4,8]
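// For example (illustrative): x*9 selects as lea (%x,%x,8), i.e. Base = x,
// Index = x, Scale = 8.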
2524 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2525 AM.Base_Reg.getNode() == nullptr &&
2526 AM.IndexReg.getNode() == nullptr) {
2527 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2528 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2529 CN->getZExtValue() == 9) {
2530 AM.Scale = unsigned(CN->getZExtValue())-1;
2531
2532 SDValue MulVal = N.getOperand(0);
2533 SDValue Reg;
2534
2535 // Okay, we know that we have a scale by now. However, if the scaled
2536 // value is an add of something and a constant, we can fold the
2537 // constant into the disp field here.
2538 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2539 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2540 Reg = MulVal.getOperand(0);
2541 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2542 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2543 if (foldOffsetIntoAddress(Disp, AM))
2544 Reg = N.getOperand(0);
2545 } else {
2546 Reg = N.getOperand(0);
2547 }
2548
2549 AM.IndexReg = AM.Base_Reg = Reg;
2550 return false;
2551 }
2552 }
2553 break;
2554
2555 case ISD::SUB: {
2556 // Given A-B, if A can be completely folded into the address and
2557 // the index field is still unused, use -B as the index.
2558 // This is a win if A has multiple parts that can be folded into
2559 // the address. Also, this saves a mov if the base register has
2560 // other uses, since it avoids a two-address sub instruction; however,
2561 // it costs an additional mov if the index register has other uses.
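// For example (illustrative): for (GV + 8) - %b, the LHS folds into the
// symbol and displacement, %b becomes the index with NegateIndex set, and a
// NEG of %b is emitted later, giving roughly lea GV+8(,%b_neg,1).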
2562
2563 // Add an artificial use to this node so that we can keep track of
2564 // it if it gets CSE'd with a different node.
2565 HandleSDNode Handle(N);
2566
2567 // Test if the LHS of the sub can be folded.
2568 X86ISelAddressMode Backup = AM;
2569 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2570 N = Handle.getValue();
2571 AM = Backup;
2572 break;
2573 }
2574 N = Handle.getValue();
2575 // Test if the index field is free for use.
2576 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2577 AM = Backup;
2578 break;
2579 }
2580
2581 int Cost = 0;
2582 SDValue RHS = N.getOperand(1);
2583 // If the RHS involves a register with multiple uses, this
2584 // transformation incurs an extra mov, due to the neg instruction
2585 // clobbering its operand.
2586 if (!RHS.getNode()->hasOneUse() ||
2587 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2588 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2589 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2590 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2591 RHS.getOperand(0).getValueType() == MVT::i32))
2592 ++Cost;
2593 // If the base is a register with multiple uses, this
2594 // transformation may save a mov.
2595 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2596 !AM.Base_Reg.getNode()->hasOneUse()) ||
2597 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2598 --Cost;
2599 // If the folded LHS was interesting, this transformation saves
2600 // address arithmetic.
2601 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2602 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2603 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2604 --Cost;
2605 // If it doesn't look like it may be an overall win, don't do it.
2606 if (Cost >= 0) {
2607 AM = Backup;
2608 break;
2609 }
2610
2611 // Ok, the transformation is legal and appears profitable. Go for it.
2612 // Negation will be emitted later to avoid creating dangling nodes if this
2613 // was an unprofitable LEA.
2614 AM.IndexReg = RHS;
2615 AM.NegateIndex = true;
2616 AM.Scale = 1;
2617 return false;
2618 }
2619
2620 case ISD::OR:
2621 case ISD::XOR:
2622 // See if we can treat the OR/XOR node as an ADD node.
2623 if (!CurDAG->isADDLike(N))
2624 break;
2625 [[fallthrough]];
2626 case ISD::ADD:
2627 if (!matchAdd(N, AM, Depth))
2628 return false;
2629 break;
2630
2631 case ISD::AND: {
2632 // Perform some heroic transforms on an and of a constant-count shift
2633 // with a constant to enable use of the scaled offset field.
2634
2635 // Scale must not be used already.
2636 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2637
2638 // We only handle up to 64-bit values here as those are what matter for
2639 // addressing mode optimizations.
2640 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2641 "Unexpected value size!");
2642
2643 if (!isa<ConstantSDNode>(N.getOperand(1)))
2644 break;
2645
2646 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2647 SDValue Shift = N.getOperand(0);
2648 SDValue X = Shift.getOperand(0);
2649
2650 uint64_t Mask = N.getConstantOperandVal(1);
2651
2652 // Try to fold the mask and shift into an extract and scale.
2653 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2654 return false;
2655
2656 // Try to fold the mask and shift directly into the scale.
2657 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2658 return false;
2659
2660 // Try to fold the mask and shift into BEXTR and scale.
2661 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2662 return false;
2663 }
2664
2665 // Try to swap the mask and shift to place shifts which can be done as
2666 // a scale on the outside of the mask.
2667 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2668 return false;
2669
2670 break;
2671 }
2672 case ISD::ZERO_EXTEND: {
2673 // Try to widen a zexted shift left to the same size as its use, so we can
2674 // match the shift as a scale factor.
2675 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2676 break;
2677
2678 SDValue Src = N.getOperand(0);
2679
2680 // See if we can match a zext(addlike(x,c)).
2681 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2682 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2683 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2684 if (Index != N) {
2685 AM.IndexReg = Index;
2686 return false;
2687 }
2688
2689 // Peek through mask: zext(and(shl(x,c1),c2))
2690 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2691 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2692 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2693 Mask = MaskC->getAPIntValue();
2694 Src = Src.getOperand(0);
2695 }
2696
2697 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse()) {
2698 // Give up if the shift is not a valid scale factor [1,2,3].
2699 SDValue ShlSrc = Src.getOperand(0);
2700 SDValue ShlAmt = Src.getOperand(1);
2701 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2702 if (!ShAmtC)
2703 break;
2704 unsigned ShAmtV = ShAmtC->getZExtValue();
2705 if (ShAmtV > 3)
2706 break;
2707
2708 // The narrow shift must only shift out zero bits (it must be 'nuw').
2709 // That makes it safe to widen to the destination type.
2710 APInt HighZeros =
2711 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2712 if (!Src->getFlags().hasNoUnsignedWrap() &&
2713 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2714 break;
2715
2716 // zext (shl nuw i8 %x, C1) to i32
2717 // --> shl (zext i8 %x to i32), (zext C1)
2718 // zext (and (shl nuw i8 %x, C1), C2) to i32
2719 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2720 MVT SrcVT = ShlSrc.getSimpleValueType();
2721 MVT VT = N.getSimpleValueType();
2722 SDLoc DL(N);
2723
2724 SDValue Res = ShlSrc;
2725 if (!Mask.isAllOnes()) {
2726 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2727 insertDAGNode(*CurDAG, N, Res);
2728 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2729 insertDAGNode(*CurDAG, N, Res);
2730 }
2731 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2732 insertDAGNode(*CurDAG, N, Zext);
2733 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2734 insertDAGNode(*CurDAG, N, NewShl);
2735 CurDAG->ReplaceAllUsesWith(N, NewShl);
2736 CurDAG->RemoveDeadNode(N.getNode());
2737
2738 // Convert the shift to scale factor.
2739 AM.Scale = 1 << ShAmtV;
2740 // If matchIndexRecursively is not called here,
2741 // Zext may be replaced by other nodes but later still be used to call a
2742 // builder method.
2743 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2744 return false;
2745 }
2746
2747 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2748 // Try to fold the mask and shift into an extract and scale.
2749 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2750 Src.getOperand(0), AM))
2751 return false;
2752
2753 // Try to fold the mask and shift directly into the scale.
2754 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2755 Src.getOperand(0), AM))
2756 return false;
2757
2758 // Try to fold the mask and shift into BEXTR and scale.
2759 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2760 Src.getOperand(0), AM, *Subtarget))
2761 return false;
2762 }
2763
2764 break;
2765 }
2766 }
2767
2768 return matchAddressBase(N, AM);
2769}
2770
2771/// Helper for MatchAddress. Add the specified node to the
2772/// specified addressing mode without any further recursion.
2773bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2774 // Is the base register already occupied?
2775 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2776 // If so, check to see if the scale index register is set.
2777 if (!AM.IndexReg.getNode()) {
2778 AM.IndexReg = N;
2779 AM.Scale = 1;
2780 return false;
2781 }
2782
2783 // Otherwise, we cannot select it.
2784 return true;
2785 }
2786
2787 // Default, generate it as a register.
2788 AM.BaseType = X86ISelAddressMode::RegBase;
2789 AM.Base_Reg = N;
2790 return false;
2791}
2792
2793bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2794 X86ISelAddressMode &AM,
2795 unsigned Depth) {
2796 SDLoc dl(N);
2797 LLVM_DEBUG({
2798 dbgs() << "MatchVectorAddress: ";
2799 AM.dump(CurDAG);
2800 });
2801 // Limit recursion.
2802 if (Depth >= SelectionDAG::MaxRecursionDepth)
2803 return matchAddressBase(N, AM);
2804
2805 // TODO: Support other operations.
2806 switch (N.getOpcode()) {
2807 case ISD::Constant: {
2808 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2809 if (!foldOffsetIntoAddress(Val, AM))
2810 return false;
2811 break;
2812 }
2813 case X86ISD::Wrapper:
2814 if (!matchWrapper(N, AM))
2815 return false;
2816 break;
2817 case ISD::ADD: {
2818 // Add an artificial use to this node so that we can keep track of
2819 // it if it gets CSE'd with a different node.
2820 HandleSDNode Handle(N);
2821
2822 X86ISelAddressMode Backup = AM;
2823 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2824 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2825 Depth + 1))
2826 return false;
2827 AM = Backup;
2828
2829 // Try again after commuting the operands.
2830 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2831 Depth + 1) &&
2832 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2833 Depth + 1))
2834 return false;
2835 AM = Backup;
2836
2837 N = Handle.getValue();
2838 break;
2839 }
2840 }
2841
2842 return matchAddressBase(N, AM);
2843}
2844
2845/// Helper for selectVectorAddr. Handles things that can be folded into a
2846/// gather/scatter address. The index register and scale should have already
2847/// been handled.
2848bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2849 return matchVectorAddressRecursively(N, AM, 0);
2850}
2851
2852bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2853 SDValue IndexOp, SDValue ScaleOp,
2854 SDValue &Base, SDValue &Scale,
2855 SDValue &Index, SDValue &Disp,
2856 SDValue &Segment) {
2857 X86ISelAddressMode AM;
2858 AM.Scale = ScaleOp->getAsZExtVal();
2859
2860 // Attempt to match index patterns, as long as we're not relying on implicit
2861 // sign-extension, which is performed BEFORE scale.
2862 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2863 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2864 else
2865 AM.IndexReg = IndexOp;
2866
2867 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2868 if (AddrSpace == X86AS::GS)
2869 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2870 if (AddrSpace == X86AS::FS)
2871 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2872 if (AddrSpace == X86AS::SS)
2873 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2874
2875 SDLoc DL(BasePtr);
2876 MVT VT = BasePtr.getSimpleValueType();
2877
2878 // Try to match into the base and displacement fields.
2879 if (matchVectorAddress(BasePtr, AM))
2880 return false;
2881
2882 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2883 return true;
2884}
2885
2886/// Returns true if it is able to pattern match an addressing mode.
2887/// It returns the operands which make up the maximal addressing mode it can
2888/// match by reference.
2889///
2890/// Parent is the parent node of the addr operand that is being matched. It
2891/// is always a load, store, atomic node, or null. It is only null when
2892/// checking memory operands for inline asm nodes.
2893bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2894 SDValue &Scale, SDValue &Index,
2895 SDValue &Disp, SDValue &Segment) {
2896 X86ISelAddressMode AM;
2897
2898 if (Parent &&
2899 // These opcodes are all the nodes that have an "addr:$ptr" operand
2900 // that are not a MemSDNode, and thus don't have proper addrspace info.
2901 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2902 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2903 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2904 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2905 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2906 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2907 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2908 unsigned AddrSpace =
2909 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2910 if (AddrSpace == X86AS::GS)
2911 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2912 if (AddrSpace == X86AS::FS)
2913 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2914 if (AddrSpace == X86AS::SS)
2915 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2916 }
2917
2918 // Save the DL and VT before calling matchAddress, it can invalidate N.
2919 SDLoc DL(N);
2920 MVT VT = N.getSimpleValueType();
2921
2922 if (matchAddress(N, AM))
2923 return false;
2924
2925 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2926 return true;
2927}
2928
2929bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2930 // Cannot use 32 bit constants to reference objects in kernel code model.
2931 // Cannot use 32 bit constants to reference objects in large PIC mode since
2932 // GOTOFF is 64 bits.
2933 if (TM.getCodeModel() == CodeModel::Kernel ||
2934 (TM.getCodeModel() == CodeModel::Large && TM.isPositionIndependent()))
2935 return false;
2936
2937 // In static codegen with small code model, we can get the address of a label
2938 // into a register with 'movl'.
2939 if (N->getOpcode() != X86ISD::Wrapper)
2940 return false;
2941
2942 N = N.getOperand(0);
2943
2944 // At least GNU as does not accept 'movl' for TPOFF relocations.
2945 // FIXME: We could use 'movl' when we know we are targeting MC.
2946 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2947 return false;
2948
2949 Imm = N;
2950 // Small/medium code model can reference non-TargetGlobalAddress objects with
2951 // 32 bit constants.
2952 if (N->getOpcode() != ISD::TargetGlobalAddress) {
2953 return TM.getCodeModel() == CodeModel::Small ||
2954 TM.getCodeModel() == CodeModel::Medium;
2955 }
2956
2957 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
2958 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
2959 return CR->getUnsignedMax().ult(1ull << 32);
2960
2961 return !TM.isLargeGlobalValue(GV);
2962}
2963
2964bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2965 SDValue &Scale, SDValue &Index,
2966 SDValue &Disp, SDValue &Segment) {
2967 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2968 SDLoc DL(N);
2969
2970 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2971 return false;
2972
2973 auto *RN = dyn_cast<RegisterSDNode>(Base);
2974 if (RN && RN->getReg() == 0)
2975 Base = CurDAG->getRegister(0, MVT::i64);
2976 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2977 // Base could already be %rip, particularly in the x32 ABI.
2978 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2979 MVT::i64), 0);
2980 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2981 Base);
2982 }
2983
2984 RN = dyn_cast<RegisterSDNode>(Index);
2985 if (RN && RN->getReg() == 0)
2986 Index = CurDAG->getRegister(0, MVT::i64);
2987 else {
2988 assert(Index.getValueType() == MVT::i32 &&
2989 "Expect to be extending 32-bit registers for use in LEA");
2990 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2991 MVT::i64), 0);
2992 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2993 Index);
2994 }
2995
2996 return true;
2997}
2998
2999/// Calls SelectAddr and determines if the maximal addressing
3000/// mode it matches can be cost effectively emitted as an LEA instruction.
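// A rough illustration of the complexity heuristic below: an address such as
// 8(%rdi,%rsi,4) scores base + index + scale + displacement = 4 and is
// emitted as an LEA, while a plain %rdi + %rsi scores 2 and is left to ADD.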
3001bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3002 SDValue &Base, SDValue &Scale,
3003 SDValue &Index, SDValue &Disp,
3004 SDValue &Segment) {
3005 X86ISelAddressMode AM;
3006
3007 // Save the DL and VT before calling matchAddress, it can invalidate N.
3008 SDLoc DL(N);
3009 MVT VT = N.getSimpleValueType();
3010
3011 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3012 // segments.
3013 SDValue Copy = AM.Segment;
3014 SDValue T = CurDAG->getRegister(0, MVT::i32);
3015 AM.Segment = T;
3016 if (matchAddress(N, AM))
3017 return false;
3018 assert (T == AM.Segment);
3019 AM.Segment = Copy;
3020
3021 unsigned Complexity = 0;
3022 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3023 Complexity = 1;
3024 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3025 Complexity = 4;
3026
3027 if (AM.IndexReg.getNode())
3028 Complexity++;
3029
3030 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3031 // a simple shift.
3032 if (AM.Scale > 1)
3033 Complexity++;
3034
3035 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3036 // to a LEA. This is determined with some experimentation but is by no means
3037 // optimal (especially for code size consideration). LEA is nice because of
3038 // its three-address nature. Tweak the cost function again when we can run
3039 // convertToThreeAddress() at register allocation time.
3040 if (AM.hasSymbolicDisplacement()) {
3041 // For X86-64, always use LEA to materialize RIP-relative addresses.
3042 if (Subtarget->is64Bit())
3043 Complexity = 4;
3044 else
3045 Complexity += 2;
3046 }
3047
3048 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3049 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3050 // duplicating flag-producing instructions later in the pipeline.
3051 if (N.getOpcode() == ISD::ADD) {
3052 auto isMathWithFlags = [](SDValue V) {
3053 switch (V.getOpcode()) {
3054 case X86ISD::ADD:
3055 case X86ISD::SUB:
3056 case X86ISD::ADC:
3057 case X86ISD::SBB:
3058 case X86ISD::SMUL:
3059 case X86ISD::UMUL:
3060 /* TODO: These opcodes can be added safely, but we may want to justify
3061 their inclusion for different reasons (better for reg-alloc).
3062 case X86ISD::OR:
3063 case X86ISD::XOR:
3064 case X86ISD::AND:
3065 */
3066 // Value 1 is the flag output of the node - verify it's not dead.
3067 return !SDValue(V.getNode(), 1).use_empty();
3068 default:
3069 return false;
3070 }
3071 };
3072 // TODO: We might want to factor in whether there's a load folding
3073 // opportunity for the math op that disappears with LEA.
3074 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3075 Complexity++;
3076 }
3077
3078 if (AM.Disp)
3079 Complexity++;
3080
3081 // If it isn't worth using an LEA, reject it.
3082 if (Complexity <= 2)
3083 return false;
3084
3085 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3086 return true;
3087}
3088
3089/// This is only run on TargetGlobalTLSAddress or TargetExternalSymbol nodes.
3090bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3091 SDValue &Scale, SDValue &Index,
3092 SDValue &Disp, SDValue &Segment) {
3093 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3094 N.getOpcode() == ISD::TargetExternalSymbol);
3095
3096 X86ISelAddressMode AM;
3097 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3098 AM.GV = GA->getGlobal();
3099 AM.Disp += GA->getOffset();
3100 AM.SymbolFlags = GA->getTargetFlags();
3101 } else {
3102 auto *SA = cast<ExternalSymbolSDNode>(N);
3103 AM.ES = SA->getSymbol();
3104 AM.SymbolFlags = SA->getTargetFlags();
3105 }
3106
3107 if (Subtarget->is32Bit()) {
3108 AM.Scale = 1;
3109 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3110 }
3111
3112 MVT VT = N.getSimpleValueType();
3113 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3114 return true;
3115}
3116
3117bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3118 // Keep track of the original value type and whether this value was
3119 // truncated. If we see a truncation from pointer type to VT that truncates
3120 // bits that are known to be zero, we can use a narrow reference.
3121 EVT VT = N.getValueType();
3122 bool WasTruncated = false;
3123 if (N.getOpcode() == ISD::TRUNCATE) {
3124 WasTruncated = true;
3125 N = N.getOperand(0);
3126 }
3127
3128 if (N.getOpcode() != X86ISD::Wrapper)
3129 return false;
3130
3131 // We can only use non-GlobalValues as immediates if they were not truncated,
3132 // as we do not have any range information. If we have a GlobalValue and the
3133 // address was not truncated, we can select it as an operand directly.
3134 unsigned Opc = N.getOperand(0)->getOpcode();
3135 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3136 Op = N.getOperand(0);
3137 // We can only select the operand directly if we didn't have to look past a
3138 // truncate.
3139 return !WasTruncated;
3140 }
3141
3142 // Check that the global's range fits into VT.
3143 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3144 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3145 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3146 return false;
3147
3148 // Okay, we can use a narrow reference.
3149 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3150 GA->getOffset(), GA->getTargetFlags());
3151 return true;
3152}
3153
3154bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3155 SDValue &Base, SDValue &Scale,
3156 SDValue &Index, SDValue &Disp,
3157 SDValue &Segment) {
3158 assert(Root && P && "Unknown root/parent nodes");
3159 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3160 !IsProfitableToFold(N, P, Root) ||
3161 !IsLegalToFold(N, P, Root, OptLevel))
3162 return false;
3163
3164 return selectAddr(N.getNode(),
3165 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3166}
3167
3168bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3169 SDValue &Base, SDValue &Scale,
3170 SDValue &Index, SDValue &Disp,
3171 SDValue &Segment) {
3172 assert(Root && P && "Unknown root/parent nodes");
3173 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3174 !IsProfitableToFold(N, P, Root) ||
3175 !IsLegalToFold(N, P, Root, OptLevel))
3176 return false;
3177
3178 return selectAddr(N.getNode(),
3179 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3180}
3181
3182/// Return an SDNode that returns the value of the global base register.
3183/// Output instructions required to initialize the global base register,
3184/// if necessary.
3185SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3186 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3187 auto &DL = MF->getDataLayout();
3188 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3189}
3190
3191bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3192 if (N->getOpcode() == ISD::TRUNCATE)
3193 N = N->getOperand(0).getNode();
3194 if (N->getOpcode() != X86ISD::Wrapper)
3195 return false;
3196
3197 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3198 if (!GA)
3199 return false;
3200
3201 auto *GV = GA->getGlobal();
3202 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3203 if (CR)
3204 return CR->getSignedMin().sge(-1ull << Width) &&
3205 CR->getSignedMax().slt(1ull << Width);
3206 // In the kernel code model, globals are in the negative 2GB of the address
3207 // space, so globals can be a sign extended 32-bit immediate.
3208 // In other code models, small globals are in the low 2GB of the address
3209 // space, so sign extending them is equivalent to zero extending them.
3210 return Width == 32 && !TM.isLargeGlobalValue(GV);
3211}
3212
3213X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3214 assert(N->isMachineOpcode() && "Unexpected node");
3215 unsigned Opc = N->getMachineOpcode();
3216 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3217 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3218 if (CondNo < 0)
3219 return X86::COND_INVALID;
3220
3221 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3222}
3223
3224/// Test whether the given X86ISD::CMP node has any users that use a flag
3225/// other than ZF.
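/// For example, a compare whose EFLAGS result is consumed only by SETE/SETNE
/// or JE/JNE users (condition codes COND_E / COND_NE) passes this test.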
3226bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3227 // Examine each user of the node.
3228 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3229 UI != UE; ++UI) {
3230 // Only check things that use the flags.
3231 if (UI.getUse().getResNo() != Flags.getResNo())
3232 continue;
3233 // Only examine CopyToReg uses that copy to EFLAGS.
3234 if (UI->getOpcode() != ISD::CopyToReg ||
3235 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3236 return false;
3237 // Examine each user of the CopyToReg use.
3238 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3239 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3240 // Only examine the Flag result.
3241 if (FlagUI.getUse().getResNo() != 1) continue;
3242 // Anything unusual: assume conservatively.
3243 if (!FlagUI->isMachineOpcode()) return false;
3244 // Examine the condition code of the user.
3245 X86::CondCode CC = getCondFromNode(*FlagUI);
3246
3247 switch (CC) {
3248 // Comparisons which only use the zero flag.
3249 case X86::COND_E: case X86::COND_NE:
3250 continue;
3251 // Anything else: assume conservatively.
3252 default:
3253 return false;
3254 }
3255 }
3256 }
3257 return true;
3258}
3259
3260/// Test whether the given X86ISD::CMP node has any uses which require the SF
3261/// flag to be accurate.
3262bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3263 // Examine each user of the node.
3264 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3265 UI != UE; ++UI) {
3266 // Only check things that use the flags.
3267 if (UI.getUse().getResNo() != Flags.getResNo())
3268 continue;
3269 // Only examine CopyToReg uses that copy to EFLAGS.
3270 if (UI->getOpcode() != ISD::CopyToReg ||
3271 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3272 return false;
3273 // Examine each user of the CopyToReg use.
3274 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3275 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3276 // Only examine the Flag result.
3277 if (FlagUI.getUse().getResNo() != 1) continue;
3278 // Anything unusual: assume conservatively.
3279 if (!FlagUI->isMachineOpcode()) return false;
3280 // Examine the condition code of the user.
3281 X86::CondCode CC = getCondFromNode(*FlagUI);
3282
3283 switch (CC) {
3284 // Comparisons which don't examine the SF flag.
3285 case X86::COND_A: case X86::COND_AE:
3286 case X86::COND_B: case X86::COND_BE:
3287 case X86::COND_E: case X86::COND_NE:
3288 case X86::COND_O: case X86::COND_NO:
3289 case X86::COND_P: case X86::COND_NP:
3290 continue;
3291 // Anything else: assume conservatively.
3292 default:
3293 return false;
3294 }
3295 }
3296 }
3297 return true;
3298}
3299
3300 static bool mayUseCarryFlag(X86::CondCode CC) {
3301 switch (CC) {
3302 // Comparisons which don't examine the CF flag.
3303 case X86::COND_O: case X86::COND_NO:
3304 case X86::COND_E: case X86::COND_NE:
3305 case X86::COND_S: case X86::COND_NS:
3306 case X86::COND_P: case X86::COND_NP:
3307 case X86::COND_L: case X86::COND_GE:
3308 case X86::COND_G: case X86::COND_LE:
3309 return false;
3310 // Anything else: assume conservatively.
3311 default:
3312 return true;
3313 }
3314}
3315
3316/// Test whether the given node which sets flags has any uses which require the
3317/// CF flag to be accurate.
3318 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3319 // Examine each user of the node.
3320 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3321 UI != UE; ++UI) {
3322 // Only check things that use the flags.
3323 if (UI.getUse().getResNo() != Flags.getResNo())
3324 continue;
3325
3326 unsigned UIOpc = UI->getOpcode();
3327
3328 if (UIOpc == ISD::CopyToReg) {
3329 // Only examine CopyToReg uses that copy to EFLAGS.
3330 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3331 return false;
3332 // Examine each user of the CopyToReg use.
3333 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
3334 FlagUI != FlagUE; ++FlagUI) {
3335 // Only examine the Flag result.
3336 if (FlagUI.getUse().getResNo() != 1)
3337 continue;
3338 // Anything unusual: assume conservatively.
3339 if (!FlagUI->isMachineOpcode())
3340 return false;
3341 // Examine the condition code of the user.
3342 X86::CondCode CC = getCondFromNode(*FlagUI);
3343
3344 if (mayUseCarryFlag(CC))
3345 return false;
3346 }
3347
3348 // This CopyToReg is ok. Move on to the next user.
3349 continue;
3350 }
3351
3352 // This might be an unselected node. So look for the pre-isel opcodes that
3353 // use flags.
3354 unsigned CCOpNo;
3355 switch (UIOpc) {
3356 default:
3357 // Something unusual. Be conservative.
3358 return false;
3359 case X86ISD::SETCC: CCOpNo = 0; break;
3360 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3361 case X86ISD::CMOV: CCOpNo = 2; break;
3362 case X86ISD::BRCOND: CCOpNo = 2; break;
3363 }
3364
3365 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
3366 if (mayUseCarryFlag(CC))
3367 return false;
3368 }
3369 return true;
3370}
3371
3372/// Check whether or not the chain ending in StoreNode is suitable for doing
3373/// the {load; op; store} to modify transformation.
3374 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3375 SDValue StoredVal, SelectionDAG *CurDAG,
3376 unsigned LoadOpNo,
3377 LoadSDNode *&LoadNode,
3378 SDValue &InputChain) {
3379 // Is the stored value result 0 of the operation?
3380 if (StoredVal.getResNo() != 0) return false;
3381
3382 // Are there other uses of the operation other than the store?
3383 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3384
3385 // Is the store non-extending and non-indexed?
3386 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3387 return false;
3388
3389 SDValue Load = StoredVal->getOperand(LoadOpNo);
3390 // Is the stored value a non-extending and non-indexed load?
3391 if (!ISD::isNormalLoad(Load.getNode())) return false;
3392
3393 // Return LoadNode by reference.
3394 LoadNode = cast<LoadSDNode>(Load);
3395
3396 // Is the store the only read of the loaded value?
3397 if (!Load.hasOneUse())
3398 return false;
3399
3400 // Is the address of the store the same as the load?
3401 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3402 LoadNode->getOffset() != StoreNode->getOffset())
3403 return false;
3404
3405 bool FoundLoad = false;
3406 SmallVector<SDValue, 4> ChainOps;
3407 SmallVector<const SDNode *, 4> LoopWorklist;
3408 SmallPtrSet<const SDNode *, 32> Visited;
3409 const unsigned int Max = 1024;
3410
3411 // Visualization of Load-Op-Store fusion:
3412 // -------------------------
3413 // Legend:
3414 // *-lines = Chain operand dependencies.
3415 // |-lines = Normal operand dependencies.
3416 // Dependencies flow down and right. n-suffix references multiple nodes.
3417 //
3418 // C Xn C
3419 // * * *
3420 // * * *
3421 // Xn A-LD Yn TF Yn
3422 // * * \ | * |
3423 // * * \ | * |
3424 // * * \ | => A--LD_OP_ST
3425 // * * \| \
3426 // TF OP \
3427 // * | \ Zn
3428 // * | \
3429 // A-ST Zn
3430 //
3431
3432 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3433 // #2: Yn -> LD
3434 // #3: ST -> Zn
3435
3436 // Ensure the transform is safe by checking for the dual
3437 // dependencies to make sure we do not induce a loop.
3438
3439 // As LD is a predecessor to both OP and ST we can do this by checking:
3440 // a). if LD is a predecessor to a member of Xn or Yn.
3441 // b). if a Zn is a predecessor to ST.
3442
3443 // However, (b) can only occur through being a chain predecessor to
3444 // ST, which is the same as Zn being a member or predecessor of Xn,
3445 // which is a subset of LD being a predecessor of Xn. So it's
3446 // subsumed by check (a).
3447
3448 SDValue Chain = StoreNode->getChain();
3449
3450 // Gather X elements in ChainOps.
3451 if (Chain == Load.getValue(1)) {
3452 FoundLoad = true;
3453 ChainOps.push_back(Load.getOperand(0));
3454 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3455 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3456 SDValue Op = Chain.getOperand(i);
3457 if (Op == Load.getValue(1)) {
3458 FoundLoad = true;
3459 // Drop Load, but keep its chain. No cycle check necessary.
3460 ChainOps.push_back(Load.getOperand(0));
3461 continue;
3462 }
3463 LoopWorklist.push_back(Op.getNode());
3464 ChainOps.push_back(Op);
3465 }
3466 }
3467
3468 if (!FoundLoad)
3469 return false;
3470
3471 // Worklist is currently Xn. Add Yn to worklist.
3472 for (SDValue Op : StoredVal->ops())
3473 if (Op.getNode() != LoadNode)
3474 LoopWorklist.push_back(Op.getNode());
3475
3476 // Check (a) if Load is a predecessor to Xn + Yn
3477 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3478 true))
3479 return false;
3480
3481 InputChain =
3482 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3483 return true;
3484}
3485
3486// Change a chain of {load; op; store} of the same value into a simple op
3487// through memory of that value, if the uses of the modified value and its
3488// address are suitable.
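// For example, (store (X86ISD::ADD (load addr), 1), addr) can be selected as a
// single INC32m when the ADD's carry flag is unused and INC is not slow on the
// target.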
3489//
3490 // The tablegen memory-operand pattern is currently not able to match the
3491 // case where the EFLAGS produced by the original operation are used.
3492//
3493// To move this to tablegen, we'll need to improve tablegen to allow flags to
3494// be transferred from a node in the pattern to the result node, probably with
3495// a new keyword. For example, we have this
3496// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3497// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3498// (implicit EFLAGS)]>;
3499// but maybe need something like this
3500// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3501// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3502// (transferrable EFLAGS)]>;
3503//
3504// Until then, we manually fold these and instruction select the operation
3505// here.
3506bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3507 auto *StoreNode = cast<StoreSDNode>(Node);
3508 SDValue StoredVal = StoreNode->getOperand(1);
3509 unsigned Opc = StoredVal->getOpcode();
3510
3511 // Before we try to select anything, make sure this is a memory operand size
3512 // and opcode we can handle. Note that this must match the code below that
3513 // actually lowers the opcodes.
3514 EVT MemVT = StoreNode->getMemoryVT();
3515 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3516 MemVT != MVT::i8)
3517 return false;
3518
3519 bool IsCommutable = false;
3520 bool IsNegate = false;
3521 switch (Opc) {
3522 default:
3523 return false;
3524 case X86ISD::SUB:
3525 IsNegate = isNullConstant(StoredVal.getOperand(0));
3526 break;
3527 case X86ISD::SBB:
3528 break;
3529 case X86ISD::ADD:
3530 case X86ISD::ADC:
3531 case X86ISD::AND:
3532 case X86ISD::OR:
3533 case X86ISD::XOR:
3534 IsCommutable = true;
3535 break;
3536 }
3537
3538 unsigned LoadOpNo = IsNegate ? 1 : 0;
3539 LoadSDNode *LoadNode = nullptr;
3540 SDValue InputChain;
3541 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3542 LoadNode, InputChain)) {
3543 if (!IsCommutable)
3544 return false;
3545
3546 // This operation is commutable, try the other operand.
3547 LoadOpNo = 1;
3548 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3549 LoadNode, InputChain))
3550 return false;
3551 }
3552
3553 SDValue Base, Scale, Index, Disp, Segment;
3554 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3555 Segment))
3556 return false;
3557
3558 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3559 unsigned Opc8) {
3560 switch (MemVT.getSimpleVT().SimpleTy) {
3561 case MVT::i64:
3562 return Opc64;
3563 case MVT::i32:
3564 return Opc32;
3565 case MVT::i16:
3566 return Opc16;
3567 case MVT::i8:
3568 return Opc8;
3569 default:
3570 llvm_unreachable("Invalid size!");
3571 }
3572 };
3573
3574 MachineSDNode *Result;
3575 switch (Opc) {
3576 case X86ISD::SUB:
3577 // Handle negate.
3578 if (IsNegate) {
3579 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3580 X86::NEG8m);
3581 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3582 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3583 MVT::Other, Ops);
3584 break;
3585 }
3586 [[fallthrough]];
3587 case X86ISD::ADD:
3588 // Try to match inc/dec.
3589 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3590 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3591 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3592 // ADD/SUB with 1/-1 can use INC/DEC when the carry flag isn't used.
3593 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3594 unsigned NewOpc =
3595 ((Opc == X86ISD::ADD) == IsOne)
3596 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3597 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3598 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3599 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3600 MVT::Other, Ops);
3601 break;
3602 }
3603 }
3604 [[fallthrough]];
3605 case X86ISD::ADC:
3606 case X86ISD::SBB:
3607 case X86ISD::AND:
3608 case X86ISD::OR:
3609 case X86ISD::XOR: {
3610 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3611 switch (Opc) {
3612 case X86ISD::ADD:
3613 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3614 X86::ADD8mr);
3615 case X86ISD::ADC:
3616 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3617 X86::ADC8mr);
3618 case X86ISD::SUB:
3619 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3620 X86::SUB8mr);
3621 case X86ISD::SBB:
3622 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3623 X86::SBB8mr);
3624 case X86ISD::AND:
3625 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3626 X86::AND8mr);
3627 case X86ISD::OR:
3628 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3629 case X86ISD::XOR:
3630 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3631 X86::XOR8mr);
3632 default:
3633 llvm_unreachable("Invalid opcode!");
3634 }
3635 };
3636 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3637 switch (Opc) {
3638 case X86ISD::ADD:
3639 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3640 X86::ADD8mi);
3641 case X86ISD::ADC:
3642 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3643 X86::ADC8mi);
3644 case X86ISD::SUB:
3645 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3646 X86::SUB8mi);
3647 case X86ISD::SBB:
3648 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3649 X86::SBB8mi);
3650 case X86ISD::AND:
3651 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3652 X86::AND8mi);
3653 case X86ISD::OR:
3654 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3655 X86::OR8mi);
3656 case X86ISD::XOR:
3657 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3658 X86::XOR8mi);
3659 default:
3660 llvm_unreachable("Invalid opcode!");
3661 }
3662 };
3663
3664 unsigned NewOpc = SelectRegOpcode(Opc);
3665 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3666
3667 // See if the operand is a constant that we can fold into an immediate
3668 // operand.
3669 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3670 int64_t OperandV = OperandC->getSExtValue();
3671
3672 // Check if we can shrink the operand enough to fit in an immediate (or
3673 // fit into a smaller immediate) by negating it and switching the
3674 // operation.
3675 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3676 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3677 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3678 isInt<32>(-OperandV))) &&
3679 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3680 OperandV = -OperandV;
3681 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3682 }
3683
3684 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3685 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3686 NewOpc = SelectImmOpcode(Opc);
3687 }
3688 }
3689
3690 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3691 SDValue CopyTo =
3692 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3693 StoredVal.getOperand(2), SDValue());
3694
3695 const SDValue Ops[] = {Base, Scale, Index, Disp,
3696 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3697 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3698 Ops);
3699 } else {
3700 const SDValue Ops[] = {Base, Scale, Index, Disp,
3701 Segment, Operand, InputChain};
3702 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3703 Ops);
3704 }
3705 break;
3706 }
3707 default:
3708 llvm_unreachable("Invalid opcode!");
3709 }
3710
3711 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3712 LoadNode->getMemOperand()};
3713 CurDAG->setNodeMemRefs(Result, MemOps);
3714
3715 // Update Load Chain uses as well.
3716 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3717 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3718 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3719 CurDAG->RemoveDeadNode(Node);
3720 return true;
3721}
3722
3723// See if this is an X & Mask that we can match to BEXTR/BZHI.
3724// Where Mask is one of the following patterns:
3725// a) x & (1 << nbits) - 1
3726// b) x & ~(-1 << nbits)
3727// c) x & (-1 >> (32 - y))
3728// d) x << (32 - y) >> (32 - y)
3729// e) (1 << nbits) - 1
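// For example, with BMI2 pattern a) is selected as BZHI x, nbits; with only
// BMI1 it becomes BEXTR with a control of (nbits << 8), optionally folding a
// logical right shift of x into the low byte of the control.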
3730bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3731 assert(
3732 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3733 Node->getOpcode() == ISD::SRL) &&
3734 "Should be either an and-mask, or right-shift after clearing high bits.");
3735
3736 // BEXTR is a BMI instruction and BZHI is a BMI2 instruction. We need at least one.
3737 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3738 return false;
3739
3740 MVT NVT = Node->getSimpleValueType(0);
3741
3742 // Only supported for 32 and 64 bits.
3743 if (NVT != MVT::i32 && NVT != MVT::i64)
3744 return false;
3745
3746 SDValue NBits;
3747 bool NegateNBits;
3748
3749 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3750 // Else, if we only have BMI1's BEXTR, we require one-use.
3751 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3752 auto checkUses = [AllowExtraUsesByDefault](
3753 SDValue Op, unsigned NUses,
3754 std::optional<bool> AllowExtraUses) {
3755 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3756 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3757 };
3758 auto checkOneUse = [checkUses](SDValue Op,
3759 std::optional<bool> AllowExtraUses =
3760 std::nullopt) {
3761 return checkUses(Op, 1, AllowExtraUses);
3762 };
3763 auto checkTwoUse = [checkUses](SDValue Op,
3764 std::optional<bool> AllowExtraUses =
3765 std::nullopt) {
3766 return checkUses(Op, 2, AllowExtraUses);
3767 };
3768
3769 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3770 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3771 assert(V.getSimpleValueType() == MVT::i32 &&
3772 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3773 "Expected i64 -> i32 truncation");
3774 V = V.getOperand(0);
3775 }
3776 return V;
3777 };
3778
3779 // a) x & ((1 << nbits) + (-1))
3780 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3781 &NegateNBits](SDValue Mask) -> bool {
3782 // Match `add`. Must only have one use!
3783 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3784 return false;
3785 // We should be adding an all-ones constant (i.e. subtracting one).
3786 if (!isAllOnesConstant(Mask->getOperand(1)))
3787 return false;
3788 // Match `1 << nbits`. Might be truncated. Must only have one use!
3789 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3790 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3791 return false;
3792 if (!isOneConstant(M0->getOperand(0)))
3793 return false;
3794 NBits = M0->getOperand(1);
3795 NegateNBits = false;
3796 return true;
3797 };
3798
3799 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3800 V = peekThroughOneUseTruncation(V);
3801 return CurDAG->MaskedValueIsAllOnes(
3802 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3803 NVT.getSizeInBits()));
3804 };
3805
3806 // b) x & ~(-1 << nbits)
3807 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3808 &NBits, &NegateNBits](SDValue Mask) -> bool {
3809 // Match `~()`. Must only have one use!
3810 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3811 return false;
3812 // The -1 only has to be all-ones for the final Node's NVT.
3813 if (!isAllOnes(Mask->getOperand(1)))
3814 return false;
3815 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3816 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3817 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3818 return false;
3819 // The -1 only has to be all-ones for the final Node's NVT.
3820 if (!isAllOnes(M0->getOperand(0)))
3821 return false;
3822 NBits = M0->getOperand(1);
3823 NegateNBits = false;
3824 return true;
3825 };
3826
3827 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3828 // or leave the shift amount as-is, but then we'll have to negate it.
3829 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3830 unsigned Bitwidth) {
3831 NBits = ShiftAmt;
3832 NegateNBits = true;
3833 // Skip over a truncate of the shift amount, if any.
3834 if (NBits.getOpcode() == ISD::TRUNCATE)
3835 NBits = NBits.getOperand(0);
3836 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3837 // If it doesn't match, that's fine, we'll just negate it ourselves.
3838 if (NBits.getOpcode() != ISD::SUB)
3839 return;
3840 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3841 if (!V0 || V0->getZExtValue() != Bitwidth)
3842 return;
3843 NBits = NBits.getOperand(1);
3844 NegateNBits = false;
3845 };
3846
3847 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3848 // or
3849 // c) x & (-1 >> (32 - y))
3850 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3851 canonicalizeShiftAmt](SDValue Mask) -> bool {
3852 // The mask itself may be truncated.
3853 Mask = peekThroughOneUseTruncation(Mask);
3854 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3855 // Match `l>>`. Must only have one use!
3856 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3857 return false;
3858 // We should be shifting truly all-ones constant.
3859 if (!isAllOnesConstant(Mask.getOperand(0)))
3860 return false;
3861 SDValue M1 = Mask.getOperand(1);
3862 // The shift amount should not be used externally.
3863 if (!checkOneUse(M1))
3864 return false;
3865 canonicalizeShiftAmt(M1, Bitwidth);
3866 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3867 // is no extra use of the mask. Clearly, there was one since we are here.
3868 // But at the same time, if we need to negate the shift amount,
3869 // then we don't want the mask to stick around, else it's unprofitable.
3870 return !NegateNBits;
3871 };
3872
3873 SDValue X;
3874
3875 // d) x << z >> z but then we'll have to subtract z from bitwidth
3876 // or
3877 // d) x << (32 - y) >> (32 - y)
3878 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3879 AllowExtraUsesByDefault, &NegateNBits,
3880 &X](SDNode *Node) -> bool {
3881 if (Node->getOpcode() != ISD::SRL)
3882 return false;
3883 SDValue N0 = Node->getOperand(0);
3884 if (N0->getOpcode() != ISD::SHL)
3885 return false;
3886 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3887 SDValue N1 = Node->getOperand(1);
3888 SDValue N01 = N0->getOperand(1);
3889 // Both of the shifts must be by the exact same value.
3890 if (N1 != N01)
3891 return false;
3892 canonicalizeShiftAmt(N1, Bitwidth);
3893 // There should not be any external uses of the inner shift / shift amount.
3894 // Note that while we are generally okay with external uses given BMI2,
3895 // iff we need to negate the shift amount, we are not okay with extra uses.
3896 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3897 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3898 return false;
3899 X = N0->getOperand(0);
3900 return true;
3901 };
3902
3903 auto matchLowBitMask = [matchPatternA, matchPatternB,
3904 matchPatternC](SDValue Mask) -> bool {
3905 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3906 };
3907
3908 if (Node->getOpcode() == ISD::AND) {
3909 X = Node->getOperand(0);
3910 SDValue Mask = Node->getOperand(1);
3911
3912 if (matchLowBitMask(Mask)) {
3913 // Great.
3914 } else {
3915 std::swap(X, Mask);
3916 if (!matchLowBitMask(Mask))
3917 return false;
3918 }
3919 } else if (matchLowBitMask(SDValue(Node, 0))) {
3920 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
3921 } else if (!matchPatternD(Node))
3922 return false;
3923
3924 // If we need to negate the shift amount, require BMI2 BZHI support.
3925 // It's just too unprofitable for BMI1 BEXTR.
3926 if (NegateNBits && !Subtarget->hasBMI2())
3927 return false;
3928
3929 SDLoc DL(Node);
3930
3931 // Truncate the shift amount.
3932 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3933 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3934
3935 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3936 // All the other bits are undefined, we do not care about them.
3937 SDValue ImplDef = SDValue(
3938 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3939 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3940
3941 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3942 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3943 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
3944 MVT::i32, ImplDef, NBits, SRIdxVal),
3945 0);
3946 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3947
3948 // We might have matched the amount of high bits to be cleared,
3949 // but we want the amount of low bits to be kept, so negate it then.
3950 if (NegateNBits) {
3951 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
3952 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
3953
3954 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
3955 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3956 }
3957
3958 if (Subtarget->hasBMI2()) {
3959 // Great, just emit the BZHI.
3960 if (NVT != MVT::i32) {
3961 // But have to place the bit count into the wide-enough register first.
3962 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3963 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3964 }
3965
3966 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3967 ReplaceNode(Node, Extract.getNode());
3968 SelectCode(Extract.getNode());
3969 return true;
3970 }
3971
3972 // Else, if we do *NOT* have BMI2, check whether 'X' is *logically* shifted
3973 // (potentially with a one-use truncate in between), whether that truncation
3974 // was the only use of the shift, and if so, look past the one-use
3975 // truncation.
3976 {
3977 SDValue RealX = peekThroughOneUseTruncation(X);
3978 // FIXME: only if the shift is one-use?
3979 if (RealX != X && RealX.getOpcode() == ISD::SRL)
3980 X = RealX;
3981 }
3982
3983 MVT XVT = X.getSimpleValueType();
3984
3985 // Else, emitting BEXTR requires one more step.
3986 // The 'control' of BEXTR has the pattern of:
3987 // [15...8 bit][ 7...0 bit] location
3988 // [ bit count][ shift] name
3989 // I.e. 0b00000011'00000001 means (x >> 0b1) & 0b11
3990
3991 // Shift NBits left by 8 bits, thus producing 'control'.
3992 // This makes the low 8 bits zero.
3993 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3994 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
3995 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3996 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3997
3998 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3999 // FIXME: only if the shift is one-use?
4000 if (X.getOpcode() == ISD::SRL) {
4001 SDValue ShiftAmt = X.getOperand(1);
4002 X = X.getOperand(0);
4003
4004 assert(ShiftAmt.getValueType() == MVT::i8 &&
4005 "Expected shift amount to be i8");
4006
4007 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4008 // We could zext to i16 in some form, but we intentionally don't do that.
4009 SDValue OrigShiftAmt = ShiftAmt;
4010 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4011 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4012
4013 // And now 'or' these low 8 bits of shift amount into the 'control'.
4014 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4015 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4016 }
4017
4018 // But have to place the 'control' into the wide-enough register first.
4019 if (XVT != MVT::i32) {
4020 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4021 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4022 }
4023
4024 // And finally, form the BEXTR itself.
4025 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4026
4027 // The 'X' was originally truncated. Do that now.
4028 if (XVT != NVT) {
4029 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4030 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4031 }
4032
4033 ReplaceNode(Node, Extract.getNode());
4034 SelectCode(Extract.getNode());
4035
4036 return true;
4037}
4038
4039// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4040MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4041 MVT NVT = Node->getSimpleValueType(0);
4042 SDLoc dl(Node);
4043
4044 SDValue N0 = Node->getOperand(0);
4045 SDValue N1 = Node->getOperand(1);
4046
4047 // If we have TBM we can use an immediate for the control. If we have BMI
4048 // we should only do this if the BEXTR instruction is implemented well.
4049 // Otherwise moving the control into a register makes this more costly.
4050 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4051 // hoisting the move immediate would make it worthwhile with a less optimal
4052 // BEXTR?
4053 bool PreferBEXTR =
4054 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4055 if (!PreferBEXTR && !Subtarget->hasBMI2())
4056 return nullptr;
4057
4058 // Must have a shift right.
4059 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4060 return nullptr;
4061
4062 // Shift can't have additional users.
4063 if (!N0->hasOneUse())
4064 return nullptr;
4065
4066 // Only supported for 32 and 64 bits.
4067 if (NVT != MVT::i32 && NVT != MVT::i64)
4068 return nullptr;
4069
4070 // Shift amount and RHS of and must be constant.
4071 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4072 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4073 if (!MaskCst || !ShiftCst)
4074 return nullptr;
4075
4076 // And RHS must be a mask.
4077 uint64_t Mask = MaskCst->getZExtValue();
4078 if (!isMask_64(Mask))
4079 return nullptr;
4080
4081 uint64_t Shift = ShiftCst->getZExtValue();
4082 uint64_t MaskSize = llvm::popcount(Mask);
4083
4084 // Don't interfere with something that can be handled by extracting AH.
4085 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4086 if (Shift == 8 && MaskSize == 8)
4087 return nullptr;
4088
4089 // Make sure we are only using bits that were in the original value, not
4090 // shifted in.
4091 if (Shift + MaskSize > NVT.getSizeInBits())
4092 return nullptr;
4093
4094 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4095 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4096 // does not fit into 32 bits. Load folding is not a sufficient reason.
4097 if (!PreferBEXTR && MaskSize <= 32)
4098 return nullptr;
4099
4100 SDValue Control;
4101 unsigned ROpc, MOpc;
4102
4103#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4104 if (!PreferBEXTR) {
4105 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4106 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4107 // Let's perform the mask first, and apply shift later. Note that we need to
4108 // widen the mask to account for the fact that we'll apply shift afterwards!
4109 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4110 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4111 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4112 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4113 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4114 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4115 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4116 } else {
4117 // The 'control' of BEXTR has the pattern of:
4118 // [15...8 bit][ 7...0 bit] location
4119 // [ bit count][ shift] name
4120 // I.e. 0b00000011'00000001 means (x >> 0b1) & 0b11
4121 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4122 if (Subtarget->hasTBM()) {
4123 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4124 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4125 } else {
4126 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4127 // BMI requires the immediate to be placed in a register.
4128 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4129 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4130 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4131 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4132 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4133 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4134 }
4135 }
4136
4137 MachineSDNode *NewNode;
4138 SDValue Input = N0->getOperand(0);
4139 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4140 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4141 SDValue Ops[] = {
4142 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4143 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4144 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4145 // Update the chain.
4146 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4147 // Record the mem-refs
4148 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4149 } else {
4150 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4151 }
4152
4153 if (!PreferBEXTR) {
4154 // We still need to apply the shift.
4155 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4156 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4157 : GET_ND_IF_ENABLED(X86::SHR32ri);
4158 NewNode =
4159 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4160 }
4161
4162 return NewNode;
4163}
4164
4165// Emit a PCMISTR(I/M) instruction.
4166MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4167 bool MayFoldLoad, const SDLoc &dl,
4168 MVT VT, SDNode *Node) {
4169 SDValue N0 = Node->getOperand(0);
4170 SDValue N1 = Node->getOperand(1);
4171 SDValue Imm = Node->getOperand(2);
4172 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4173 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4174
4175 // Try to fold a load. No need to check alignment.
4176 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4177 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4178 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4179 N1.getOperand(0) };
4180 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4181 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4182 // Update the chain.
4183 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4184 // Record the mem-refs
4185 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4186 return CNode;
4187 }
4188
4189 SDValue Ops[] = { N0, N1, Imm };
4190 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4191 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4192 return CNode;
4193}
4194
4195// Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
4196// to emit a second instruction after this one. This is needed since we have two
4197// copyToReg nodes glued before this and we need to continue that glue through.
4198MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4199 bool MayFoldLoad, const SDLoc &dl,
4200 MVT VT, SDNode *Node,
4201 SDValue &InGlue) {
4202 SDValue N0 = Node->getOperand(0);
4203 SDValue N2 = Node->getOperand(2);
4204 SDValue Imm = Node->getOperand(4);
4205 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4206 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4207
4208 // Try to fold a load. No need to check alignment.
4209 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4210 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4211 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4212 N2.getOperand(0), InGlue };
4213 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4214 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4215 InGlue = SDValue(CNode, 3);
4216 // Update the chain.
4217 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4218 // Record the mem-refs
4219 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4220 return CNode;
4221 }
4222
4223 SDValue Ops[] = { N0, N2, Imm, InGlue };
4224 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4225 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4226 InGlue = SDValue(CNode, 2);
4227 return CNode;
4228}
4229
4230bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4231 EVT VT = N->getValueType(0);
4232
4233 // Only handle scalar shifts.
4234 if (VT.isVector())
4235 return false;
4236
4237 // Narrower shifts only mask to 5 bits in hardware.
4238 unsigned Size = VT == MVT::i64 ? 64 : 32;
4239
4240 SDValue OrigShiftAmt = N->getOperand(1);
4241 SDValue ShiftAmt = OrigShiftAmt;
4242 SDLoc DL(N);
4243
4244 // Skip over a truncate of the shift amount.
4245 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4246 ShiftAmt = ShiftAmt->getOperand(0);
4247
4248 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4249 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4250
4251 SDValue NewShiftAmt;
4252 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4253 ShiftAmt->getOpcode() == ISD::XOR) {
4254 SDValue Add0 = ShiftAmt->getOperand(0);
4255 SDValue Add1 = ShiftAmt->getOperand(1);
4256 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4257 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4258 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4259 // to avoid the ADD/SUB/XOR.
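    // For example, a 64-bit shift by (x + 64) can simply shift by x, because
    // the hardware masks the shift amount modulo 64.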
4260 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4261 NewShiftAmt = Add0;
4262
4263 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4264 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4265 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4266 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4267 // we can replace it with a NOT. In the XOR case it may save some code
4268 // size, in the SUB case it also may save a move.
4269 assert(Add0C == nullptr || Add1C == nullptr);
4270
4271 // We can only do N-X, not X-N
4272 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4273 return false;
4274
4275 EVT OpVT = ShiftAmt.getValueType();
4276
4277 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4278 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4279 Add0C == nullptr ? Add0 : Add1, AllOnes);
4280 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4281 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4282 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4283 // -X to generate a NEG instead of a SUB of a constant.
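    // For example, a 32-bit shift by (32 - x) can shift by -x instead, since
    // -x == 32 - x (mod 32).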
4284 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4285 Add0C->getZExtValue() != 0) {
4286 EVT SubVT = ShiftAmt.getValueType();
4287 SDValue X;
4288 if (Add0C->getZExtValue() % Size == 0)
4289 X = Add1;
4290 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4291 Add0C->getZExtValue() % 32 == 0) {
4292 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4293 // This is mainly beneficial if we already compute (x+n*32).
4294 if (Add1.getOpcode() == ISD::TRUNCATE) {
4295 Add1 = Add1.getOperand(0);
4296 SubVT = Add1.getValueType();
4297 }
4298 if (Add0.getValueType() != SubVT) {
4299 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4300 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4301 }
4302
4303 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4304 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4305 } else
4306 return false;
4307 // Insert a negate op.
4308 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4309 // that uses it that's not a shift.
4310 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4311 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4312 NewShiftAmt = Neg;
4313
4314 // Insert these operands into a valid topological order so they can
4315 // get selected independently.
4316 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4317 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4318 } else
4319 return false;
4320 } else
4321 return false;
4322
4323 if (NewShiftAmt.getValueType() != MVT::i8) {
4324 // Need to truncate the shift amount.
4325 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4326 // Add to a correct topological ordering.
4327 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4328 }
4329
4330 // Insert a new mask to keep the shift amount legal. This should be removed
4331 // by isel patterns.
4332 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4333 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4334 // Place in a correct topological ordering.
4335 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4336
4337 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4338 NewShiftAmt);
4339 if (UpdatedNode != N) {
4340 // If we found an existing node, we should replace ourselves with that node
4341 // and wait for it to be selected after its other users.
4342 ReplaceNode(N, UpdatedNode);
4343 return true;
4344 }
4345
4346 // If the original shift amount is now dead, delete it so that we don't run
4347 // it through isel.
4348 if (OrigShiftAmt.getNode()->use_empty())
4349 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4350
4351 // Now that we've optimized the shift amount, defer to normal isel to get
4352 // load folding and legacy vs BMI2 selection without repeating it here.
4353 SelectCode(N);
4354 return true;
4355}
4356
4357bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4358 MVT NVT = N->getSimpleValueType(0);
4359 unsigned Opcode = N->getOpcode();
4360 SDLoc dl(N);
4361
4362 // For operations of the form (x << C1) op C2, check if we can use a smaller
4363 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
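  // For example, (or (shl X, 8), 0x7F00) can be rewritten as
  // (shl (or X, 0x7F), 8), replacing a 4-byte immediate with a 1-byte one.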
4364 SDValue Shift = N->getOperand(0);
4365 SDValue N1 = N->getOperand(1);
4366
4367 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4368 if (!Cst)
4369 return false;
4370
4371 int64_t Val = Cst->getSExtValue();
4372
4373 // If we have an any_extend feeding the AND, look through it to see if there
4374 // is a shift behind it. But only if the AND doesn't use the extended bits.
4375 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4376 bool FoundAnyExtend = false;
4377 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4378 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4379 isUInt<32>(Val)) {
4380 FoundAnyExtend = true;
4381 Shift = Shift.getOperand(0);
4382 }
4383
4384 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4385 return false;
4386
4387 // i8 is unshrinkable, i16 should be promoted to i32.
4388 if (NVT != MVT::i32 && NVT != MVT::i64)
4389 return false;
4390
4391 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4392 if (!ShlCst)
4393 return false;
4394
4395 uint64_t ShAmt = ShlCst->getZExtValue();
4396
4397 // Make sure that we don't change the operation by removing bits.
4398 // This only matters for OR and XOR, AND is unaffected.
4399 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4400 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4401 return false;
4402
4403 // Check the minimum bitwidth for the new constant.
4404 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4405 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4406 if (Opcode == ISD::AND) {
4407 // AND32ri is the same as AND64ri32 with zext imm.
4408 // Try this before sign extended immediates below.
4409 ShiftedVal = (uint64_t)Val >> ShAmt;
4410 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4411 return true;
4412 // Also swap order when the AND can become MOVZX.
4413 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4414 return true;
4415 }
4416 ShiftedVal = Val >> ShAmt;
4417 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4418 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4419 return true;
4420 if (Opcode != ISD::AND) {
4421 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4422 ShiftedVal = (uint64_t)Val >> ShAmt;
4423 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4424 return true;
4425 }
4426 return false;
4427 };
4428
4429 int64_t ShiftedVal;
4430 if (!CanShrinkImmediate(ShiftedVal))
4431 return false;
4432
4433 // Ok, we can reorder to get a smaller immediate.
4434
4435 // But, it's possible the original immediate allowed an AND to become MOVZX.
4436 // Doing this late to make the MaskedValueIsZero call as late as
4437 // possible.
4438 if (Opcode == ISD::AND) {
4439 // Find the smallest zext this could possibly be.
4440 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4441 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4442
4443 // Figure out which bits need to be zero to achieve that mask.
4444 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4445 ZExtWidth);
4446 NeededMask &= ~Cst->getAPIntValue();
4447
4448 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4449 return false;
4450 }
4451
4452 SDValue X = Shift.getOperand(0);
4453 if (FoundAnyExtend) {
4454 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4455 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4456 X = NewX;
4457 }
4458
4459 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
4460 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4461 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4462 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4463 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4464 Shift.getOperand(1));
4465 ReplaceNode(N, NewSHL.getNode());
4466 SelectCode(NewSHL.getNode());
4467 return true;
4468}
4469
4470bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4471 SDNode *ParentB, SDNode *ParentC,
4472 SDValue A, SDValue B, SDValue C,
4473 uint8_t Imm) {
4474 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4475 C.isOperandOf(ParentC) && "Incorrect parent node");
4476
4477 auto tryFoldLoadOrBCast =
4478 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4479 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4480 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4481 return true;
4482
4483 // Not a load, check for broadcast which may be behind a bitcast.
4484 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4485 P = L.getNode();
4486 L = L.getOperand(0);
4487 }
4488
4489 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4490 return false;
4491
4492 // Only 32 and 64 bit broadcasts are supported.
4493 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4494 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4495 if (Size != 32 && Size != 64)
4496 return false;
4497
4498 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4499 };
4500
4501 bool FoldedLoad = false;
4502 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4503 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4504 FoldedLoad = true;
4505 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4506 Tmp4)) {
4507 FoldedLoad = true;
4508 std::swap(A, C);
4509 // Swap bits 1/4 and 3/6.
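    // Swapping A and C exchanges bit 2 and bit 0 of each truth-table index, so
    // entries 0b001<->0b100 and 0b011<->0b110 trade places while the entries
    // kept by the 0xa5 mask (0, 2, 5 and 7) stay put.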
4510 uint8_t OldImm = Imm;
4511 Imm = OldImm & 0xa5;
4512 if (OldImm & 0x02) Imm |= 0x10;
4513 if (OldImm & 0x10) Imm |= 0x02;
4514 if (OldImm & 0x08) Imm |= 0x40;
4515 if (OldImm & 0x40) Imm |= 0x08;
4516 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4517 Tmp4)) {
4518 FoldedLoad = true;
4519 std::swap(B, C);
4520 // Swap bits 1/2 and 5/6.
4521 uint8_t OldImm = Imm;
4522 Imm = OldImm & 0x99;
4523 if (OldImm & 0x02) Imm |= 0x04;
4524 if (OldImm & 0x04) Imm |= 0x02;
4525 if (OldImm & 0x20) Imm |= 0x40;
4526 if (OldImm & 0x40) Imm |= 0x20;
4527 }
4528
4529 SDLoc DL(Root);
4530
4531 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4532
4533 MVT NVT = Root->getSimpleValueType(0);
4534
4535 MachineSDNode *MNode;
4536 if (FoldedLoad) {
4537 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4538
4539 unsigned Opc;
4540 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4541 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4542 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4543 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4544
4545 bool UseD = EltSize == 32;
4546 if (NVT.is128BitVector())
4547 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4548 else if (NVT.is256BitVector())
4549 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4550 else if (NVT.is512BitVector())
4551 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4552 else
4553 llvm_unreachable("Unexpected vector size!");
4554 } else {
4555 bool UseD = NVT.getVectorElementType() == MVT::i32;
4556 if (NVT.is128BitVector())
4557 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4558 else if (NVT.is256BitVector())
4559 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4560 else if (NVT.is512BitVector())
4561 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4562 else
4563 llvm_unreachable("Unexpected vector size!");
4564 }
4565
4566 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4567 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4568
4569 // Update the chain.
4570 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4571 // Record the mem-refs
4572 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4573 } else {
4574 bool UseD = NVT.getVectorElementType() == MVT::i32;
4575 unsigned Opc;
4576 if (NVT.is128BitVector())
4577 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4578 else if (NVT.is256BitVector())
4579 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4580 else if (NVT.is512BitVector())
4581 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4582 else
4583 llvm_unreachable("Unexpected vector size!");
4584
4585 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4586 }
4587
4588 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4589 CurDAG->RemoveDeadNode(Root);
4590 return true;
4591}
4592
4593// Try to match two logic ops to a VPTERNLOG.
4594// FIXME: Handle more complex patterns that use an operand more than once?
4595bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4596 MVT NVT = N->getSimpleValueType(0);
4597
4598 // Make sure we support VPTERNLOG.
4599 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4600 NVT.getVectorElementType() == MVT::i1)
4601 return false;
4602
4603 // We need VLX for 128/256-bit.
4604 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4605 return false;
4606
4607 SDValue N0 = N->getOperand(0);
4608 SDValue N1 = N->getOperand(1);
4609
4610 auto getFoldableLogicOp = [](SDValue Op) {
4611 // Peek through single use bitcast.
4612 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4613 Op = Op.getOperand(0);
4614
4615 if (!Op.hasOneUse())
4616 return SDValue();
4617
4618 unsigned Opc = Op.getOpcode();
4619 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4620 Opc == X86ISD::ANDNP)
4621 return Op;
4622
4623 return SDValue();
4624 };
4625
4626 SDValue A, FoldableOp;
4627 if ((FoldableOp = getFoldableLogicOp(N1))) {
4628 A = N0;
4629 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4630 A = N1;
4631 } else
4632 return false;
4633
4634 SDValue B = FoldableOp.getOperand(0);
4635 SDValue C = FoldableOp.getOperand(1);
4636 SDNode *ParentA = N;
4637 SDNode *ParentB = FoldableOp.getNode();
4638 SDNode *ParentC = FoldableOp.getNode();
4639
4640 // We can build the appropriate control immediate by performing the logic
4641 // operation we're matching using these constants for A, B, and C.
4642 uint8_t TernlogMagicA = 0xf0;
4643 uint8_t TernlogMagicB = 0xcc;
4644 uint8_t TernlogMagicC = 0xaa;
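// For illustration (not part of the original source), matching
// (and A, (or B, C)) builds the immediate as:
//   Imm = TernlogMagicB | TernlogMagicC = 0xcc | 0xaa = 0xee
//   Imm &= TernlogMagicA               -> 0xee & 0xf0 = 0xe0
// and VPTERNLOG with immediate 0xe0 computes A & (B | C) bitwise.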
4645
4646 // Some of the inputs may be inverted, peek through them and invert the
4647 // magic values accordingly.
4648 // TODO: There may be a bitcast before the xor that we should peek through.
4649 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4650 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4651 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4652 Magic = ~Magic;
4653 Parent = Op.getNode();
4654 Op = Op.getOperand(0);
4655 }
4656 };
4657
4658 PeekThroughNot(A, ParentA, TernlogMagicA);
4659 PeekThroughNot(B, ParentB, TernlogMagicB);
4660 PeekThroughNot(C, ParentC, TernlogMagicC);
4661
4662 uint8_t Imm;
4663 switch (FoldableOp.getOpcode()) {
4664 default: llvm_unreachable("Unexpected opcode!");
4665 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4666 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4667 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4668 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4669 }
4670
4671 switch (N->getOpcode()) {
4672 default: llvm_unreachable("Unexpected opcode!");
4673 case X86ISD::ANDNP:
4674 if (A == N0)
4675 Imm &= ~TernlogMagicA;
4676 else
4677 Imm = ~(Imm) & TernlogMagicA;
4678 break;
4679 case ISD::AND: Imm &= TernlogMagicA; break;
4680 case ISD::OR: Imm |= TernlogMagicA; break;
4681 case ISD::XOR: Imm ^= TernlogMagicA; break;
4682 }
4683
4684 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4685}
4686
4687/// If the high bits of an 'and' operand are known zero, try setting the
4688/// high bits of an 'and' constant operand to produce a smaller encoding by
4689/// creating a small, sign-extended negative immediate rather than a large
4690/// positive one. This reverses a transform in SimplifyDemandedBits that
4691/// shrinks mask constants by clearing bits. There is also a possibility that
4692/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4693/// case, just replace the 'and'. Return 'true' if the node is replaced.
4694bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4695 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4696 // have immediate operands.
4697 MVT VT = And->getSimpleValueType(0);
4698 if (VT != MVT::i32 && VT != MVT::i64)
4699 return false;
4700
4701 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4702 if (!And1C)
4703 return false;
4704
4705 // Bail out if the mask constant is already negative. It can't shrink more.
4706 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4707 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4708 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4709 // are negative too.
4710 APInt MaskVal = And1C->getAPIntValue();
4711 unsigned MaskLZ = MaskVal.countl_zero();
4712 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4713 return false;
4714
4715 // Don't extend into the upper 32 bits of a 64 bit mask.
4716 if (VT == MVT::i64 && MaskLZ >= 32) {
4717 MaskLZ -= 32;
4718 MaskVal = MaskVal.trunc(32);
4719 }
4720
4721 SDValue And0 = And->getOperand(0);
4722 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4723 APInt NegMaskVal = MaskVal | HighZeros;
4724
4725 // If a negative constant would not allow a smaller encoding, there's no need
4726 // to continue. Only change the constant when we know it's a win.
4727 unsigned MinWidth = NegMaskVal.getSignificantBits();
4728 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4729 return false;
4730
4731 // Extend masks if we truncated above.
4732 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4733 NegMaskVal = NegMaskVal.zext(64);
4734 HighZeros = HighZeros.zext(64);
4735 }
4736
4737 // The variable operand must be all zeros in the top bits to allow using the
4738 // new, negative constant as the mask.
4739 if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
4740 return false;
4741
4742 // Check if the mask is -1. In that case, this is an unnecessary instruction
4743 // that escaped earlier analysis.
4744 if (NegMaskVal.isAllOnes()) {
4745 ReplaceNode(And, And0.getNode());
4746 return true;
4747 }
4748
4749 // A negative mask allows a smaller encoding. Create a new 'and' node.
4750 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4751 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4752 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4753 ReplaceNode(And, NewAnd.getNode());
4754 SelectCode(NewAnd.getNode());
4755 return true;
4756}
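// A minimal example of the transform above (illustrative, not from the
// source): given (and i32 %x, 0xF0) where bits 8-31 of %x are known zero,
// the mask can be widened to 0xFFFFFFF0 (-16), so a 4-byte immediate
// (andl $240) becomes a sign-extended 8-bit one (andl $-16) with the same
// result.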
4757
4758static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4759 bool FoldedBCast, bool Masked) {
4760#define VPTESTM_CASE(VT, SUFFIX) \
4761case MVT::VT: \
4762 if (Masked) \
4763 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4764 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4765
4766
4767#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4768default: llvm_unreachable("Unexpected VT!"); \
4769VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4770VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4771VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4772VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4773VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4774VPTESTM_CASE(v8i64, QZ##SUFFIX)
4775
4776#define VPTESTM_FULL_CASES(SUFFIX) \
4777VPTESTM_BROADCAST_CASES(SUFFIX) \
4778VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4779VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4780VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4781VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4782VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4783VPTESTM_CASE(v32i16, WZ##SUFFIX)
4784
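// For example, with a folded broadcast (the "rmb" suffix) a v16i32 compare
// would expand via VPTESTM_CASE to X86::VPTESTMDZrmb, or X86::VPTESTNMDZrmb
// when IsTestN.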
4785 if (FoldedBCast) {
4786 switch (TestVT.SimpleTy) {
4787 VPTESTM_BROADCAST_CASES(rmb)
4788 }
4789 }
4790
4791 if (FoldedLoad) {
4792 switch (TestVT.SimpleTy) {
4793 VPTESTM_FULL_CASES(rm)
4794 }
4795 }
4796
4797 switch (TestVT.SimpleTy) {
4798 VPTESTM_FULL_CASES(rr)
4799 }
4800
4801#undef VPTESTM_FULL_CASES
4802#undef VPTESTM_BROADCAST_CASES
4803#undef VPTESTM_CASE
4804}
4805
4806// Try to create VPTESTM instruction. If InMask is not null, it will be used
4807// to form a masked operation.
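// For instance, (v16i1 setne (and X, Y), 0) can be selected to a single
// vptestmd X, Y rather than a vector AND followed by a compare against zero.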
4808bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4809 SDValue InMask) {
4810 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4811 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4812 "Unexpected VT!");
4813
4814 // Look for equal and not equal compares.
4815 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4816 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4817 return false;
4818
4819 SDValue SetccOp0 = Setcc.getOperand(0);
4820 SDValue SetccOp1 = Setcc.getOperand(1);
4821
4822 // Canonicalize the all zero vector to the RHS.
4823 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4824 std::swap(SetccOp0, SetccOp1);
4825
4826 // See if we're comparing against zero.
4827 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4828 return false;
4829
4830 SDValue N0 = SetccOp0;
4831
4832 MVT CmpVT = N0.getSimpleValueType();
4833 MVT CmpSVT = CmpVT.getVectorElementType();
4834
4835 // Start with both operands the same. We'll try to refine this.
4836 SDValue Src0 = N0;
4837 SDValue Src1 = N0;
4838
4839 {
4840 // Look through single use bitcasts.
4841 SDValue N0Temp = N0;
4842 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4843 N0Temp = N0.getOperand(0);
4844
4845 // Look for single use AND.
4846 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4847 Src0 = N0Temp.getOperand(0);
4848 Src1 = N0Temp.getOperand(1);
4849 }
4850 }
4851
4852 // Without VLX we need to widen the operation.
4853 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4854
4855 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4856 SDValue &Base, SDValue &Scale, SDValue &Index,
4857 SDValue &Disp, SDValue &Segment) {
4858 // If we need to widen, we can't fold the load.
4859 if (!Widen)
4860 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4861 return true;
4862
4863 // If we didn't fold a load, try to match broadcast. No widening limitation
4864 // for this. But only 32 and 64 bit types are supported.
4865 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4866 return false;
4867
4868 // Look through single use bitcasts.
4869 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4870 P = L.getNode();
4871 L = L.getOperand(0);
4872 }
4873
4874 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4875 return false;
4876
4877 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4878 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4879 return false;
4880
4881 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4882 };
4883
4884 // We can only fold loads if the sources are unique.
4885 bool CanFoldLoads = Src0 != Src1;
4886
4887 bool FoldedLoad = false;
4888 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4889 if (CanFoldLoads) {
4890 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4891 Tmp3, Tmp4);
4892 if (!FoldedLoad) {
4893 // And is commutative.
4894 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4895 Tmp2, Tmp3, Tmp4);
4896 if (FoldedLoad)
4897 std::swap(Src0, Src1);
4898 }
4899 }
4900
4901 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4902
4903 bool IsMasked = InMask.getNode() != nullptr;
4904
4905 SDLoc dl(Root);
4906
4907 MVT ResVT = Setcc.getSimpleValueType();
4908 MVT MaskVT = ResVT;
4909 if (Widen) {
4910 // Widen the inputs using insert_subreg or copy_to_regclass.
4911 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4912 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4913 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4914 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4915 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4916 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4917 CmpVT), 0);
4918 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4919
4920 if (!FoldedBCast)
4921 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4922
4923 if (IsMasked) {
4924 // Widen the mask.
4925 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
4926 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4927 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4928 dl, MaskVT, InMask, RC), 0);
4929 }
4930 }
4931
4932 bool IsTestN = CC == ISD::SETEQ;
4933 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4934 IsMasked);
4935
4936 MachineSDNode *CNode;
4937 if (FoldedLoad) {
4938 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4939
4940 if (IsMasked) {
4941 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4942 Src1.getOperand(0) };
4943 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4944 } else {
4945 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4946 Src1.getOperand(0) };
4947 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4948 }
4949
4950 // Update the chain.
4951 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
4952 // Record the mem-refs
4953 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
4954 } else {
4955 if (IsMasked)
4956 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4957 else
4958 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4959 }
4960
4961 // If we widened, we need to shrink the mask VT.
4962 if (Widen) {
4963 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
4964 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4965 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4966 dl, ResVT, SDValue(CNode, 0), RC);
4967 }
4968
4969 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4970 CurDAG->RemoveDeadNode(Root);
4971 return true;
4972}
4973
4974// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4975// into vpternlog.
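// The immediate 0xCA encodes (A & B) | (~A & C): with the A/B/C magic
// constants used by matchVPTERNLOG, (0xf0 & 0xcc) | (0x0f & 0xaa) == 0xCA.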
4976bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
4977 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
4978
4979 MVT NVT = N->getSimpleValueType(0);
4980
4981 // Make sure we support VPTERNLOG.
4982 if (!NVT.isVector() || !Subtarget->hasAVX512())
4983 return false;
4984
4985 // We need VLX for 128/256-bit.
4986 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4987 return false;
4988
4989 SDValue N0 = N->getOperand(0);
4990 SDValue N1 = N->getOperand(1);
4991
4992 // Canonicalize AND to LHS.
4993 if (N1.getOpcode() == ISD::AND)
4994 std::swap(N0, N1);
4995
4996 if (N0.getOpcode() != ISD::AND ||
4997 N1.getOpcode() != X86ISD::ANDNP ||
4998 !N0.hasOneUse() || !N1.hasOneUse())
4999 return false;
5000
5001 // ANDN is not commutable, use it to pin down A and C.
5002 SDValue A = N1.getOperand(0);
5003 SDValue C = N1.getOperand(1);
5004
5005 // AND is commutable; if one operand matches A, the other operand is B.
5006 // Otherwise this isn't a match.
5007 SDValue B;
5008 if (N0.getOperand(0) == A)
5009 B = N0.getOperand(1);
5010 else if (N0.getOperand(1) == A)
5011 B = N0.getOperand(0);
5012 else
5013 return false;
5014
5015 SDLoc dl(N);
5016 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5017 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5018 ReplaceNode(N, Ternlog.getNode());
5019
5020 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5021 Ternlog.getNode(), A, B, C, 0xCA);
5022}
5023
5024void X86DAGToDAGISel::Select(SDNode *Node) {
5025 MVT NVT = Node->getSimpleValueType(0);
5026 unsigned Opcode = Node->getOpcode();
5027 SDLoc dl(Node);
5028
5029 if (Node->isMachineOpcode()) {
5030 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5031 Node->setNodeId(-1);
5032 return; // Already selected.
5033 }
5034
5035 switch (Opcode) {
5036 default: break;
5037 case ISD::INTRINSIC_W_CHAIN: {
5038 unsigned IntNo = Node->getConstantOperandVal(1);
5039 switch (IntNo) {
5040 default: break;
5041 case Intrinsic::x86_encodekey128:
5042 case Intrinsic::x86_encodekey256: {
5043 if (!Subtarget->hasKL())
5044 break;
5045
5046 unsigned Opcode;
5047 switch (IntNo) {
5048 default: llvm_unreachable("Impossible intrinsic");
5049 case Intrinsic::x86_encodekey128:
5050 Opcode = GET_EGPR_IF_ENABLED(X86::ENCODEKEY128);
5051 break;
5052 case Intrinsic::x86_encodekey256:
5053 Opcode = GET_EGPR_IF_ENABLED(X86::ENCODEKEY256);
5054 break;
5055 }
5056
5057 SDValue Chain = Node->getOperand(0);
5058 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5059 SDValue());
5060 if (Opcode == X86::ENCODEKEY256 || Opcode == X86::ENCODEKEY256_EVEX)
5061 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5062 Chain.getValue(1));
5063
5064 MachineSDNode *Res = CurDAG->getMachineNode(
5065 Opcode, dl, Node->getVTList(),
5066 {Node->getOperand(2), Chain, Chain.getValue(1)});
5067 ReplaceNode(Node, Res);
5068 return;
5069 }
5070 case Intrinsic::x86_tileloadd64_internal:
5071 case Intrinsic::x86_tileloaddt164_internal: {
5072 if (!Subtarget->hasAMXTILE())
5073 break;
5074 unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
5075 ? X86::PTILELOADDV
5076 : X86::PTILELOADDT1V;
5077 // _tile_loadd_internal(row, col, buf, STRIDE)
5078 SDValue Base = Node->getOperand(4);
5079 SDValue Scale = getI8Imm(1, dl);
5080 SDValue Index = Node->getOperand(5);
5081 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5082 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5083 SDValue Chain = Node->getOperand(0);
5084 MachineSDNode *CNode;
5085 SDValue Ops[] = {Node->getOperand(2),
5086 Node->getOperand(3),
5087 Base,
5088 Scale,
5089 Index,
5090 Disp,
5091 Segment,
5092 Chain};
5093 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5094 ReplaceNode(Node, CNode);
5095 return;
5096 }
5097 }
5098 break;
5099 }
5100 case ISD::INTRINSIC_VOID: {
5101 unsigned IntNo = Node->getConstantOperandVal(1);
5102 switch (IntNo) {
5103 default: break;
5104 case Intrinsic::x86_sse3_monitor:
5105 case Intrinsic::x86_monitorx:
5106 case Intrinsic::x86_clzero: {
5107 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5108
5109 unsigned Opc = 0;
5110 switch (IntNo) {
5111 default: llvm_unreachable("Unexpected intrinsic!");
5112 case Intrinsic::x86_sse3_monitor:
5113 if (!Subtarget->hasSSE3())
5114 break;
5115 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5116 break;
5117 case Intrinsic::x86_monitorx:
5118 if (!Subtarget->hasMWAITX())
5119 break;
5120 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5121 break;
5122 case Intrinsic::x86_clzero:
5123 if (!Subtarget->hasCLZERO())
5124 break;
5125 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5126 break;
5127 }
5128
5129 if (Opc) {
5130 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5131 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5132 Node->getOperand(2), SDValue());
5133 SDValue InGlue = Chain.getValue(1);
5134
5135 if (IntNo == Intrinsic::x86_sse3_monitor ||
5136 IntNo == Intrinsic::x86_monitorx) {
5137 // Copy the other two operands to ECX and EDX.
5138 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5139 InGlue);
5140 InGlue = Chain.getValue(1);
5141 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5142 InGlue);
5143 InGlue = Chain.getValue(1);
5144 }
5145
5146 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5147 { Chain, InGlue});
5148 ReplaceNode(Node, CNode);
5149 return;
5150 }
5151
5152 break;
5153 }
5154 case Intrinsic::x86_tilestored64_internal: {
5155 unsigned Opc = X86::PTILESTOREDV;
5156 // _tile_stored_internal(row, col, buf, STRIDE, c)
5157 SDValue Base = Node->getOperand(4);
5158 SDValue Scale = getI8Imm(1, dl);
5159 SDValue Index = Node->getOperand(5);
5160 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5161 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5162 SDValue Chain = Node->getOperand(0);
5163 MachineSDNode *CNode;
5164 SDValue Ops[] = {Node->getOperand(2),
5165 Node->getOperand(3),
5166 Base,
5167 Scale,
5168 Index,
5169 Disp,
5170 Segment,
5171 Node->getOperand(6),
5172 Chain};
5173 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5174 ReplaceNode(Node, CNode);
5175 return;
5176 }
5177 case Intrinsic::x86_tileloadd64:
5178 case Intrinsic::x86_tileloaddt164:
5179 case Intrinsic::x86_tilestored64: {
5180 if (!Subtarget->hasAMXTILE())
5181 break;
5182 unsigned Opc;
5183 switch (IntNo) {
5184 default: llvm_unreachable("Unexpected intrinsic!");
5185 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5186 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5187 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5188 }
5189 // FIXME: Match displacement and scale.
5190 unsigned TIndex = Node->getConstantOperandVal(2);
5191 SDValue TReg = getI8Imm(TIndex, dl);
5192 SDValue Base = Node->getOperand(3);
5193 SDValue Scale = getI8Imm(1, dl);
5194 SDValue Index = Node->getOperand(4);
5195 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5196 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5197 SDValue Chain = Node->getOperand(0);
5198 MachineSDNode *CNode;
5199 if (Opc == X86::PTILESTORED) {
5200 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5201 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5202 } else {
5203 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5204 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5205 }
5206 ReplaceNode(Node, CNode);
5207 return;
5208 }
5209 }
5210 break;
5211 }
5212 case ISD::BRIND:
5213 case X86ISD::NT_BRIND: {
5214 if (Subtarget->isTargetNaCl())
5215 // NaCl has its own pass where jmp %r32 instructions are converted to
5216 // jmp %r64. We leave the instruction alone.
5217 break;
5218 if (Subtarget->isTarget64BitILP32()) {
5219 // Converts a 32-bit register to a 64-bit, zero-extended version of
5220 // it. This is needed because x86-64 can do many things, but jmp %r32
5221 // ain't one of them.
5222 SDValue Target = Node->getOperand(1);
5223 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5224 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5225 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5226 Node->getOperand(0), ZextTarget);
5227 ReplaceNode(Node, Brind.getNode());
5228 SelectCode(ZextTarget.getNode());
5229 SelectCode(Brind.getNode());
5230 return;
5231 }
5232 break;
5233 }
5234 case X86ISD::GlobalBaseReg:
5235 ReplaceNode(Node, getGlobalBaseReg());
5236 return;
5237
5238 case ISD::BITCAST:
5239 // Just drop all 128/256/512-bit bitcasts.
5240 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5241 NVT == MVT::f128) {
5242 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5243 CurDAG->RemoveDeadNode(Node);
5244 return;
5245 }
5246 break;
5247
5248 case ISD::SRL:
5249 if (matchBitExtract(Node))
5250 return;
5251 [[fallthrough]];
5252 case ISD::SRA:
5253 case ISD::SHL:
5254 if (tryShiftAmountMod(Node))
5255 return;
5256 break;
5257
5258 case X86ISD::VPTERNLOG: {
5259 uint8_t Imm = Node->getConstantOperandVal(3);
5260 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5261 Node->getOperand(1), Node->getOperand(2), Imm))
5262 return;
5263 break;
5264 }
5265
5266 case X86ISD::ANDNP:
5267 if (tryVPTERNLOG(Node))
5268 return;
5269 break;
5270
5271 case ISD::AND:
5272 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5273 // Try to form a masked VPTESTM. Operands can be in either order.
5274 SDValue N0 = Node->getOperand(0);
5275 SDValue N1 = Node->getOperand(1);
5276 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5277 tryVPTESTM(Node, N0, N1))
5278 return;
5279 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5280 tryVPTESTM(Node, N1, N0))
5281 return;
5282 }
5283
5284 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5285 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5286 CurDAG->RemoveDeadNode(Node);
5287 return;
5288 }
5289 if (matchBitExtract(Node))
5290 return;
5291 if (AndImmShrink && shrinkAndImmediate(Node))
5292 return;
5293
5294 [[fallthrough]];
5295 case ISD::OR:
5296 case ISD::XOR:
5297 if (tryShrinkShlLogicImm(Node))
5298 return;
5299 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5300 return;
5301 if (tryVPTERNLOG(Node))
5302 return;
5303
5304 [[fallthrough]];
5305 case ISD::ADD:
5306 if (Opcode == ISD::ADD && matchBitExtract(Node))
5307 return;
5308 [[fallthrough]];
5309 case ISD::SUB: {
5310 // Try to avoid folding immediates with multiple uses for optsize.
5311 // This code tries to select to register form directly to avoid going
5312 // through the isel table which might fold the immediate. We can't change
5313 // the add/sub/and/or/xor with immediate patterns in the tablegen files to
5314 // check the immediate use count without making the patterns unavailable to
5315 // the fast-isel table.
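// Roughly speaking, if the same 32-bit immediate is added to several values
// when optimizing for size, selecting the register form lets the constant be
// materialized once instead of being encoded into each instruction.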
5316 if (!CurDAG->shouldOptForSize())
5317 break;
5318
5319 // Only handle i8/i16/i32/i64.
5320 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5321 break;
5322
5323 SDValue N0 = Node->getOperand(0);
5324 SDValue N1 = Node->getOperand(1);
5325
5326 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5327 if (!Cst)
5328 break;
5329
5330 int64_t Val = Cst->getSExtValue();
5331
5332 // Make sure it's an immediate that is considered foldable.
5333 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5334 if (!isInt<8>(Val) && !isInt<32>(Val))
5335 break;
5336
5337 // If this can match to INC/DEC, let it go.
5338 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5339 break;
5340
5341 // Check if we should avoid folding this immediate.
5342 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5343 break;
5344
5345 // We should not fold the immediate. So we need a register form instead.
5346 unsigned ROpc, MOpc;
5347 switch (NVT.SimpleTy) {
5348 default: llvm_unreachable("Unexpected VT!");
5349 case MVT::i8:
5350 switch (Opcode) {
5351 default: llvm_unreachable("Unexpected opcode!");
5352 case ISD::ADD:
5353 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5354 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5355 break;
5356 case ISD::SUB:
5357 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5358 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5359 break;
5360 case ISD::AND:
5361 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5362 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5363 break;
5364 case ISD::OR:
5365 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5366 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5367 break;
5368 case ISD::XOR:
5369 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5370 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5371 break;
5372 }
5373 break;
5374 case MVT::i16:
5375 switch (Opcode) {
5376 default: llvm_unreachable("Unexpected opcode!");
5377 case ISD::ADD:
5378 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5379 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5380 break;
5381 case ISD::SUB:
5382 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5383 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5384 break;
5385 case ISD::AND:
5386 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5387 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5388 break;
5389 case ISD::OR:
5390 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5391 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5392 break;
5393 case ISD::XOR:
5394 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5395 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5396 break;
5397 }
5398 break;
5399 case MVT::i32:
5400 switch (Opcode) {
5401 default: llvm_unreachable("Unexpected opcode!");
5402 case ISD::ADD:
5403 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5404 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5405 break;
5406 case ISD::SUB:
5407 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5408 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5409 break;
5410 case ISD::AND:
5411 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5412 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5413 break;
5414 case ISD::OR:
5415 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5416 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5417 break;
5418 case ISD::XOR:
5419 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5420 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5421 break;
5422 }
5423 break;
5424 case MVT::i64:
5425 switch (Opcode) {
5426 default: llvm_unreachable("Unexpected opcode!");
5427 case ISD::ADD:
5428 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5429 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5430 break;
5431 case ISD::SUB:
5432 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5433 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5434 break;
5435 case ISD::AND:
5436 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5437 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5438 break;
5439 case ISD::OR:
5440 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5441 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5442 break;
5443 case ISD::XOR:
5444 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5445 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5446 break;
5447 }
5448 break;
5449 }
5450
5451 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5452
5453 // If this is not a subtract, we can still try to fold a load.
5454 if (Opcode != ISD::SUB) {
5455 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5456 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5457 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5458 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5459 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5460 // Update the chain.
5461 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5462 // Record the mem-refs
5463 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5464 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5465 CurDAG->RemoveDeadNode(Node);
5466 return;
5467 }
5468 }
5469
5470 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5471 return;
5472 }
5473
5474 case X86ISD::SMUL:
5475 // i16/i32/i64 are handled with isel patterns.
5476 if (NVT != MVT::i8)
5477 break;
5478 [[fallthrough]];
5479 case X86ISD::UMUL: {
5480 SDValue N0 = Node->getOperand(0);
5481 SDValue N1 = Node->getOperand(1);
5482
5483 unsigned LoReg, ROpc, MOpc;
5484 switch (NVT.SimpleTy) {
5485 default: llvm_unreachable("Unsupported VT!");
5486 case MVT::i8:
5487 LoReg = X86::AL;
5488 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5489 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5490 break;
5491 case MVT::i16:
5492 LoReg = X86::AX;
5493 ROpc = X86::MUL16r;
5494 MOpc = X86::MUL16m;
5495 break;
5496 case MVT::i32:
5497 LoReg = X86::EAX;
5498 ROpc = X86::MUL32r;
5499 MOpc = X86::MUL32m;
5500 break;
5501 case MVT::i64:
5502 LoReg = X86::RAX;
5503 ROpc = X86::MUL64r;
5504 MOpc = X86::MUL64m;
5505 break;
5506 }
5507
5508 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5509 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5510 // Multiply is commutative.
5511 if (!FoldedLoad) {
5512 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5513 if (FoldedLoad)
5514 std::swap(N0, N1);
5515 }
5516
5517 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5518 N0, SDValue()).getValue(1);
5519
5520 MachineSDNode *CNode;
5521 if (FoldedLoad) {
5522 // i16/i32/i64 use an instruction that produces a low and high result even
5523 // though only the low result is used.
5524 SDVTList VTs;
5525 if (NVT == MVT::i8)
5526 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5527 else
5528 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5529
5530 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5531 InGlue };
5532 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5533
5534 // Update the chain.
5535 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5536 // Record the mem-refs
5537 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5538 } else {
5539 // i16/i32/i64 use an instruction that produces a low and high result even
5540 // though only the low result is used.
5541 SDVTList VTs;
5542 if (NVT == MVT::i8)
5543 VTs = CurDAG->getVTList(NVT, MVT::i32);
5544 else
5545 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5546
5547 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5548 }
5549
5550 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5551 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5552 CurDAG->RemoveDeadNode(Node);
5553 return;
5554 }
5555
5556 case ISD::SMUL_LOHI:
5557 case ISD::UMUL_LOHI: {
5558 SDValue N0 = Node->getOperand(0);
5559 SDValue N1 = Node->getOperand(1);
5560
5561 unsigned Opc, MOpc;
5562 unsigned LoReg, HiReg;
5563 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5564 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5565 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
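// MULX (BMI2) reads its implicit source from EDX/RDX, writes the product to
// two explicit destination registers, and leaves EFLAGS untouched; the
// MULX*H forms below are used when only the high half of the product has uses.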
5566 switch (NVT.SimpleTy) {
5567 default: llvm_unreachable("Unsupported VT!");
5568 case MVT::i32:
5569 Opc = UseMULXHi ? X86::MULX32Hrr
5570 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5571 : IsSigned ? X86::IMUL32r
5572 : X86::MUL32r;
5573 MOpc = UseMULXHi ? X86::MULX32Hrm
5574 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5575 : IsSigned ? X86::IMUL32m
5576 : X86::MUL32m;
5577 LoReg = UseMULX ? X86::EDX : X86::EAX;
5578 HiReg = X86::EDX;
5579 break;
5580 case MVT::i64:
5581 Opc = UseMULXHi ? X86::MULX64Hrr
5582 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5583 : IsSigned ? X86::IMUL64r
5584 : X86::MUL64r;
5585 MOpc = UseMULXHi ? X86::MULX64Hrm
5586 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5587 : IsSigned ? X86::IMUL64m
5588 : X86::MUL64m;
5589 LoReg = UseMULX ? X86::RDX : X86::RAX;
5590 HiReg = X86::RDX;
5591 break;
5592 }
5593
5594 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5595 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5596 // Multiply is commutative.
5597 if (!foldedLoad) {
5598 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5599 if (foldedLoad)
5600 std::swap(N0, N1);
5601 }
5602
5603 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5604 N0, SDValue()).getValue(1);
5605 SDValue ResHi, ResLo;
5606 if (foldedLoad) {
5607 SDValue Chain;
5608 MachineSDNode *CNode = nullptr;
5609 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5610 InGlue };
5611 if (UseMULXHi) {
5612 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5613 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5614 ResHi = SDValue(CNode, 0);
5615 Chain = SDValue(CNode, 1);
5616 } else if (UseMULX) {
5617 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5618 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5619 ResHi = SDValue(CNode, 0);
5620 ResLo = SDValue(CNode, 1);
5621 Chain = SDValue(CNode, 2);
5622 } else {
5623 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5624 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5625 Chain = SDValue(CNode, 0);
5626 InGlue = SDValue(CNode, 1);
5627 }
5628
5629 // Update the chain.
5630 ReplaceUses(N1.getValue(1), Chain);
5631 // Record the mem-refs
5632 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5633 } else {
5634 SDValue Ops[] = { N1, InGlue };
5635 if (UseMULXHi) {
5636 SDVTList VTs = CurDAG->getVTList(NVT);
5637 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5638 ResHi = SDValue(CNode, 0);
5639 } else if (UseMULX) {
5640 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5641 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5642 ResHi = SDValue(CNode, 0);
5643 ResLo = SDValue(CNode, 1);
5644 } else {
5645 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5646 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5647 InGlue = SDValue(CNode, 0);
5648 }
5649 }
5650
5651 // Copy the low half of the result, if it is needed.
5652 if (!SDValue(Node, 0).use_empty()) {
5653 if (!ResLo) {
5654 assert(LoReg && "Register for low half is not defined!");
5655 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5656 NVT, InGlue);
5657 InGlue = ResLo.getValue(2);
5658 }
5659 ReplaceUses(SDValue(Node, 0), ResLo);
5660 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5661 dbgs() << '\n');
5662 }
5663 // Copy the high half of the result, if it is needed.
5664 if (!SDValue(Node, 1).use_empty()) {
5665 if (!ResHi) {
5666 assert(HiReg && "Register for high half is not defined!");
5667 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5668 NVT, InGlue);
5669 InGlue = ResHi.getValue(2);
5670 }
5671 ReplaceUses(SDValue(Node, 1), ResHi);
5672 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5673 dbgs() << '\n');
5674 }
5675
5676 CurDAG->RemoveDeadNode(Node);
5677 return;
5678 }
5679
5680 case ISD::SDIVREM:
5681 case ISD::UDIVREM: {
5682 SDValue N0 = Node->getOperand(0);
5683 SDValue N1 = Node->getOperand(1);
5684
5685 unsigned ROpc, MOpc;
5686 bool isSigned = Opcode == ISD::SDIVREM;
5687 if (!isSigned) {
5688 switch (NVT.SimpleTy) {
5689 default: llvm_unreachable("Unsupported VT!");
5690 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5691 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5692 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5693 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5694 }
5695 } else {
5696 switch (NVT.SimpleTy) {
5697 default: llvm_unreachable("Unsupported VT!");
5698 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5699 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5700 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5701 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5702 }
5703 }
5704
5705 unsigned LoReg, HiReg, ClrReg;
5706 unsigned SExtOpcode;
5707 switch (NVT.SimpleTy) {
5708 default: llvm_unreachable("Unsupported VT!");
5709 case MVT::i8:
5710 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5711 SExtOpcode = 0; // Not used.
5712 break;
5713 case MVT::i16:
5714 LoReg = X86::AX; HiReg = X86::DX;
5715 ClrReg = X86::DX;
5716 SExtOpcode = X86::CWD;
5717 break;
5718 case MVT::i32:
5719 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5720 SExtOpcode = X86::CDQ;
5721 break;
5722 case MVT::i64:
5723 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5724 SExtOpcode = X86::CQO;
5725 break;
5726 }
5727
5728 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5729 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5730 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5731
5732 SDValue InGlue;
5733 if (NVT == MVT::i8) {
5734 // Special case for div8, just use a move with zero extension to AX to
5735 // clear the upper 8 bits (AH).
5736 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5737 MachineSDNode *Move;
5738 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5739 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5740 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5741 : X86::MOVZX16rm8;
5742 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5743 Chain = SDValue(Move, 1);
5744 ReplaceUses(N0.getValue(1), Chain);
5745 // Record the mem-refs
5746 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5747 } else {
5748 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5749 : X86::MOVZX16rr8;
5750 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5751 Chain = CurDAG->getEntryNode();
5752 }
5753 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5754 SDValue());
5755 InGlue = Chain.getValue(1);
5756 } else {
5757 InGlue =
5758 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5759 LoReg, N0, SDValue()).getValue(1);
5760 if (isSigned && !signBitIsZero) {
5761 // Sign extend the low part into the high part.
5762 InGlue =
5763 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5764 } else {
5765 // Zero out the high part, effectively zero extending the input.
5766 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5767 SDValue ClrNode = SDValue(
5768 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
5769 switch (NVT.SimpleTy) {
5770 case MVT::i16:
5771 ClrNode =
5772 SDValue(CurDAG->getMachineNode(
5773 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5774 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5775 MVT::i32)),
5776 0);
5777 break;
5778 case MVT::i32:
5779 break;
5780 case MVT::i64:
5781 ClrNode =
5782 SDValue(CurDAG->getMachineNode(
5783 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5784 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5785 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5786 MVT::i32)),
5787 0);
5788 break;
5789 default:
5790 llvm_unreachable("Unexpected division source");
5791 }
5792
5793 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5794 ClrNode, InGlue).getValue(1);
5795 }
5796 }
5797
5798 if (foldedLoad) {
5799 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5800 InGlue };
5801 MachineSDNode *CNode =
5802 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5803 InGlue = SDValue(CNode, 1);
5804 // Update the chain.
5805 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5806 // Record the mem-refs
5807 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5808 } else {
5809 InGlue =
5810 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
5811 }
5812
5813 // Prevent use of AH in a REX instruction by explicitly copying it to
5814 // an ABCD_L register.
5815 //
5816 // The current assumption of the register allocator is that isel
5817 // won't generate explicit references to the GR8_ABCD_H registers. If
5818 // the allocator and/or the backend get enhanced to be more robust in
5819 // that regard, this can be, and should be, removed.
5820 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
5821 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
5822 unsigned AHExtOpcode =
5823 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5824
5825 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
5826 MVT::Glue, AHCopy, InGlue);
5827 SDValue Result(RNode, 0);
5828 InGlue = SDValue(RNode, 1);
5829
5830 Result =
5831 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
5832
5833 ReplaceUses(SDValue(Node, 1), Result);
5834 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5835 dbgs() << '\n');
5836 }
5837 // Copy the division (low) result, if it is needed.
5838 if (!SDValue(Node, 0).use_empty()) {
5839 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5840 LoReg, NVT, InGlue);
5841 InGlue = Result.getValue(2);
5842 ReplaceUses(SDValue(Node, 0), Result);
5843 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5844 dbgs() << '\n');
5845 }
5846 // Copy the remainder (high) result, if it is needed.
5847 if (!SDValue(Node, 1).use_empty()) {
5848 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5849 HiReg, NVT, InGlue);
5850 InGlue = Result.getValue(2);
5851 ReplaceUses(SDValue(Node, 1), Result);
5852 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5853 dbgs() << '\n');
5854 }
5855 CurDAG->RemoveDeadNode(Node);
5856 return;
5857 }
5858
5859 case X86ISD::FCMP:
5860 case X86ISD::STRICT_FCMP:
5861 case X86ISD::STRICT_FCMPS: {
5862 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
5863 Node->getOpcode() == X86ISD::STRICT_FCMPS;
5864 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
5865 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
5866
5867 // Save the original VT of the compare.
5868 MVT CmpVT = N0.getSimpleValueType();
5869
5870 // Floating point needs special handling if we don't have FCOMI.
5871 if (Subtarget->canUseCMOV())
5872 break;
5873
5874 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
5875
5876 unsigned Opc;
5877 switch (CmpVT.SimpleTy) {
5878 default: llvm_unreachable("Unexpected type!");
5879 case MVT::f32:
5880 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
5881 break;
5882 case MVT::f64:
5883 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
5884 break;
5885 case MVT::f80:
5886 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
5887 break;
5888 }
5889
5890 SDValue Chain =
5891 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
5892 SDValue Glue;
5893 if (IsStrictCmp) {
5894 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5895 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
5896 Glue = Chain.getValue(1);
5897 } else {
5898 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
5899 }
5900
5901 // Move FPSW to AX.
5902 SDValue FNSTSW =
5903 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
5904
5905 // Extract upper 8-bits of AX.
5906 SDValue Extract =
5907 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
5908
5909 // Move AH into flags.
5910 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
5911 assert(Subtarget->canUseLAHFSAHF() &&
5912 "Target doesn't support SAHF or FCOMI?");
5913 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
5914 Chain = AH;
5915 SDValue SAHF = SDValue(
5916 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
5917
5918 if (IsStrictCmp)
5919 ReplaceUses(SDValue(Node, 1), Chain);
5920
5921 ReplaceUses(SDValue(Node, 0), SAHF);
5922 CurDAG->RemoveDeadNode(Node);
5923 return;
5924 }
5925
5926 case X86ISD::CMP: {
5927 SDValue N0 = Node->getOperand(0);
5928 SDValue N1 = Node->getOperand(1);
5929
5930 // Optimizations for TEST compares.
5931 if (!isNullConstant(N1))
5932 break;
5933
5934 // Save the original VT of the compare.
5935 MVT CmpVT = N0.getSimpleValueType();
5936
5937 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5938 // by a test instruction. The test should be removed later by
5939 // analyzeCompare if we are using only the zero flag.
5940 // TODO: Should we check the users and use the BEXTR flags directly?
5941 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
5942 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
5943 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
5944 : X86::TEST32rr;
5945 SDValue BEXTR = SDValue(NewNode, 0);
5946 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
5947 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5948 CurDAG->RemoveDeadNode(Node);
5949 return;
5950 }
5951 }
5952
5953 // We can peek through truncates, but we need to be careful below.
5954 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
5955 N0 = N0.getOperand(0);
5956
5957 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
5958 // use a smaller encoding.
5959 // Look past the truncate if CMP is the only use of it.
5960 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
5961 N0.getValueType() != MVT::i8) {
5962 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5963 if (!MaskC)
5964 break;
5965
5966 // We may have looked through a truncate so mask off any bits that
5967 // shouldn't be part of the compare.
5968 uint64_t Mask = MaskC->getZExtValue();
5969 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
5970
5971 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
5972 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
5973 // zero flag.
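// For example, (and X, 0x0000FFFF00000000) == 0 can become
// shrq $32, %rax; testw %ax, %ax, which avoids materializing the 64-bit
// mask with a movabsq.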
5974 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
5975 onlyUsesZeroFlag(SDValue(Node, 0))) {
5976 unsigned ShiftOpcode = ISD::DELETED_NODE;
5977 unsigned ShiftAmt;
5978 unsigned SubRegIdx;
5979 MVT SubRegVT;
5980 unsigned TestOpcode;
5981 unsigned LeadingZeros = llvm::countl_zero(Mask);
5982 unsigned TrailingZeros = llvm::countr_zero(Mask);
5983
5984 // With leading/trailing zeros, the transform is profitable if we can
5985 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
5986 // incurring any extra register moves.
5987 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
5988 if (LeadingZeros == 0 && SavesBytes) {
5989 // If the mask covers the most significant bit, then we can replace
5990 // TEST+AND with a SHR and check eflags.
5991 // This emits a redundant TEST which is subsequently eliminated.
5992 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
5993 ShiftAmt = TrailingZeros;
5994 SubRegIdx = 0;
5995 TestOpcode = X86::TEST64rr;
5996 } else if (TrailingZeros == 0 && SavesBytes) {
5997 // If the mask covers the least significant bit, then we can replace
5998 // TEST+AND with a SHL and check eflags.
5999 // This emits a redundant TEST which is subsequently eliminated.
6000 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6001 ShiftAmt = LeadingZeros;
6002 SubRegIdx = 0;
6003 TestOpcode = X86::TEST64rr;
6004 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6005 // If the shifted mask extends into the high half and is 8/16/32 bits
6006 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6007 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6008 if (PopCount == 8) {
6009 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6010 ShiftAmt = TrailingZeros;
6011 SubRegIdx = X86::sub_8bit;
6012 SubRegVT = MVT::i8;
6013 TestOpcode = X86::TEST8rr;
6014 } else if (PopCount == 16) {
6015 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6016 ShiftAmt = TrailingZeros;
6017 SubRegIdx = X86::sub_16bit;
6018 SubRegVT = MVT::i16;
6019 TestOpcode = X86::TEST16rr;
6020 } else if (PopCount == 32) {
6021 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6022 ShiftAmt = TrailingZeros;
6023 SubRegIdx = X86::sub_32bit;
6024 SubRegVT = MVT::i32;
6025 TestOpcode = X86::TEST32rr;
6026 }
6027 }
6028 if (ShiftOpcode != ISD::DELETED_NODE) {
6029 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6030 SDValue Shift = SDValue(
6031 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6032 N0.getOperand(0), ShiftC),
6033 0);
6034 if (SubRegIdx != 0) {
6035 Shift =
6036 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6037 }
6038 MachineSDNode *Test =
6039 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6040 ReplaceNode(Node, Test);
6041 return;
6042 }
6043 }
6044
6045 MVT VT;
6046 int SubRegOp;
6047 unsigned ROpc, MOpc;
6048
6049 // For each of these checks we need to be careful if the sign flag is
6050 // being used. It is only safe to use the sign flag in two cases: either
6051 // the sign bit in the shrunken mask is zero, or the final test size is
6052 // equal to the original compare size.
6053
6054 if (isUInt<8>(Mask) &&
6055 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6056 hasNoSignFlagUses(SDValue(Node, 0)))) {
6057 // For example, convert "testl %eax, $8" to "testb %al, $8"
6058 VT = MVT::i8;
6059 SubRegOp = X86::sub_8bit;
6060 ROpc = X86::TEST8ri;
6061 MOpc = X86::TEST8mi;
6062 } else if (OptForMinSize && isUInt<16>(Mask) &&
6063 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6064 hasNoSignFlagUses(SDValue(Node, 0)))) {
6065 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6066 // NOTE: We only want to form TESTW instructions if optimizing for
6067 // min size. Otherwise we only save one byte and possibly get a length
6068 // changing prefix penalty in the decoders.
6069 VT = MVT::i16;
6070 SubRegOp = X86::sub_16bit;
6071 ROpc = X86::TEST16ri;
6072 MOpc = X86::TEST16mi;
6073 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6074 ((!(Mask & 0x80000000) &&
6075 // Without minsize 16-bit Cmps can get here so we need to
6076 // be sure we calculate the correct sign flag if needed.
6077 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6078 CmpVT == MVT::i32 ||
6079 hasNoSignFlagUses(SDValue(Node, 0)))) {
6080 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6081 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6082 // Otherwise, we find ourselves in a position where we have to do
6083 // promotion. If previous passes did not promote the and, we assume
6084 // they had a good reason not to and do not promote here.
6085 VT = MVT::i32;
6086 SubRegOp = X86::sub_32bit;
6087 ROpc = X86::TEST32ri;
6088 MOpc = X86::TEST32mi;
6089 } else {
6090 // No eligible transformation was found.
6091 break;
6092 }
6093
6094 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6095 SDValue Reg = N0.getOperand(0);
6096
6097 // Emit a testl or testw.
6098 MachineSDNode *NewNode;
6099 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6100 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6101 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6102 if (!LoadN->isSimple()) {
6103 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6104 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6105 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6106 (MOpc == X86::TEST32mi && NumVolBits != 32))
6107 break;
6108 }
6109 }
6110 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6111 Reg.getOperand(0) };
6112 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6113 // Update the chain.
6114 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6115 // Record the mem-refs
6116 CurDAG->setNodeMemRefs(NewNode,
6117 {cast<LoadSDNode>(Reg)->getMemOperand()});
6118 } else {
6119 // Extract the subregister if necessary.
6120 if (N0.getValueType() != VT)
6121 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6122
6123 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6124 }
6125 // Replace CMP with TEST.
6126 ReplaceNode(Node, NewNode);
6127 return;
6128 }
6129 break;
6130 }
6131 case X86ISD::PCMPISTR: {
6132 if (!Subtarget->hasSSE42())
6133 break;
6134
6135 bool NeedIndex = !SDValue(Node, 0).use_empty();
6136 bool NeedMask = !SDValue(Node, 1).use_empty();
6137 // We can't fold a load if we are going to make two instructions.
6138 bool MayFoldLoad = !NeedIndex || !NeedMask;
6139
6140 MachineSDNode *CNode;
6141 if (NeedMask) {
6142 unsigned ROpc =
6143 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6144 unsigned MOpc =
6145 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6146 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6147 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6148 }
6149 if (NeedIndex || !NeedMask) {
6150 unsigned ROpc =
6151 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6152 unsigned MOpc =
6153 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6154 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6155 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6156 }
6157
6158 // Connect the flag usage to the last instruction created.
6159 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6160 CurDAG->RemoveDeadNode(Node);
6161 return;
6162 }
6163 case X86ISD::PCMPESTR: {
6164 if (!Subtarget->hasSSE42())
6165 break;
6166
6167 // Copy the two implicit register inputs.
6168 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6169 Node->getOperand(1),
6170 SDValue()).getValue(1);
6171 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6172 Node->getOperand(3), InGlue).getValue(1);
6173
6174 bool NeedIndex = !SDValue(Node, 0).use_empty();
6175 bool NeedMask = !SDValue(Node, 1).use_empty();
6176 // We can't fold a load if we are going to make two instructions.
6177 bool MayFoldLoad = !NeedIndex || !NeedMask;
6178
6179 MachineSDNode *CNode;
6180 if (NeedMask) {
6181 unsigned ROpc =
6182 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6183 unsigned MOpc =
6184 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6185 CNode =
6186 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6187 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6188 }
6189 if (NeedIndex || !NeedMask) {
6190 unsigned ROpc =
6191 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6192 unsigned MOpc =
6193 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6194 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6195 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6196 }
6197 // Connect the flag usage to the last instruction created.
6198 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6199 CurDAG->RemoveDeadNode(Node);
6200 return;
6201 }
6202
6203 case ISD::SETCC: {
6204 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6205 return;
6206
6207 break;
6208 }
6209
6210 case ISD::STORE:
6211 if (foldLoadStoreIntoMemOperand(Node))
6212 return;
6213 break;
6214
6215 case X86ISD::SETCC_CARRY: {
6216 MVT VT = Node->getSimpleValueType(0);
6217 SDValue Result;
6218 if (Subtarget->hasSBBDepBreaking()) {
6219 // We have to do this manually because tblgen will put the eflags copy in
6220 // the wrong place if we use an extract_subreg in the pattern.
6221 // Copy flags to the EFLAGS register and glue it to next node.
6222 SDValue EFLAGS =
6223 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6224 Node->getOperand(1), SDValue());
6225
6226 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6227 // 32-bit version.
6228 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6229 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6230 Result = SDValue(
6231 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6232 0);
6233 } else {
6234 // The target does not recognize sbb with the same reg operand as a
6235 // no-source idiom, so we explicitly zero the input values.
6236 Result = getSBBZero(Node);
6237 }
6238
6239 // For less than 32-bits we need to extract from the 32-bit node.
6240 if (VT == MVT::i8 || VT == MVT::i16) {
6241 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6242 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6243 }
6244
6245 ReplaceUses(SDValue(Node, 0), Result);
6246 CurDAG->RemoveDeadNode(Node);
6247 return;
6248 }
6249 case X86ISD::SBB: {
6250 if (isNullConstant(Node->getOperand(0)) &&
6251 isNullConstant(Node->getOperand(1))) {
6252 SDValue Result = getSBBZero(Node);
6253
6254 // Replace the flag use.
6255 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6256
6257 // Replace the result use.
6258 if (!SDValue(Node, 0).use_empty()) {
6259 // For less than 32-bits we need to extract from the 32-bit node.
6260 MVT VT = Node->getSimpleValueType(0);
6261 if (VT == MVT::i8 || VT == MVT::i16) {
6262 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6263 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6264 }
6265 ReplaceUses(SDValue(Node, 0), Result);
6266 }
6267
6268 CurDAG->RemoveDeadNode(Node);
6269 return;
6270 }
6271 break;
6272 }
6273 case X86ISD::MGATHER: {
6274 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6275 SDValue IndexOp = Mgt->getIndex();
6276 SDValue Mask = Mgt->getMask();
6277 MVT IndexVT = IndexOp.getSimpleValueType();
6278 MVT ValueVT = Node->getSimpleValueType(0);
6279 MVT MaskVT = Mask.getSimpleValueType();
6280
6281    // This is just to prevent crashes if the nodes are malformed somehow. We
6282    // otherwise only do loose type checking here, based on what a type
6283    // constraint would say, just like table-based isel.
6284 if (!ValueVT.isVector() || !MaskVT.isVector())
6285 break;
6286
6287 unsigned NumElts = ValueVT.getVectorNumElements();
6288 MVT ValueSVT = ValueVT.getVectorElementType();
6289
6290 bool IsFP = ValueSVT.isFloatingPoint();
6291 unsigned EltSize = ValueSVT.getSizeInBits();
6292
6293 unsigned Opc = 0;
6294 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
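    // Descriptive note: the opcode is chosen from the index type (dword vs.
    // qword indices), the number of result elements, and the element width.
    // AVX-512 forms use a k-register mask; the VEX-encoded forms in the else
    // branch take a vector mask operand instead.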
6295 if (AVX512Gather) {
6296 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6297 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6298 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6299 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6300 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6301 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6302 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6303 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6304 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6305 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6306 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6307 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6308 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6309 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6310 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6311 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6312 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6313 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6314 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6315 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6316 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6317 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6318 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6319 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6320 } else {
6321 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6322 "Unexpected mask VT!");
6323 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6324 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6325 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6326 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6327 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6328 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6329 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6330 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6331 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6332 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6333 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6334 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6335 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6336 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6337 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6338 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6339 }
6340
6341 if (!Opc)
6342 break;
6343
6344 SDValue Base, Scale, Index, Disp, Segment;
6345 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6346 Base, Scale, Index, Disp, Segment))
6347 break;
6348
6349 SDValue PassThru = Mgt->getPassThru();
6350 SDValue Chain = Mgt->getChain();
6351    // Gather instructions have a mask output that is not present in the ISD node.
6352 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6353
6354 MachineSDNode *NewNode;
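    // Descriptive note: the AVX-512 forms take the mask immediately after the
    // pass-thru value, while the VEX forms take it after the memory operands.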
6355 if (AVX512Gather) {
6356 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6357 Index, Disp, Segment, Chain};
6358 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6359 } else {
6360 SDValue Ops[] = {PassThru, Base, Scale, Index,
6361 Disp, Segment, Mask, Chain};
6362 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6363 }
6364 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6365 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6366 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6367 CurDAG->RemoveDeadNode(Node);
6368 return;
6369 }
6370 case X86ISD::MSCATTER: {
6371 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6372 SDValue Value = Sc->getValue();
6373 SDValue IndexOp = Sc->getIndex();
6374 MVT IndexVT = IndexOp.getSimpleValueType();
6375 MVT ValueVT = Value.getSimpleValueType();
6376
6377    // This is just to prevent crashes if the nodes are malformed somehow. We
6378    // otherwise only do loose type checking here, based on what a type
6379    // constraint would say, just like table-based isel.
6380 if (!ValueVT.isVector())
6381 break;
6382
6383 unsigned NumElts = ValueVT.getVectorNumElements();
6384 MVT ValueSVT = ValueVT.getVectorElementType();
6385
6386 bool IsFP = ValueSVT.isFloatingPoint();
6387 unsigned EltSize = ValueSVT.getSizeInBits();
6388
6389 unsigned Opc;
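    // Descriptive note: scatters are AVX-512 only; the opcode is selected from
    // the index type, element count, and element width, mirroring the gather
    // selection above.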
6390 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6391 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6392 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6393 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6394 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6395 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6396 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6397 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6398 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6399 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6400 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6401 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6402 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6403 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6404 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6405 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6406 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6407 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6408 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6409 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6410 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6411 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6412 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6413 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6414 else
6415 break;
6416
6417 SDValue Base, Scale, Index, Disp, Segment;
6418 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6419 Base, Scale, Index, Disp, Segment))
6420 break;
6421
6422 SDValue Mask = Sc->getMask();
6423 SDValue Chain = Sc->getChain();
6424    // Scatter instructions have a mask output that is not present in the ISD node.
6425 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6426 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6427
6428 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6429 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6430 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6431 CurDAG->RemoveDeadNode(Node);
6432 return;
6433 }
6434  case ISD::PREALLOCATED_SETUP: {
6435    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6436 auto CallId = MFI->getPreallocatedIdForCallSite(
6437 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6438 SDValue Chain = Node->getOperand(0);
6439 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6440 MachineSDNode *New = CurDAG->getMachineNode(
6441 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6442 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6443 CurDAG->RemoveDeadNode(Node);
6444 return;
6445 }
6446 case ISD::PREALLOCATED_ARG: {
6447 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6448 auto CallId = MFI->getPreallocatedIdForCallSite(
6449 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6450 SDValue Chain = Node->getOperand(0);
6451 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6452 SDValue ArgIndex = Node->getOperand(2);
6453 SDValue Ops[3];
6454 Ops[0] = CallIdValue;
6455 Ops[1] = ArgIndex;
6456 Ops[2] = Chain;
6457 MachineSDNode *New = CurDAG->getMachineNode(
6458 TargetOpcode::PREALLOCATED_ARG, dl,
6459 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6460 MVT::Other),
6461 Ops);
6462 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6463 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6464 CurDAG->RemoveDeadNode(Node);
6465 return;
6466 }
6467  case X86ISD::AESENCWIDE128KL:
6468  case X86ISD::AESDECWIDE128KL:
6469  case X86ISD::AESENCWIDE256KL:
6470  case X86ISD::AESDECWIDE256KL: {
6471    if (!Subtarget->hasWIDEKL())
6472 break;
6473
6474 unsigned Opcode;
6475 switch (Node->getOpcode()) {
6476 default:
6477      llvm_unreachable("Unexpected opcode!");
6478    case X86ISD::AESENCWIDE128KL:
6479      Opcode = GET_EGPR_IF_ENABLED(X86::AESENCWIDE128KL);
6480      break;
6481    case X86ISD::AESDECWIDE128KL:
6482      Opcode = GET_EGPR_IF_ENABLED(X86::AESDECWIDE128KL);
6483      break;
6484    case X86ISD::AESENCWIDE256KL:
6485      Opcode = GET_EGPR_IF_ENABLED(X86::AESENCWIDE256KL);
6486      break;
6487    case X86ISD::AESDECWIDE256KL:
6488      Opcode = GET_EGPR_IF_ENABLED(X86::AESDECWIDE256KL);
6489      break;
6490#undef GET_EGPR_IF_ENABLED
6491 }
6492
6493 SDValue Chain = Node->getOperand(0);
6494 SDValue Addr = Node->getOperand(1);
6495
6496 SDValue Base, Scale, Index, Disp, Segment;
6497 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6498 break;
6499
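    // Descriptive note: the wide key-locker AES instructions consume eight
    // 128-bit blocks implicitly in XMM0-XMM7, so operands 2-9 are threaded
    // through a glued CopyToReg chain before the instruction is emitted.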
6500 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6501 SDValue());
6502 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6503 Chain.getValue(1));
6504 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6505 Chain.getValue(1));
6506 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6507 Chain.getValue(1));
6508 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6509 Chain.getValue(1));
6510 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6511 Chain.getValue(1));
6512 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6513 Chain.getValue(1));
6514 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6515 Chain.getValue(1));
6516
6517 MachineSDNode *Res = CurDAG->getMachineNode(
6518 Opcode, dl, Node->getVTList(),
6519 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6520    CurDAG->setNodeMemRefs(Res, {cast<MemSDNode>(Node)->getMemOperand()});
6521 ReplaceNode(Node, Res);
6522 return;
6523 }
6524 }
6525
6526 SelectCode(Node);
6527}
6528
6529bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6530 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6531 std::vector<SDValue> &OutOps) {
6532 SDValue Op0, Op1, Op2, Op3, Op4;
6533 switch (ConstraintID) {
6534 default:
6535 llvm_unreachable("Unexpected asm memory constraint");
6536 case InlineAsm::ConstraintCode::o: // offsetable ??
6537 case InlineAsm::ConstraintCode::v: // not offsetable ??
6538 case InlineAsm::ConstraintCode::m: // memory
6539 case InlineAsm::ConstraintCode::X:
6540 case InlineAsm::ConstraintCode::p: // address
6541 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6542 return true;
6543 break;
6544 }
6545
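  // Descriptive note: the five components of an x86 memory operand produced by
  // selectAddr are returned in order: base, scale, index, displacement, segment.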
6546 OutOps.push_back(Op0);
6547 OutOps.push_back(Op1);
6548 OutOps.push_back(Op2);
6549 OutOps.push_back(Op3);
6550 OutOps.push_back(Op4);
6551 return false;
6552}
6553
6554/// This pass converts a legalized DAG into an X86-specific DAG,
6555/// ready for instruction scheduling.
6556FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6557                                     CodeGenOptLevel OptLevel) {
6558 return new X86DAGToDAGISel(TM, OptLevel);
6559}