[llvm] r316883 - [X86] Rearrange code in X86InstrInfo.cpp to put all the foldMemoryOperandImpl methods together without partial/undef register handling in the middle. NFC
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 29 21:39:18 PDT 2017
Author: ctopper
Date: Sun Oct 29 21:39:18 2017
New Revision: 316883
URL: http://llvm.org/viewvc/llvm-project?rev=316883&view=rev
Log:
[X86] Rearrange code in X86InstrInfo.cpp to put all the foldMemoryOperandImpl methods together without partial/undef register handling in the middle. NFC
I have a future patch that wants to make use of one of the partial/undef register handling functions in one of the earlier memory folding methods, and the current ordering prevents that.
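For readers unfamiliar with these hooks, the sketch below shows roughly how a dependency-fixing pass such as ExecutionDepsFix consumes them. It is an illustration only, not the pass's actual code: the helper name, the InstrsSinceLastDef bookkeeping, and the include paths are assumptions for this example; only the three hook signatures (getPartialRegUpdateClearance, getUndefRegClearance, breakPartialRegDependency) come from the diff below.

// Illustrative sketch only -- not the real ExecutionDepsFix implementation.
// Shows how the three hooks moved by this patch fit together.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"

using namespace llvm;

// Decide whether to insert a dependency-breaking idiom (xorps/vxorps on x86)
// before MI. InstrsSinceLastDef approximates how many instructions ago the
// register read by operand OpNum was last defined (hypothetical bookkeeping).
static void maybeBreakFalseDependency(MachineInstr &MI, unsigned OpNum,
                                      unsigned InstrsSinceLastDef,
                                      const TargetInstrInfo *TII,
                                      const TargetRegisterInfo *TRI) {
  // Partial-register updates (cvtss2sd, sqrtss, ...): the target reports how
  // many idle instructions it would like before such an update.
  unsigned Pref = TII->getPartialRegUpdateClearance(MI, OpNum, TRI);

  // Undef register reads (the VCVTSI2SD family): the target reports the
  // desired clearance and which operand carries the undef read.
  unsigned UndefOp = 0;
  if (unsigned UndefPref = TII->getUndefRegClearance(MI, UndefOp, TRI)) {
    Pref = UndefPref;
    OpNum = UndefOp;
  }

  // If the last def is closer than the requested clearance, break the false
  // dependency with a cheap idiom instead of waiting on the stale value.
  if (Pref && InstrsSinceLastDef < Pref)
    TII->breakPartialRegDependency(MI, OpNum, TRI);
}

On x86 the dependency-breaking idiom that breakPartialRegDependency inserts is the xorps/vxorps sequence shown near the top of the diff.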
Modified:
llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=316883&r1=316882&r2=316883&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Sun Oct 29 21:39:18 2017
@@ -7964,6 +7964,276 @@ bool X86InstrInfo::expandPostRAPseudo(Ma
return false;
}
+/// Return true for all instructions that only update
+/// the first 32 or 64 bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and it can improve
+/// performance. e.g.:
+///
+/// movss (%rdi), %xmm0
+/// cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+/// cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+///
+static bool hasPartialRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::CVTSI2SSrr:
+ case X86::CVTSI2SSrm:
+ case X86::CVTSI2SS64rr:
+ case X86::CVTSI2SS64rm:
+ case X86::CVTSI2SDrr:
+ case X86::CVTSI2SDrm:
+ case X86::CVTSI2SD64rr:
+ case X86::CVTSI2SD64rm:
+ case X86::CVTSD2SSrr:
+ case X86::CVTSD2SSrm:
+ case X86::CVTSS2SDrr:
+ case X86::CVTSS2SDrm:
+ case X86::MOVHPDrm:
+ case X86::MOVHPSrm:
+ case X86::MOVLPDrm:
+ case X86::MOVLPSrm:
+ case X86::RCPSSr:
+ case X86::RCPSSm:
+ case X86::RCPSSr_Int:
+ case X86::RCPSSm_Int:
+ case X86::ROUNDSDr:
+ case X86::ROUNDSDm:
+ case X86::ROUNDSSr:
+ case X86::ROUNDSSm:
+ case X86::RSQRTSSr:
+ case X86::RSQRTSSm:
+ case X86::RSQRTSSr_Int:
+ case X86::RSQRTSSm_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSm:
+ case X86::SQRTSSr_Int:
+ case X86::SQRTSSm_Int:
+ case X86::SQRTSDr:
+ case X86::SQRTSDm:
+ case X86::SQRTSDr_Int:
+ case X86::SQRTSDm_Int:
+ return true;
+ }
+
+ return false;
+}
+
+/// Inform the ExecutionDepsFix pass how many idle
+/// instructions we would like before a partial register update.
+unsigned X86InstrInfo::getPartialRegUpdateClearance(
+ const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
+ return 0;
+
+ // If MI is marked as reading Reg, the partial register update is wanted.
+ const MachineOperand &MO = MI.getOperand(0);
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (MO.readsReg() || MI.readsVirtualRegister(Reg))
+ return 0;
+ } else {
+ if (MI.readsRegister(Reg, TRI))
+ return 0;
+ }
+
+ // If any instructions in the clearance range are reading Reg, insert a
+ // dependency breaking instruction, which is inexpensive and is likely to
+ // be hidden in other instructions' cycles.
+ return PartialRegUpdateClearance;
+}
+
+// Return true for any instruction that copies the high bits of the first source
+// operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::VCVTSI2SSrr:
+ case X86::VCVTSI2SSrm:
+ case X86::Int_VCVTSI2SSrr:
+ case X86::Int_VCVTSI2SSrm:
+ case X86::VCVTSI2SS64rr:
+ case X86::VCVTSI2SS64rm:
+ case X86::Int_VCVTSI2SS64rr:
+ case X86::Int_VCVTSI2SS64rm:
+ case X86::VCVTSI2SDrr:
+ case X86::VCVTSI2SDrm:
+ case X86::Int_VCVTSI2SDrr:
+ case X86::Int_VCVTSI2SDrm:
+ case X86::VCVTSI2SD64rr:
+ case X86::VCVTSI2SD64rm:
+ case X86::Int_VCVTSI2SD64rr:
+ case X86::Int_VCVTSI2SD64rm:
+ case X86::VCVTSD2SSrr:
+ case X86::VCVTSD2SSrm:
+ case X86::Int_VCVTSD2SSrr:
+ case X86::Int_VCVTSD2SSrm:
+ case X86::VCVTSS2SDrr:
+ case X86::VCVTSS2SDrm:
+ case X86::Int_VCVTSS2SDrr:
+ case X86::Int_VCVTSS2SDrm:
+ case X86::VRCPSSr:
+ case X86::VRCPSSr_Int:
+ case X86::VRCPSSm:
+ case X86::VRCPSSm_Int:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDm:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSDm_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSm:
+ case X86::VROUNDSSr_Int:
+ case X86::VROUNDSSm_Int:
+ case X86::VRSQRTSSr:
+ case X86::VRSQRTSSr_Int:
+ case X86::VRSQRTSSm:
+ case X86::VRSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ // AVX-512
+ case X86::VCVTSI2SSZrr:
+ case X86::VCVTSI2SSZrm:
+ case X86::VCVTSI2SSZrr_Int:
+ case X86::VCVTSI2SSZrrb_Int:
+ case X86::VCVTSI2SSZrm_Int:
+ case X86::VCVTSI642SSZrr:
+ case X86::VCVTSI642SSZrm:
+ case X86::VCVTSI642SSZrr_Int:
+ case X86::VCVTSI642SSZrrb_Int:
+ case X86::VCVTSI642SSZrm_Int:
+ case X86::VCVTSI2SDZrr:
+ case X86::VCVTSI2SDZrm:
+ case X86::VCVTSI2SDZrr_Int:
+ case X86::VCVTSI2SDZrrb_Int:
+ case X86::VCVTSI2SDZrm_Int:
+ case X86::VCVTSI642SDZrr:
+ case X86::VCVTSI642SDZrm:
+ case X86::VCVTSI642SDZrr_Int:
+ case X86::VCVTSI642SDZrrb_Int:
+ case X86::VCVTSI642SDZrm_Int:
+ case X86::VCVTUSI2SSZrr:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI2SSZrr_Int:
+ case X86::VCVTUSI2SSZrrb_Int:
+ case X86::VCVTUSI2SSZrm_Int:
+ case X86::VCVTUSI642SSZrr:
+ case X86::VCVTUSI642SSZrm:
+ case X86::VCVTUSI642SSZrr_Int:
+ case X86::VCVTUSI642SSZrrb_Int:
+ case X86::VCVTUSI642SSZrm_Int:
+ case X86::VCVTUSI2SDZrr:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI2SDZrr_Int:
+ case X86::VCVTUSI2SDZrm_Int:
+ case X86::VCVTUSI642SDZrr:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI642SDZrr_Int:
+ case X86::VCVTUSI642SDZrrb_Int:
+ case X86::VCVTUSI642SDZrm_Int:
+ case X86::VCVTSD2SSZrr:
+ case X86::VCVTSD2SSZrr_Int:
+ case X86::VCVTSD2SSZrrb_Int:
+ case X86::VCVTSD2SSZrm:
+ case X86::VCVTSD2SSZrm_Int:
+ case X86::VCVTSS2SDZrr:
+ case X86::VCVTSS2SDZrr_Int:
+ case X86::VCVTSS2SDZrrb_Int:
+ case X86::VCVTSS2SDZrm:
+ case X86::VCVTSS2SDZrm_Int:
+ case X86::VRNDSCALESDr:
+ case X86::VRNDSCALESDrb:
+ case X86::VRNDSCALESDm:
+ case X86::VRNDSCALESSr:
+ case X86::VRNDSCALESSrb:
+ case X86::VRNDSCALESSm:
+ case X86::VRCP14SSrr:
+ case X86::VRCP14SSrm:
+ case X86::VRSQRT14SSrr:
+ case X86::VRSQRT14SSrm:
+ case X86::VSQRTSSZr:
+ case X86::VSQRTSSZr_Int:
+ case X86::VSQRTSSZrb_Int:
+ case X86::VSQRTSSZm:
+ case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSDZr:
+ case X86::VSQRTSDZr_Int:
+ case X86::VSQRTSDZrb_Int:
+ case X86::VSQRTSDZm:
+ case X86::VSQRTSDZm_Int:
+ return true;
+ }
+
+ return false;
+}
+
+/// Inform the ExecutionDepsFix pass how many idle instructions we would like
+/// before certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+///
+/// We should be careful *not* to catch VXOR idioms, which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned
+X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (!hasUndefRegUpdate(MI.getOpcode()))
+ return 0;
+
+ // Set the OpNum parameter to the first source operand.
+ OpNum = 1;
+
+ const MachineOperand &MO = MI.getOperand(OpNum);
+ if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ return UndefRegClearance;
+ }
+ return 0;
+}
+
+void X86InstrInfo::breakPartialRegDependency(
+ MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+ unsigned Reg = MI.getOperand(OpNum).getReg();
+ // If MI kills this register, the false dependence is already broken.
+ if (MI.killsRegister(Reg, TRI))
+ return;
+
+ if (X86::VR128RegClass.contains(Reg)) {
+ // These instructions are all floating point domain, so xorps is the best
+ // choice.
+ unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::VR256RegClass.contains(Reg)) {
+ // Use vxorps to clear the full ymm register.
+ // It wants to read and write the xmm sub-register.
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
+ .addReg(XReg, RegState::Undef)
+ .addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ MI.addRegisterKilled(Reg, TRI, true);
+ }
+}
+
static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
int PtrOffset = 0) {
unsigned NumAddrOps = MOs.size();
@@ -8196,362 +8466,92 @@ MachineInstr *X86InstrInfo::foldMemoryOp
// If this is a 64-bit load, but the spill slot is 32, then we can do
// a 32-bit load which is implicitly zero-extended. This likely is
// due to live interval analysis remat'ing a load from stack slot.
- if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
- return nullptr;
- Opcode = X86::MOV32rm;
- NarrowToMOV32rm = true;
- }
- }
-
- if (isTwoAddrFold)
- NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
- else
- NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
-
- if (NarrowToMOV32rm) {
- // This is the special case where we use a MOV32rm to load a 32-bit
- // value and zero-extend the top bits. Change the destination register
- // to a 32-bit one.
- unsigned DstReg = NewMI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
- NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
- else
- NewMI->getOperand(0).setSubReg(X86::sub_32bit);
- }
- return NewMI;
- }
- }
-
- // If the instruction and target operand are commutable, commute the
- // instruction and try again.
- if (AllowCommute) {
- unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
- if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
- bool HasDef = MI.getDesc().getNumDefs();
- unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
- unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
- unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
- bool Tied1 =
- 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
- bool Tied2 =
- 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
-
- // If either of the commutable operands are tied to the destination
- // then we can not commute + fold.
- if ((HasDef && Reg0 == Reg1 && Tied1) ||
- (HasDef && Reg0 == Reg2 && Tied2))
- return nullptr;
-
- MachineInstr *CommutedMI =
- commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
- if (!CommutedMI) {
- // Unable to commute.
- return nullptr;
- }
- if (CommutedMI != &MI) {
- // New instruction. We can't fold from this.
- CommutedMI->eraseFromParent();
- return nullptr;
- }
-
- // Attempt to fold with the commuted version of the instruction.
- NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
- Size, Align, /*AllowCommute=*/false);
- if (NewMI)
- return NewMI;
-
- // Folding failed again - undo the commute before returning.
- MachineInstr *UncommutedMI =
- commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
- if (!UncommutedMI) {
- // Unable to commute.
- return nullptr;
- }
- if (UncommutedMI != &MI) {
- // New instruction. It doesn't need to be kept.
- UncommutedMI->eraseFromParent();
- return nullptr;
- }
-
- // Return here to prevent duplicate fuse failure report.
- return nullptr;
- }
- }
-
- // No fusion
- if (PrintFailedFusing && !MI.isCopy())
- dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
- return nullptr;
-}
-
-/// Return true for all instructions that only update
-/// the first 32 or 64 bits of the destination register and leave the rest
-/// unmodified. This can be used to avoid folding loads if the instructions
-/// only update part of the destination register, and the non-updated part is
-/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
-/// instructions breaks the partial register dependency and it can improve
-/// performance. e.g.:
-///
-/// movss (%rdi), %xmm0
-/// cvtss2sd %xmm0, %xmm0
-///
-/// Instead of
-/// cvtss2sd (%rdi), %xmm0
-///
-/// FIXME: This should be turned into a TSFlags.
-///
-static bool hasPartialRegUpdate(unsigned Opcode) {
- switch (Opcode) {
- case X86::CVTSI2SSrr:
- case X86::CVTSI2SSrm:
- case X86::CVTSI2SS64rr:
- case X86::CVTSI2SS64rm:
- case X86::CVTSI2SDrr:
- case X86::CVTSI2SDrm:
- case X86::CVTSI2SD64rr:
- case X86::CVTSI2SD64rm:
- case X86::CVTSD2SSrr:
- case X86::CVTSD2SSrm:
- case X86::CVTSS2SDrr:
- case X86::CVTSS2SDrm:
- case X86::MOVHPDrm:
- case X86::MOVHPSrm:
- case X86::MOVLPDrm:
- case X86::MOVLPSrm:
- case X86::RCPSSr:
- case X86::RCPSSm:
- case X86::RCPSSr_Int:
- case X86::RCPSSm_Int:
- case X86::ROUNDSDr:
- case X86::ROUNDSDm:
- case X86::ROUNDSSr:
- case X86::ROUNDSSm:
- case X86::RSQRTSSr:
- case X86::RSQRTSSm:
- case X86::RSQRTSSr_Int:
- case X86::RSQRTSSm_Int:
- case X86::SQRTSSr:
- case X86::SQRTSSm:
- case X86::SQRTSSr_Int:
- case X86::SQRTSSm_Int:
- case X86::SQRTSDr:
- case X86::SQRTSDm:
- case X86::SQRTSDr_Int:
- case X86::SQRTSDm_Int:
- return true;
- }
-
- return false;
-}
-
-/// Inform the ExecutionDepsFix pass how many idle
-/// instructions we would like before a partial register update.
-unsigned X86InstrInfo::getPartialRegUpdateClearance(
- const MachineInstr &MI, unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
- return 0;
-
- // If MI is marked as reading Reg, the partial register update is wanted.
- const MachineOperand &MO = MI.getOperand(0);
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- if (MO.readsReg() || MI.readsVirtualRegister(Reg))
- return 0;
- } else {
- if (MI.readsRegister(Reg, TRI))
- return 0;
- }
+ if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+ return nullptr;
+ Opcode = X86::MOV32rm;
+ NarrowToMOV32rm = true;
+ }
+ }
- // If any instructions in the clearance range are reading Reg, insert a
- // dependency breaking instruction, which is inexpensive and is likely to
- // be hidden in other instructions' cycles.
- return PartialRegUpdateClearance;
-}
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
+ else
+ NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
-// Return true for any instruction that copies the high bits of the first source
-// operand into the unused high bits of the destination operand.
-static bool hasUndefRegUpdate(unsigned Opcode) {
- switch (Opcode) {
- case X86::VCVTSI2SSrr:
- case X86::VCVTSI2SSrm:
- case X86::Int_VCVTSI2SSrr:
- case X86::Int_VCVTSI2SSrm:
- case X86::VCVTSI2SS64rr:
- case X86::VCVTSI2SS64rm:
- case X86::Int_VCVTSI2SS64rr:
- case X86::Int_VCVTSI2SS64rm:
- case X86::VCVTSI2SDrr:
- case X86::VCVTSI2SDrm:
- case X86::Int_VCVTSI2SDrr:
- case X86::Int_VCVTSI2SDrm:
- case X86::VCVTSI2SD64rr:
- case X86::VCVTSI2SD64rm:
- case X86::Int_VCVTSI2SD64rr:
- case X86::Int_VCVTSI2SD64rm:
- case X86::VCVTSD2SSrr:
- case X86::VCVTSD2SSrm:
- case X86::Int_VCVTSD2SSrr:
- case X86::Int_VCVTSD2SSrm:
- case X86::VCVTSS2SDrr:
- case X86::VCVTSS2SDrm:
- case X86::Int_VCVTSS2SDrr:
- case X86::Int_VCVTSS2SDrm:
- case X86::VRCPSSr:
- case X86::VRCPSSr_Int:
- case X86::VRCPSSm:
- case X86::VRCPSSm_Int:
- case X86::VROUNDSDr:
- case X86::VROUNDSDm:
- case X86::VROUNDSDr_Int:
- case X86::VROUNDSDm_Int:
- case X86::VROUNDSSr:
- case X86::VROUNDSSm:
- case X86::VROUNDSSr_Int:
- case X86::VROUNDSSm_Int:
- case X86::VRSQRTSSr:
- case X86::VRSQRTSSr_Int:
- case X86::VRSQRTSSm:
- case X86::VRSQRTSSm_Int:
- case X86::VSQRTSSr:
- case X86::VSQRTSSr_Int:
- case X86::VSQRTSSm:
- case X86::VSQRTSSm_Int:
- case X86::VSQRTSDr:
- case X86::VSQRTSDr_Int:
- case X86::VSQRTSDm:
- case X86::VSQRTSDm_Int:
- // AVX-512
- case X86::VCVTSI2SSZrr:
- case X86::VCVTSI2SSZrm:
- case X86::VCVTSI2SSZrr_Int:
- case X86::VCVTSI2SSZrrb_Int:
- case X86::VCVTSI2SSZrm_Int:
- case X86::VCVTSI642SSZrr:
- case X86::VCVTSI642SSZrm:
- case X86::VCVTSI642SSZrr_Int:
- case X86::VCVTSI642SSZrrb_Int:
- case X86::VCVTSI642SSZrm_Int:
- case X86::VCVTSI2SDZrr:
- case X86::VCVTSI2SDZrm:
- case X86::VCVTSI2SDZrr_Int:
- case X86::VCVTSI2SDZrrb_Int:
- case X86::VCVTSI2SDZrm_Int:
- case X86::VCVTSI642SDZrr:
- case X86::VCVTSI642SDZrm:
- case X86::VCVTSI642SDZrr_Int:
- case X86::VCVTSI642SDZrrb_Int:
- case X86::VCVTSI642SDZrm_Int:
- case X86::VCVTUSI2SSZrr:
- case X86::VCVTUSI2SSZrm:
- case X86::VCVTUSI2SSZrr_Int:
- case X86::VCVTUSI2SSZrrb_Int:
- case X86::VCVTUSI2SSZrm_Int:
- case X86::VCVTUSI642SSZrr:
- case X86::VCVTUSI642SSZrm:
- case X86::VCVTUSI642SSZrr_Int:
- case X86::VCVTUSI642SSZrrb_Int:
- case X86::VCVTUSI642SSZrm_Int:
- case X86::VCVTUSI2SDZrr:
- case X86::VCVTUSI2SDZrm:
- case X86::VCVTUSI2SDZrr_Int:
- case X86::VCVTUSI2SDZrm_Int:
- case X86::VCVTUSI642SDZrr:
- case X86::VCVTUSI642SDZrm:
- case X86::VCVTUSI642SDZrr_Int:
- case X86::VCVTUSI642SDZrrb_Int:
- case X86::VCVTUSI642SDZrm_Int:
- case X86::VCVTSD2SSZrr:
- case X86::VCVTSD2SSZrr_Int:
- case X86::VCVTSD2SSZrrb_Int:
- case X86::VCVTSD2SSZrm:
- case X86::VCVTSD2SSZrm_Int:
- case X86::VCVTSS2SDZrr:
- case X86::VCVTSS2SDZrr_Int:
- case X86::VCVTSS2SDZrrb_Int:
- case X86::VCVTSS2SDZrm:
- case X86::VCVTSS2SDZrm_Int:
- case X86::VRNDSCALESDr:
- case X86::VRNDSCALESDrb:
- case X86::VRNDSCALESDm:
- case X86::VRNDSCALESSr:
- case X86::VRNDSCALESSrb:
- case X86::VRNDSCALESSm:
- case X86::VRCP14SSrr:
- case X86::VRCP14SSrm:
- case X86::VRSQRT14SSrr:
- case X86::VRSQRT14SSrm:
- case X86::VSQRTSSZr:
- case X86::VSQRTSSZr_Int:
- case X86::VSQRTSSZrb_Int:
- case X86::VSQRTSSZm:
- case X86::VSQRTSSZm_Int:
- case X86::VSQRTSDZr:
- case X86::VSQRTSDZr_Int:
- case X86::VSQRTSDZrb_Int:
- case X86::VSQRTSDZm:
- case X86::VSQRTSDZm_Int:
- return true;
+ if (NarrowToMOV32rm) {
+ // This is the special case where we use a MOV32rm to load a 32-bit
+ // value and zero-extend the top bits. Change the destination register
+ // to a 32-bit one.
+ unsigned DstReg = NewMI->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
+ else
+ NewMI->getOperand(0).setSubReg(X86::sub_32bit);
+ }
+ return NewMI;
+ }
}
- return false;
-}
+ // If the instruction and target operand are commutable, commute the
+ // instruction and try again.
+ if (AllowCommute) {
+ unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
+ if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+ bool HasDef = MI.getDesc().getNumDefs();
+ unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
+ unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
+ unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
+ bool Tied1 =
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+ bool Tied2 =
+ 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
-/// Inform the ExecutionDepsFix pass how many idle instructions we would like
-/// before certain undef register reads.
-///
-/// This catches the VCVTSI2SD family of instructions:
-///
-/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
-///
-/// We should be careful *not* to catch VXOR idioms, which are presumably
-/// handled specially in the pipeline:
-///
-/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
-///
-/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
-/// high bits that are passed-through are not live.
-unsigned
-X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
- const TargetRegisterInfo *TRI) const {
- if (!hasUndefRegUpdate(MI.getOpcode()))
- return 0;
+ // If either of the commutable operands are tied to the destination
+ // then we can not commute + fold.
+ if ((HasDef && Reg0 == Reg1 && Tied1) ||
+ (HasDef && Reg0 == Reg2 && Tied2))
+ return nullptr;
- // Set the OpNum parameter to the first source operand.
- OpNum = 1;
+ MachineInstr *CommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!CommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (CommutedMI != &MI) {
+ // New instruction. We can't fold from this.
+ CommutedMI->eraseFromParent();
+ return nullptr;
+ }
- const MachineOperand &MO = MI.getOperand(OpNum);
- if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
- return UndefRegClearance;
- }
- return 0;
-}
+ // Attempt to fold with the commuted version of the instruction.
+ NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
+ Size, Align, /*AllowCommute=*/false);
+ if (NewMI)
+ return NewMI;
-void X86InstrInfo::breakPartialRegDependency(
- MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
- unsigned Reg = MI.getOperand(OpNum).getReg();
- // If MI kills this register, the false dependence is already broken.
- if (MI.killsRegister(Reg, TRI))
- return;
+ // Folding failed again - undo the commute before returning.
+ MachineInstr *UncommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!UncommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (UncommutedMI != &MI) {
+ // New instruction. It doesn't need to be kept.
+ UncommutedMI->eraseFromParent();
+ return nullptr;
+ }
- if (X86::VR128RegClass.contains(Reg)) {
- // These instructions are all floating point domain, so xorps is the best
- // choice.
- unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
- MI.addRegisterKilled(Reg, TRI, true);
- } else if (X86::VR256RegClass.contains(Reg)) {
- // Use vxorps to clear the full ymm register.
- // It wants to read and write the xmm sub-register.
- unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
- .addReg(XReg, RegState::Undef)
- .addReg(XReg, RegState::Undef)
- .addReg(Reg, RegState::ImplicitDefine);
- MI.addRegisterKilled(Reg, TRI, true);
+ // Return here to prevent duplicate fuse failure report.
+ return nullptr;
+ }
}
+
+ // No fusion
+ if (PrintFailedFusing && !MI.isCopy())
+ dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
+ return nullptr;
}
MachineInstr *