[llvm] r244202 - [X86] Improve EmitLoweredSelect for contiguous CMOV pseudo instructions.
Michael Kuperstein via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 6 01:45:34 PDT 2015
Author: mkuper
Date: Thu Aug 6 03:45:34 2015
New Revision: 244202
URL: http://llvm.org/viewvc/llvm-project?rev=244202&view=rev
Log:
[X86] Improve EmitLoweredSelect for contiguous CMOV pseudo instructions.
This change improves EmitLoweredSelect() so that multiple contiguous CMOV pseudo
instructions with the same (or exactly opposite) conditions get lowered using a single
new basic-block. This eliminates unnecessary extra basic-blocks (and CFG merge points)
when contiguous CMOVs are being lowered.
Patch by: kevin.b.smith at intel.com
Differential Revision: http://reviews.llvm.org/D11428
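As a rough illustration (mine, not part of the commit), the pattern this change targets is a
run of selects keyed off a single comparison. The hypothetical C++ snippet below mirrors the
foo1 test added in pseudo_cmov_lower.ll: both ternaries become CMOV pseudos on the same
condition, and with this change they share one inserted basic-block and one merge point
instead of each getting its own diamond.

  // Both selects test (v1 < 0), so isel produces two contiguous CMOV pseudos.
  // EmitLoweredSelect now lowers the pair with a single conditional jump and
  // two PHIs at the join block.
  int sub_of_selects(int v1, int v2, int v3) {
    int a = (v1 < 0) ? v2 : v3;
    int b = (v1 < 0) ? v1 : v2;
    return b - a;
  }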
Added:
llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower.ll
llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower1.ll
llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower2.ll
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=244202&r1=244201&r2=244202&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Aug 6 03:45:34 2015
@@ -19947,6 +19947,39 @@ static bool checkAndUpdateEFLAGSKill(Mac
return true;
}
+// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
+// together with other CMOV pseudo-opcodes into a single basic-block with
+// a conditional jump around it.
+static bool isCMOVPseudo(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
+ case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V8F64:
+ case X86::CMOV_V8I64:
+ case X86::CMOV_V8I1:
+ case X86::CMOV_V16I1:
+ case X86::CMOV_V32I1:
+ case X86::CMOV_V64I1:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
@@ -19970,8 +20003,41 @@ X86TargetLowering::EmitLoweredSelect(Mac
MachineBasicBlock *thisMBB = BB;
MachineFunction *F = BB->getParent();
- // We also lower double CMOVs:
+ // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+ // as described above, by inserting a BB, and then making a PHI at the join
+ // point to select the true and false operands of the CMOV in the PHI.
+ //
+ // The code also handles two different cases of multiple CMOV opcodes
+ // in a row.
+ //
+ // Case 1:
+ // In this case, there are multiple CMOVs in a row, all of which are based on
+ // the same condition setting (or the exact opposite condition setting).
+ // In this case we can lower all the CMOVs using a single inserted BB, and
+ // then make a number of PHIs at the join point to model the CMOVs. The only
+ // trickiness here is that in a case like:
+ //
+ // t2 = CMOV cond1 t1, f1
+ // t3 = CMOV cond1 t2, f2
+ //
+ // when rewriting this into PHIs, we have to perform some renaming on the
+ // temps since you cannot have a PHI operand refer to a PHI result earlier
+ // in the same block. The "simple" but wrong lowering would be:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t2(BB1), f2(BB2)
+ //
+ // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+ // renaming is to note that on the path through BB1, t2 is really just a
+ // copy of t1, and do that renaming, properly generating:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t1(BB1), f2(BB2)
+ //
+ // In case 2, we lower cascaded CMOVs such as
+ //
// (CMOV (CMOV F, T, cc1), T, cc2)
+ //
// to two successive branches. For that, we look for another CMOV as the
// following instruction.
//
@@ -20037,19 +20103,42 @@ X86TargetLowering::EmitLoweredSelect(Mac
// .LBB5_4:
// retq
//
- MachineInstr *NextCMOV = nullptr;
+ MachineInstr *CascadedCMOV = nullptr;
+ MachineInstr *LastCMOV = MI;
+ X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineBasicBlock::iterator NextMIIt =
std::next(MachineBasicBlock::iterator(MI));
- if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
+
+ // Check for case 1, where there are multiple CMOVs with the same condition
+ // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
+ // number of jumps the most.
+
+ if (isCMOVPseudo(MI)) {
+ // See if we have a string of CMOVs with the same condition.
+ while (NextMIIt != BB->end() &&
+ isCMOVPseudo(NextMIIt) &&
+ (NextMIIt->getOperand(3).getImm() == CC ||
+ NextMIIt->getOperand(3).getImm() == OppCC)) {
+ LastCMOV = &*NextMIIt;
+ ++NextMIIt;
+ }
+ }
+
+ // This checks for case 2, but only if we didn't already find case 1,
+ // as indicated by LastCMOV == MI.
+ if (LastCMOV == MI &&
+ NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
- NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
- NextCMOV = &*NextMIIt;
+ NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) {
+ CascadedCMOV = &*NextMIIt;
+ }
MachineBasicBlock *jcc1MBB = nullptr;
- // If we have a double CMOV, we lower it to two successive branches to
+ // If we have a cascaded CMOV, we lower it to two successive branches to
// the same block. EFLAGS is used by both, so mark it as live in the second.
- if (NextCMOV) {
+ if (CascadedCMOV) {
jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, jcc1MBB);
jcc1MBB->addLiveIn(X86::EFLAGS);
@@ -20064,7 +20153,7 @@ X86TargetLowering::EmitLoweredSelect(Mac
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
- MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
+ MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
@@ -20073,12 +20162,12 @@ X86TargetLowering::EmitLoweredSelect(Mac
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add the true and fallthrough blocks as its successors.
- if (NextCMOV) {
- // The fallthrough block may be jcc1MBB, if we have a double CMOV.
+ if (CascadedCMOV) {
+ // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
BB->addSuccessor(jcc1MBB);
// In that case, jcc1MBB will itself fallthrough the copy0MBB, and
@@ -20093,13 +20182,12 @@ X86TargetLowering::EmitLoweredSelect(Mac
BB->addSuccessor(sinkMBB);
// Create the conditional branch instruction.
- unsigned Opc =
- X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+ unsigned Opc = X86::GetCondBranchFromCond(CC);
BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
- if (NextCMOV) {
+ if (CascadedCMOV) {
unsigned Opc2 = X86::GetCondBranchFromCond(
- (X86::CondCode)NextCMOV->getOperand(3).getImm());
+ (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
}
@@ -20111,24 +20199,62 @@ X86TargetLowering::EmitLoweredSelect(Mac
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
- MachineInstrBuilder MIB =
- BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
- MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+ std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+ MachineInstrBuilder MIB;
- // If we have a double CMOV, the second Jcc provides the same incoming
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+ // That also means that PHI construction must work forward from earlier to
+ // later, and that the code must maintain a mapping from each earlier PHI's
+ // destination register to the registers that went into that PHI.
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ unsigned DestReg = MIIt->getOperand(0).getReg();
+ unsigned Op1Reg = MIIt->getOperand(1).getReg();
+ unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+ // If the CMOV we are lowering uses the opposite condition from the
+ // jump we generated, then we have to swap the operands for the PHI
+ // that is going to be generated.
+ if (MIIt->getOperand(3).getImm() == OppCC)
+ std::swap(Op1Reg, Op2Reg);
+
+ if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+ Op1Reg = RegRewriteTable[Op1Reg].first;
+
+ if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+ Op2Reg = RegRewriteTable[Op2Reg].second;
+
+ MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
+ TII->get(X86::PHI), DestReg)
+ .addReg(Op1Reg).addMBB(copy0MBB)
+ .addReg(Op2Reg).addMBB(thisMBB);
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ // If we have a cascaded CMOV, the second Jcc provides the same incoming
// value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
- if (NextCMOV) {
+ if (CascadedCMOV) {
MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
- DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
+ DL, TII->get(TargetOpcode::COPY),
+ CascadedCMOV->getOperand(0).getReg())
.addReg(MI->getOperand(0).getReg());
- NextCMOV->eraseFromParent();
+ CascadedCMOV->eraseFromParent();
}
- MI->eraseFromParent(); // The pseudo instruction is gone now.
+ // Now remove the CMOV(s).
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
+ (MIIt++)->eraseFromParent();
+
return sinkMBB;
}
@@ -20703,23 +20829,23 @@ X86TargetLowering::EmitInstrWithCustomIn
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
- case X86::CMOV_GR8:
case X86::CMOV_FR32:
case X86::CMOV_FR64:
- case X86::CMOV_V4F32:
+ case X86::CMOV_GR8:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
case X86::CMOV_V2F64:
case X86::CMOV_V2I64:
- case X86::CMOV_V8F32:
+ case X86::CMOV_V4F32:
case X86::CMOV_V4F64:
case X86::CMOV_V4I64:
case X86::CMOV_V16F32:
+ case X86::CMOV_V8F32:
case X86::CMOV_V8F64:
case X86::CMOV_V8I64:
- case X86::CMOV_GR16:
- case X86::CMOV_GR32:
- case X86::CMOV_RFP32:
- case X86::CMOV_RFP64:
- case X86::CMOV_RFP80:
case X86::CMOV_V8I1:
case X86::CMOV_V16I1:
case X86::CMOV_V32I1:
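To make the case 1 renaming rule easier to follow outside the diff, here is a small
self-contained C++ sketch of the rewrite-table idea. The types and names (Reg, CMOVPseudo,
buildJoinPHIs) are stand-ins of my own, not LLVM API; this is an illustration of the
technique under those assumptions, not the implementation above.

  #include <map>
  #include <utility>
  #include <vector>

  using Reg = unsigned;                                  // stand-in for a virtual register
  struct CMOVPseudo { Reg Dst, Op1, Op2; bool OppositeCC; };
  struct JoinPHI    { Reg Dst, FromCopy0MBB, FromThisMBB; };

  // Walk a contiguous group of CMOVs in program order. A CMOV on the opposite
  // condition gets its operands swapped, and an operand that names the
  // destination of an earlier CMOV in the group is rewritten to the value the
  // earlier PHI carries on the same incoming edge.
  std::vector<JoinPHI> buildJoinPHIs(const std::vector<CMOVPseudo> &Group) {
    std::map<Reg, std::pair<Reg, Reg>> RegRewriteTable;  // Dst -> (copy0MBB value, thisMBB value)
    std::vector<JoinPHI> PHIs;
    for (const CMOVPseudo &C : Group) {
      Reg Op1 = C.Op1, Op2 = C.Op2;
      if (C.OppositeCC)
        std::swap(Op1, Op2);
      auto I1 = RegRewriteTable.find(Op1);
      if (I1 != RegRewriteTable.end())
        Op1 = I1->second.first;                          // value on the copy0MBB edge
      auto I2 = RegRewriteTable.find(Op2);
      if (I2 != RegRewriteTable.end())
        Op2 = I2->second.second;                         // value on the thisMBB edge
      PHIs.push_back({C.Dst, Op1, Op2});
      RegRewriteTable[C.Dst] = {Op1, Op2};
    }
    return PHIs;
  }

Fed the two-CMOV example from the comment (t2 = CMOV t1, f1 followed by t3 = CMOV t2, f2 on
the same condition), this produces t2 = PHI(t1, f1) and t3 = PHI(t1, f2): the use of t2 in
the second PHI is rewritten to t1, which is exactly the renaming described above.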
Added: llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower.ll?rev=244202&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower.ll (added)
+++ llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower.ll Thu Aug 6 03:45:34 2015
@@ -0,0 +1,267 @@
+; RUN: llc < %s -mtriple=i386-linux-gnu -o - | FileCheck %s
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo1:
+; CHECK: js
+; CHECK-NOT: js
+define i32 @foo1(i32 %v1, i32 %v2, i32 %v3) nounwind {
+entry:
+ %cmp = icmp slt i32 %v1, 0
+ %v2.v3 = select i1 %cmp, i32 %v2, i32 %v3
+ %v1.v2 = select i1 %cmp, i32 %v1, i32 %v2
+ %sub = sub i32 %v1.v2, %v2.v3
+ ret i32 %sub
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR. This makes
+; sure the lowering code for opposite conditions gets tested.
+; CHECK-LABEL: foo11:
+; CHECK: js
+; CHECK-NOT: js
+; CHECK-NOT: jns
+define i32 @foo11(i32 %v1, i32 %v2, i32 %v3) nounwind {
+entry:
+ %cmp1 = icmp slt i32 %v1, 0
+ %v2.v3 = select i1 %cmp1, i32 %v2, i32 %v3
+ %cmp2 = icmp sge i32 %v1, 0
+ %v1.v2 = select i1 %cmp2, i32 %v1, i32 %v2
+ %sub = sub i32 %v1.v2, %v2.v3
+ ret i32 %sub
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo2:
+; CHECK: js
+; CHECK-NOT: js
+define i32 @foo2(i8 %v1, i8 %v2, i8 %v3) nounwind {
+entry:
+ %cmp = icmp slt i8 %v1, 0
+ %v2.v3 = select i1 %cmp, i8 %v2, i8 %v3
+ %v1.v2 = select i1 %cmp, i8 %v1, i8 %v2
+ %t1 = sext i8 %v2.v3 to i32
+ %t2 = sext i8 %v1.v2 to i32
+ %sub = sub i32 %t1, %t2
+ ret i32 %sub
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo3:
+; CHECK: js
+; CHECK-NOT: js
+define i32 @foo3(i16 %v1, i16 %v2, i16 %v3) nounwind {
+entry:
+ %cmp = icmp slt i16 %v1, 0
+ %v2.v3 = select i1 %cmp, i16 %v2, i16 %v3
+ %v1.v2 = select i1 %cmp, i16 %v1, i16 %v2
+ %t1 = sext i16 %v2.v3 to i32
+ %t2 = sext i16 %v1.v2 to i32
+ %sub = sub i32 %t1, %t2
+ ret i32 %sub
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo4:
+; CHECK: js
+; CHECK-NOT: js
+define float @foo4(i32 %v1, float %v2, float %v3, float %v4) nounwind {
+entry:
+ %cmp = icmp slt i32 %v1, 0
+ %t1 = select i1 %cmp, float %v2, float %v3
+ %t2 = select i1 %cmp, float %v3, float %v4
+ %sub = fsub float %t1, %t2
+ ret float %sub
+}
+
+; This test checks that only a single je gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo5:
+; CHECK: je
+; CHECK-NOT: je
+define double @foo5(i32 %v1, double %v2, double %v3, double %v4) nounwind {
+entry:
+ %cmp = icmp eq i32 %v1, 0
+ %t1 = select i1 %cmp, double %v2, double %v3
+ %t2 = select i1 %cmp, double %v3, double %v4
+ %sub = fsub double %t1, %t2
+ ret double %sub
+}
+
+; This test checks that only a single je gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo6:
+; CHECK: je
+; CHECK-NOT: je
+define <4 x float> @foo6(i32 %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> %v4) nounwind {
+entry:
+ %cmp = icmp eq i32 %v1, 0
+ %t1 = select i1 %cmp, <4 x float> %v2, <4 x float> %v3
+ %t2 = select i1 %cmp, <4 x float> %v3, <4 x float> %v4
+ %sub = fsub <4 x float> %t1, %t2
+ ret <4 x float> %sub
+}
+
+; This test checks that only a single je gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo7:
+; CHECK: je
+; CHECK-NOT: je
+define <2 x double> @foo7(i32 %v1, <2 x double> %v2, <2 x double> %v3, <2 x double> %v4) nounwind {
+entry:
+ %cmp = icmp eq i32 %v1, 0
+ %t1 = select i1 %cmp, <2 x double> %v2, <2 x double> %v3
+ %t2 = select i1 %cmp, <2 x double> %v3, <2 x double> %v4
+ %sub = fsub <2 x double> %t1, %t2
+ ret <2 x double> %sub
+}
+
+; This test checks that only a single ja gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR. This combines
+; all the supported types together into one long string of selects based
+; on the same condition.
+; CHECK-LABEL: foo8:
+; CHECK: ja
+; CHECK-NOT: ja
+define void @foo8(i32 %v1,
+ i8 %v2, i8 %v3,
+ i16 %v12, i16 %v13,
+ i32 %v22, i32 %v23,
+ float %v32, float %v33,
+ double %v42, double %v43,
+ <4 x float> %v52, <4 x float> %v53,
+ <2 x double> %v62, <2 x double> %v63,
+ <8 x float> %v72, <8 x float> %v73,
+ <4 x double> %v82, <4 x double> %v83,
+ <16 x float> %v92, <16 x float> %v93,
+ <8 x double> %v102, <8 x double> %v103,
+ i8 * %dst) nounwind {
+entry:
+ %add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 2
+ %a11 = bitcast i8* %add.ptr11 to i16*
+
+ %add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4
+ %a21 = bitcast i8* %add.ptr21 to i32*
+
+ %add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8
+ %a31 = bitcast i8* %add.ptr31 to float*
+
+ %add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16
+ %a41 = bitcast i8* %add.ptr41 to double*
+
+ %add.ptr51 = getelementptr inbounds i8, i8* %dst, i32 32
+ %a51 = bitcast i8* %add.ptr51 to <4 x float>*
+
+ %add.ptr61 = getelementptr inbounds i8, i8* %dst, i32 48
+ %a61 = bitcast i8* %add.ptr61 to <2 x double>*
+
+ %add.ptr71 = getelementptr inbounds i8, i8* %dst, i32 64
+ %a71 = bitcast i8* %add.ptr71 to <8 x float>*
+
+ %add.ptr81 = getelementptr inbounds i8, i8* %dst, i32 128
+ %a81 = bitcast i8* %add.ptr81 to <4 x double>*
+
+ %add.ptr91 = getelementptr inbounds i8, i8* %dst, i32 64
+ %a91 = bitcast i8* %add.ptr91 to <16 x float>*
+
+ %add.ptr101 = getelementptr inbounds i8, i8* %dst, i32 128
+ %a101 = bitcast i8* %add.ptr101 to <8 x double>*
+
+ ; These operations are necessary, because select of two single use loads
+ ; ends up getting optimized into a select of two leas, followed by a
+ ; single load of the selected address.
+ %t13 = xor i16 %v13, 11
+ %t23 = xor i32 %v23, 1234
+ %t33 = fadd float %v33, %v32
+ %t43 = fadd double %v43, %v42
+ %t53 = fadd <4 x float> %v53, %v52
+ %t63 = fadd <2 x double> %v63, %v62
+ %t73 = fsub <8 x float> %v73, %v72
+ %t83 = fsub <4 x double> %v83, %v82
+ %t93 = fsub <16 x float> %v93, %v92
+ %t103 = fsub <8 x double> %v103, %v102
+
+ %cmp = icmp ugt i32 %v1, 31
+ %t11 = select i1 %cmp, i16 %v12, i16 %t13
+ %t21 = select i1 %cmp, i32 %v22, i32 %t23
+ %t31 = select i1 %cmp, float %v32, float %t33
+ %t41 = select i1 %cmp, double %v42, double %t43
+ %t51 = select i1 %cmp, <4 x float> %v52, <4 x float> %t53
+ %t61 = select i1 %cmp, <2 x double> %v62, <2 x double> %t63
+ %t71 = select i1 %cmp, <8 x float> %v72, <8 x float> %t73
+ %t81 = select i1 %cmp, <4 x double> %v82, <4 x double> %t83
+ %t91 = select i1 %cmp, <16 x float> %v92, <16 x float> %t93
+ %t101 = select i1 %cmp, <8 x double> %v102, <8 x double> %t103
+
+ store i16 %t11, i16* %a11, align 2
+ store i32 %t21, i32* %a21, align 4
+ store float %t31, float* %a31, align 4
+ store double %t41, double* %a41, align 8
+ store <4 x float> %t51, <4 x float>* %a51, align 16
+ store <2 x double> %t61, <2 x double>* %a61, align 16
+ store <8 x float> %t71, <8 x float>* %a71, align 32
+ store <4 x double> %t81, <4 x double>* %a81, align 32
+ store <16 x float> %t91, <16 x float>* %a91, align 32
+ store <8 x double> %t101, <8 x double>* %a101, align 32
+
+ ret void
+}
+
+; This test checks that only a single ja gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; Contrary to my expectations, this doesn't exercise the code for
+; CMOV_V8I1, CMOV_V16I1, CMOV_V32I1, or CMOV_V64I1. Instead the selects all
+; get lowered into vector length number of selects, which all eventually turn
+; into a huge number of CMOV_GR8, which are all contiguous, so the optimization
+; kicks in as long as CMOV_GR8 is supported. I couldn't find a way to get
+; CMOV_V*I1 pseudo-opcodes to get generated. If a way exists to get CMOV_V*1
+; pseudo-opcodes to be generated, this test should be replaced with one that
+; tests those opcodes.
+;
+; CHECK-LABEL: foo9:
+; CHECK: ja
+; CHECK-NOT: ja
+define void @foo9(i32 %v1,
+ <8 x i1> %v12, <8 x i1> %v13,
+ <16 x i1> %v22, <16 x i1> %v23,
+ <32 x i1> %v32, <32 x i1> %v33,
+ <64 x i1> %v42, <64 x i1> %v43,
+ i8 * %dst) nounwind {
+entry:
+ %add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 0
+ %a11 = bitcast i8* %add.ptr11 to <8 x i1>*
+
+ %add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4
+ %a21 = bitcast i8* %add.ptr21 to <16 x i1>*
+
+ %add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8
+ %a31 = bitcast i8* %add.ptr31 to <32 x i1>*
+
+ %add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16
+ %a41 = bitcast i8* %add.ptr41 to <64 x i1>*
+
+ ; These operations are necessary, because select of two single use loads
+ ; ends up getting optimized into a select of two leas, followed by a
+ ; single load of the selected address.
+ %t13 = xor <8 x i1> %v13, %v12
+ %t23 = xor <16 x i1> %v23, %v22
+ %t33 = xor <32 x i1> %v33, %v32
+ %t43 = xor <64 x i1> %v43, %v42
+
+ %cmp = icmp ugt i32 %v1, 31
+ %t11 = select i1 %cmp, <8 x i1> %v12, <8 x i1> %t13
+ %t21 = select i1 %cmp, <16 x i1> %v22, <16 x i1> %t23
+ %t31 = select i1 %cmp, <32 x i1> %v32, <32 x i1> %t33
+ %t41 = select i1 %cmp, <64 x i1> %v42, <64 x i1> %t43
+
+ store <8 x i1> %t11, <8 x i1>* %a11, align 16
+ store <16 x i1> %t21, <16 x i1>* %a21, align 4
+ store <32 x i1> %t31, <32 x i1>* %a31, align 8
+ store <64 x i1> %t41, <64 x i1>* %a41, align 16
+
+ ret void
+}
Added: llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower1.ll?rev=244202&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower1.ll (added)
+++ llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower1.ll Thu Aug 6 03:45:34 2015
@@ -0,0 +1,39 @@
+; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2 -o - | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s
+
+; This test checks that only a single jae gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo1:
+; CHECK: jae
+; CHECK-NOT: jae
+define double @foo1(float %p1, double %p2, double %p3) nounwind {
+entry:
+ %c1 = fcmp oge float %p1, 0.000000e+00
+ %d0 = fadd double %p2, 1.25e0
+ %d1 = fadd double %p3, 1.25e0
+ %d2 = select i1 %c1, double %d0, double %d1
+ %d3 = select i1 %c1, double %d0, double %p2
+ %d4 = select i1 %c1, double %p3, double %d1
+ %d5 = fsub double %d2, %d3
+ %d6 = fadd double %d5, %d4
+ ret double %d6
+}
+
+; This test checks that only a single jae gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR.
+; CHECK-LABEL: foo2:
+; CHECK: jae
+; CHECK-NOT: jae
+define float @foo2(float %p1, float %p2, float %p3) nounwind {
+entry:
+ %c1 = fcmp oge float %p1, 0.000000e+00
+ %d0 = fadd float %p2, 1.25e0
+ %d1 = fadd float %p3, 1.25e0
+ %d2 = select i1 %c1, float %d0, float %d1
+ %d3 = select i1 %c1, float %d1, float %p2
+ %d4 = select i1 %c1, float %d0, float %p3
+ %d5 = fsub float %d2, %d3
+ %d6 = fadd float %d5, %d4
+ ret float %d6
+}
+
Added: llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower2.ll?rev=244202&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower2.ll (added)
+++ llvm/trunk/test/CodeGen/X86/pseudo_cmov_lower2.ll Thu Aug 6 03:45:34 2015
@@ -0,0 +1,100 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s
+
+; This test checks that only a single jae gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR. The tricky part
+; of this test is that it tests the special PHI operand rewriting code in
+; X86TargetLowering::EmitLoweredSelect.
+;
+; CHECK-LABEL: foo1:
+; CHECK: jae
+; CHECK-NOT: jae
+define double @foo1(float %p1, double %p2, double %p3) nounwind {
+entry:
+ %c1 = fcmp oge float %p1, 0.000000e+00
+ %d0 = fadd double %p2, 1.25e0
+ %d1 = fadd double %p3, 1.25e0
+ %d2 = select i1 %c1, double %d0, double %d1
+ %d3 = select i1 %c1, double %d2, double %p2
+ %d4 = select i1 %c1, double %d3, double %p3
+ %d5 = fsub double %d2, %d3
+ %d6 = fadd double %d5, %d4
+ ret double %d6
+}
+
+; This test checks that only a single jae gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR. The tricky part
+; of this test is that it tests the special PHI operand rewriting code in
+; X86TargetLowering::EmitLoweredSelect.
+;
+; CHECK-LABEL: foo2:
+; CHECK: jae
+; CHECK-NOT: jae
+define double @foo2(float %p1, double %p2, double %p3) nounwind {
+entry:
+ %c1 = fcmp oge float %p1, 0.000000e+00
+ %d0 = fadd double %p2, 1.25e0
+ %d1 = fadd double %p3, 1.25e0
+ %d2 = select i1 %c1, double %d0, double %d1
+ %d3 = select i1 %c1, double %p2, double %d2
+ %d4 = select i1 %c1, double %p3, double %d3
+ %d5 = fsub double %d2, %d3
+ %d6 = fadd double %d5, %d4
+ ret double %d6
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR. The tricky part
+; of this test is that it tests the special PHI operand rewriting code in
+; X86TargetLowering::EmitLoweredSelect. It also tests to make sure all
+; the operands of the resulting instructions are from the proper places.
+;
+; CHECK-LABEL: foo3:
+; CHECK: js
+; CHECK-NOT: js
+; CHECK-LABEL: # BB#1:
+; CHECK-DAG: movapd %xmm2, %xmm1
+; CHECK-DAG: movapd %xmm2, %xmm0
+; CHECK-LABEL:.LBB2_2:
+; CHECK: divsd %xmm1, %xmm0
+; CHECK: ret
+define double @foo3(i32 %p1, double %p2, double %p3,
+ double %p4, double %p5) nounwind {
+entry:
+ %c1 = icmp slt i32 %p1, 0
+ %d2 = select i1 %c1, double %p2, double %p3
+ %d3 = select i1 %c1, double %p3, double %p4
+ %d4 = select i1 %c1, double %d2, double %d3
+ %d5 = fdiv double %d4, %d3
+ ret double %d5
+}
+
+; This test checks that only a single js gets generated in the final code
+; for lowering the CMOV pseudos that get created for this IR. The tricky part
+; of this test is that it tests the special PHI operand rewriting code in
+; X86TargetLowering::EmitLoweredSelect. It also tests to make sure all
+; the operands of the resulting instructions are from the proper places
+; when the "opposite condition" handling code in the compiler is used.
+; This should be the same code as foo3 above, because we use the opposite
+; condition code in the last two selects, but we also swap the operands
+; of the selects to give the same actual computation.
+;
+; CHECK-LABEL: foo4:
+; CHECK: js
+; CHECK-NOT: js
+; CHECK-LABEL: # BB#1:
+; CHECK-DAG: movapd %xmm2, %xmm1
+; CHECK-DAG: movapd %xmm2, %xmm0
+; CHECK-LABEL:.LBB3_2:
+; CHECK: divsd %xmm1, %xmm0
+; CHECK: ret
+define double @foo4(i32 %p1, double %p2, double %p3,
+ double %p4, double %p5) nounwind {
+entry:
+ %c1 = icmp slt i32 %p1, 0
+ %d2 = select i1 %c1, double %p2, double %p3
+ %c2 = icmp sge i32 %p1, 0
+ %d3 = select i1 %c2, double %p4, double %p3
+ %d4 = select i1 %c2, double %d3, double %d2
+ %d5 = fdiv double %d4, %d3
+ ret double %d5
+}