[llvm] r288053 - [AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 28 10:58:49 PST 2016
Author: rampitec
Date: Mon Nov 28 12:58:49 2016
New Revision: 288053
URL: http://llvm.org/viewvc/llvm-project?rev=288053&view=rev
Log:
[AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies
Codegen prepare sinks comparisons close to a user if we have only one register
for conditions. For AMDGPU we have many SGPRs capable of holding vector conditions.
Changed the backend to report that we have many condition registers. That way the
IR LICM pass will hoist an invariant comparison out of a loop and codegen prepare
will not sink it.
With that done, a condition is calculated in one block and used in another.
The current behavior is to store the workitem's condition in a VGPR using v_cndmask_b32
and then restore it with yet another v_cmp instruction from that v_cndmask's
result. To mitigate the issue, propagation of the source SGPR pair in place of v_cmp
is implemented. An additional side effect of this is that we may consume fewer VGPRs
at the cost of more SGPRs when multiple conditions need to be held, and
that is a clear win in most cases.
Differential Revision: https://reviews.llvm.org/D26114
Added:
llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp
llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=288053&r1=288052&r2=288053&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Mon Nov 28 12:58:49 2016
@@ -440,6 +440,7 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
+ setHasMultipleConditionRegisters(true);
// SI at least has hardware support for floating point exceptions, but no way
// of using or handling them is implemented. They are also optional in OpenCL
Modified: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp?rev=288053&r1=288052&r2=288053&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp Mon Nov 28 12:58:49 2016
@@ -80,6 +80,11 @@ private:
void emitLoop(MachineInstr &MI);
void emitEndCf(MachineInstr &MI);
+ void findMaskOperands(MachineInstr &MI, unsigned OpNo,
+ SmallVectorImpl<MachineOperand> &Src) const;
+
+ void combineMasks(MachineInstr &MI);
+
public:
static char ID;
@@ -336,6 +341,62 @@ void SILowerControlFlow::emitEndCf(Machi
LIS->handleMove(*NewMI);
}
+// Appends to Src either operand OpNo of MI itself (if not a virtual register)
+// or the eligible use operands of its same-block def (full copy or same opcode).
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+ SmallVectorImpl<MachineOperand> &Src) const {
+ MachineOperand &Op = MI.getOperand(OpNo);
+ if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+ Src.push_back(Op);
+ return;
+ }
+
+ MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+ if (!Def || Def->getParent() != MI.getParent() ||
+ !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+ return;
+
+ // Make sure exec is not modified between the def and the use; bail if it is.
+ // A copy that only implicitly defines exec (its destination is not exec) is
+ // the exception: it does not really modify exec.
+ for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
+ !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+ return;
+
+ for (const auto &SrcOp : Def->explicit_operands())
+ if (SrcOp.isUse() && (!SrcOp.isReg() ||
+ TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+ SrcOp.getReg() == AMDGPU::EXEC))
+ Src.push_back(SrcOp);
+}
+
+// Searches for and combines pairs of equivalent nested instructions, such as
+// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
+// S_OR_B64 x, (S_OR_B64 x, y) => S_OR_B64 x, y
+// One of the operands is the exec mask.
+void SILowerControlFlow::combineMasks(MachineInstr &MI) {
+ assert(MI.getNumExplicitOperands() == 3);
+ SmallVector<MachineOperand, 4> Ops;
+ unsigned OpToReplace = 1;
+ findMaskOperands(MI, 1, Ops);
+ if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
+ findMaskOperands(MI, 2, Ops);
+ if (Ops.size() != 3) return;
+
+ unsigned UniqueOpndIdx;
+ if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
+ else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+ else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+ else return;
+
+ unsigned Reg = MI.getOperand(OpToReplace).getReg();
+ MI.RemoveOperand(OpToReplace);
+ MI.addOperand(Ops[UniqueOpndIdx]);
+ if (MRI->use_empty(Reg))
+ MRI->getUniqueVRegDef(Reg)->eraseFromParent();
+}
+
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
TII = ST.getInstrInfo();
@@ -351,9 +412,9 @@ bool SILowerControlFlow::runOnMachineFun
NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
- MachineBasicBlock::iterator I, Next;
+ MachineBasicBlock::iterator I, Next, Last;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
@@ -386,9 +447,20 @@ bool SILowerControlFlow::runOnMachineFun
emitEndCf(MI);
break;
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64:
+ // Cleanup bit manipulations on exec mask
+ combineMasks(MI);
+ Last = I;
+ continue;
+
default:
- break;
+ Last = I;
+ continue;
}
+
+ // Replay newly inserted code to combine masks
+ Next = (Last == MBB.end()) ? MBB.begin() : Last;
}
}
Modified: llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp?rev=288053&r1=288052&r2=288053&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp Mon Nov 28 12:58:49 2016
@@ -100,12 +100,12 @@ bool SILowerI1Copies::runOnMachineFuncti
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
+ DebugLoc DL = MI.getDebugLoc();
+ MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
if (DstRC == &AMDGPU::VReg_1RegClass &&
TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
I1Defs.push_back(Dst.getReg());
- DebugLoc DL = MI.getDebugLoc();
- MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
if (DefInst->getOperand(1).isImm()) {
I1Defs.push_back(Dst.getReg());
@@ -129,10 +129,26 @@ bool SILowerI1Copies::runOnMachineFuncti
MI.eraseFromParent();
} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
SrcRC == &AMDGPU::VReg_1RegClass) {
- BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_U32_e64))
- .addOperand(Dst)
- .addOperand(Src)
- .addImm(0);
+ if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
+ DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
+ DefInst->getOperand(1).getImm() == 0 &&
+ DefInst->getOperand(2).getImm() != 0 &&
+ DefInst->getOperand(3).isReg() &&
+ TargetRegisterInfo::isVirtualRegister(
+ DefInst->getOperand(3).getReg()) &&
+ TRI->getCommonSubClass(
+ MRI.getRegClass(DefInst->getOperand(3).getReg()),
+ &AMDGPU::SGPR_64RegClass)) {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
+ .addOperand(Dst)
+ .addReg(AMDGPU::EXEC)
+ .addOperand(DefInst->getOperand(3));
+ } else {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
+ .addOperand(Dst)
+ .addOperand(Src)
+ .addImm(0);
+ }
MI.eraseFromParent();
}
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll?rev=288053&r1=288052&r2=288053&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll Mon Nov 28 12:58:49 2016
@@ -493,9 +493,9 @@ ret:
; GCN: s_setpc_b64
; GCN: [[LONG_BR_DEST0]]
-; GCN: s_cmp_eq_u32
+; GCN: v_cmp_ne_u32_e32
; GCN-NEXT: ; implicit-def
-; GCN-NEXT: s_cbranch_scc0
+; GCN-NEXT: s_cbranch_vccz
; GCN: s_setpc_b64
; GCN: s_endpgm
Added: llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll?rev=288053&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/hoist-cond.ll Mon Nov 28 12:58:49 2016
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; Check that invariant compare is hoisted out of the loop.
+; At the same time condition shall not be serialized into a VGPR and deserialized later
+; using another v_cmp + v_cndmask, but used directly in s_and_saveexec_b64.
+
+; CHECK: v_cmp_{{..}}_u32_e64 [[COND:s\[[0-9]+:[0-9]+\]]]
+; CHECK: BB0_1:
+; CHECK-NOT: v_cmp
+; CHECK-NOT: v_cndmask
+; CHECK: s_and_saveexec_b64 s[{{[[0-9]+:[0-9]+}}], [[COND]]
+; CHECK: BB0_2:
+
+define amdgpu_kernel void @hoist_cond(float addrspace(1)* nocapture %arg, float addrspace(1)* noalias nocapture readonly %arg1, i32 %arg3, i32 %arg4) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+ %tmp5 = icmp ult i32 %tmp, %arg3
+ br label %bb1
+
+bb1: ; preds = %bb3, %bb
+ %tmp7 = phi i32 [ %arg4, %bb ], [ %tmp16, %bb3 ]
+ %tmp8 = phi float [ 0.000000e+00, %bb ], [ %tmp15, %bb3 ]
+ br i1 %tmp5, label %bb2, label %bb3
+
+bb2: ; preds = %bb1
+ %tmp10 = zext i32 %tmp7 to i64
+ %tmp11 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %tmp10
+ %tmp12 = load float, float addrspace(1)* %tmp11, align 4
+ br label %bb3
+
+bb3: ; preds = %bb2, %bb1
+ %tmp14 = phi float [ %tmp12, %bb2 ], [ 0.000000e+00, %bb1 ]
+ %tmp15 = fadd float %tmp8, %tmp14
+ %tmp16 = add i32 %tmp7, -1
+ %tmp17 = icmp eq i32 %tmp16, 0
+ br i1 %tmp17, label %bb4, label %bb1
+
+bb4: ; preds = %bb3
+ store float %tmp15, float addrspace(1)* %arg, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
More information about the llvm-commits
mailing list