[PATCH] R600/SI: Remove the M0Reg register class
Christian König
deathsimple at vodafone.de
Thu Jun 12 08:41:14 PDT 2014
Am 12.06.2014 17:08, schrieb Tom Stellard:
> On Thu, Jun 12, 2014 at 08:27:47AM +0200, Christian König wrote:
>> Am 11.06.2014 22:56, schrieb Tom Stellard:
>>> This class only contined one register, M0, and was used to ensure
>>> some operands always read from the M0 register. With the
>>> MachineScheduler enabled we sometimes run into a situation where we
>>> have to spill a M0Reg, which is not ideal, so instead of using this
>>> special class, we use a custom inserter to force operands to M0.
>> Mhm, I'm wondering why this should be better than spilling M0?
>>
>> It sounds to me that we now always insert a move from a SGPR to M0
>> before the instruction which uses M0, while in the other case we only
>> insert moves to spill M0 in some unusual cases.
>>
>
> The problem is code like this:
>
> S_MOV_B32 %vreg0, 0 ; %vreg0 = M0RegClass
> %vreg0 use
> %vreg0 use
>
> S_MOV_B32 %vreg1, 1 ; %vreg1 = M0RegClass
> %vreg1 use
> %vreg1 use
>
> The scheduler will reorder the sequence like this, which requires a
> spill:
>
> S_MOV_B32 %vreg0, 0 ; %vreg0 = M0RegClass
> S_MOV_B32 %vreg1, 1 ; %vreg1 = M0RegClass
> %vreg0 use
> %vreg0 use
> %vreg1 use
> %vreg1 use
>
> In this case the spill is completely unnecessary, which is what I'm
> trying to avoid.
But that looks more like the scheduler doesn't take the register
pressure into account and should be fixed there.
Essentially we had it before to implicitly use M0 with a custom inserter
and that resulted in something like this with every interpolation operation:
S_MOV_B32 M0, a
V_INTER...
S_MOV_B32 M0, a
V_INTER...
S_MOV_B32 M0, a
V_INTER...
S_MOV_B32 M0, a
V_INTER...
S_MOV_B32 M0, a
...
E.g. creating a lot of unnecessary moves to M0.
Christian.
>
> -Tom
>
>> Thanks,
>> Christian.
>>
>>> ---
>>> lib/Target/R600/SIFixSGPRCopies.cpp | 1 -
>>> lib/Target/R600/SIISelLowering.cpp | 17 +++++++++++++++++
>>> lib/Target/R600/SIInstructions.td | 34 +++++++++++++++++++++++-----------
>>> lib/Target/R600/SIRegisterInfo.td | 5 ++---
>>> 4 files changed, 42 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
>>> index 5f71453..782f651 100644
>>> --- a/lib/Target/R600/SIFixSGPRCopies.cpp
>>> +++ b/lib/Target/R600/SIFixSGPRCopies.cpp
>>> @@ -185,7 +185,6 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
>>> const TargetRegisterClass *SrcRC;
>>>
>>> if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
>>> - DstRC == &AMDGPU::M0RegRegClass ||
>>> MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass)
>>> return false;
>>>
>>> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
>>> index 608aad2..01ce64e 100644
>>> --- a/lib/Target/R600/SIISelLowering.cpp
>>> +++ b/lib/Target/R600/SIISelLowering.cpp
>>> @@ -545,6 +545,23 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
>>> .addImm(1) // CLAMP
>>> .addImm(0); // OMOD
>>> MI->eraseFromParent();
>>> + break;
>>> + }
>>> + case AMDGPU::SI_MOV_M0: {
>>> + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
>>> + .addOperand(MI->getOperand(1));
>>> +
>>> + unsigned DstReg = MI->getOperand(0).getReg();
>>> + for (MachineOperand &MO : MRI.use_operands(DstReg)) {
>>> + // Clear the kill flag from each use. Since we are potentailly inserting
>>> + // M0 uses into more than one instruction, then we need to remove the
>>> + // kill flag otherwise all uses after the first will be considered dead,
>>> + // and the machine verifier will complain.
>>> + MO.setIsKill(false);
>>> + }
>>> + MRI.replaceRegWith(MI->getOperand(0).getReg(), AMDGPU::M0);
>>> + MI->eraseFromParent();
>>> + break;
>>> }
>>> }
>>> return BB;
>>> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
>>> index 475f28e..8080d5b 100644
>>> --- a/lib/Target/R600/SIInstructions.td
>>> +++ b/lib/Target/R600/SIInstructions.td
>>> @@ -434,11 +434,12 @@ def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
>>> //def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
>>>
>>> let Uses = [EXEC] in {
>>> - def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16",
>>> - [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)]
>>> +
>>> +def S_SENDMSG : SOPP <
>>> + 0x00000010, (ins SendMsgImm:$simm16, SReg_32:$m0), "S_SENDMSG $simm16", []
>>> > {
>>> - let DisableEncoding = "$m0";
>>> - }
>>> + let DisableEncoding = "$m0";
>>> +}
>>> } // End Uses = [EXEC]
>>>
>>> //def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
>>> @@ -1064,7 +1065,7 @@ defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
>>> def V_INTERP_P1_F32 : VINTRP <
>>> 0x00000000,
>>> (outs VReg_32:$dst),
>>> - (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
>>> + (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, SReg_32:$m0),
>>> "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]",
>>> []> {
>>> let DisableEncoding = "$m0";
>>> @@ -1073,7 +1074,7 @@ def V_INTERP_P1_F32 : VINTRP <
>>> def V_INTERP_P2_F32 : VINTRP <
>>> 0x00000001,
>>> (outs VReg_32:$dst),
>>> - (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
>>> + (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$m0),
>>> "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]",
>>> []> {
>>>
>>> @@ -1085,7 +1086,7 @@ def V_INTERP_P2_F32 : VINTRP <
>>> def V_INTERP_MOV_F32 : VINTRP <
>>> 0x00000002,
>>> (outs VReg_32:$dst),
>>> - (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
>>> + (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, SReg_32:$m0),
>>> "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]",
>>> []> {
>>> let DisableEncoding = "$m0";
>>> @@ -1520,6 +1521,9 @@ def V_SUB_F64 : InstSI <
>>> []
>>> >;
>>>
>>> +// Store a value in the M0 register
>>> +def SI_MOV_M0 : InstSI <(outs SReg_32:$dst), (ins SReg_32:$src0), "", []>;
>>> +
>>> } // end usesCustomInserter
>>>
>>> multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
>>> @@ -1635,6 +1639,13 @@ def : Pat <
>>> >;
>>>
>>> //===----------------------------------------------------------------------===//
>>> +// SOPP Patterns
>>> +//===----------------------------------------------------------------------===//
>>> +def : Pat <
>>> + (int_SI_sendmsg imm:$simm16, i32:$m0),
>>> + (S_SENDMSG imm:$simm16, (SI_MOV_M0 $m0))
>>> +>;
>>> +//===----------------------------------------------------------------------===//
>>> // VOP2 Patterns
>>> //===----------------------------------------------------------------------===//
>>>
>>> @@ -1987,15 +1998,16 @@ def : Pat <
>>>
>>> def : Pat <
>>> (int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params),
>>> - (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params)
>>> + (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, (SI_MOV_M0 $params))
>>> >;
>>>
>>> def : Pat <
>>> - (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij),
>>> + (int_SI_fs_interp imm:$attr_chan, imm:$attr, i32:$params, v2i32:$ij),
>>> (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0),
>>> - imm:$attr_chan, imm:$attr, i32:$params),
>>> + imm:$attr_chan, imm:$attr,
>>> + (SI_MOV_M0 $params)),
>>> (EXTRACT_SUBREG $ij, sub1),
>>> - imm:$attr_chan, imm:$attr, $params)
>>> + imm:$attr_chan, imm:$attr, (SI_MOV_M0 $params))
>>> >;
>>>
>>> /********** ================== **********/
>>> diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
>>> index f1f01de..d6fb960 100644
>>> --- a/lib/Target/R600/SIRegisterInfo.td
>>> +++ b/lib/Target/R600/SIRegisterInfo.td
>>> @@ -151,15 +151,14 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
>>> // Register classes used as source and destination
>>> //===----------------------------------------------------------------------===//
>>>
>>> -// Special register classes for predicates and the M0 register
>>> +// Special register classes for predicates.
>>> def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)>;
>>> def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
>>> def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
>>> -def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
>>>
>>> // Register class for all scalar registers (SGPRs + Special Registers)
>>> def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
>>> - (add SGPR_32, M0Reg, VCC_LO)
>>> + (add SGPR_32, M0, VCC_LO)
>>> >;
>>>
>>> def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list