[Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI

Wed Oct 30 06:59:02 PDT 2013

I thought that doing S_CMPK followed by S_CBRANCH has less overhead
than doing a memory read. If we used one of
S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know.

Marek

On Wed, Oct 30, 2013 at 2:48 PM, Christian König
<deathsimple at vodafone.de> wrote:
> Am 30.10.2013 14:23, schrieb Marek Olšák:
>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> This also fixes scalar compare instructions which were always eliminated,
>> because they didn't have a destination of SCC.
>
>
> Uff, that looks like quite a bit of overhead, isn't there a simpler
> approach? Like setting the the NumRecord to one and letting unused constants
> pointing to a dummy buffer or soemthing like this?
>
> Christian.
>
>
>>
>> Signed-off-by: Marek Olšák <marek.olsak at amd.com>
>> ---
>>   lib/Target/R600/SIISelLowering.cpp | 30 ++++++++++++++++++++++++++----
>>   lib/Target/R600/SIInsertWaits.cpp  |  6 ++++++
>>   lib/Target/R600/SIInstrInfo.td     |  5 +++++
>>   lib/Target/R600/SIInstructions.td  | 26 +++++++++++++++-----------
>>   4 files changed, 52 insertions(+), 15 deletions(-)
>>
>> diff --git a/lib/Target/R600/SIISelLowering.cpp
>> b/lib/Target/R600/SIISelLowering.cpp
>> index 371572e..e9f4035 100644
>> --- a/lib/Target/R600/SIISelLowering.cpp
>> +++ b/lib/Target/R600/SIISelLowering.cpp
>> @@ -14,6 +14,7 @@
>>     #include "SIISelLowering.h"
>>   #include "AMDGPU.h"
>> +#include "AMDGPUSubtarget.h"
>>   #include "AMDILIntrinsicInfo.h"
>>   #include "SIInstrInfo.h"
>>   #include "SIMachineFunctionInfo.h"
>> @@ -302,14 +303,37 @@ MachineBasicBlock *
>> SITargetLowering::EmitInstrWithCustomInserter(
>>       MachineInstr * MI, MachineBasicBlock * BB) const {
>>       MachineBasicBlock::iterator I = *MI;
>> +  const SIInstrInfo *TII =
>> +    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
>> +
>> +  // Sea Islands must conditionally execute SMRD instructions depending
>> +  // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware
>> +  // doesn't skip the instructions if NUM_RECORDS is 0.
>> +  if (TII->isSMRD(MI->getOpcode())) {
>> +    if
>> (getTargetMachine().getSubtarget<AMDGPUSubtarget>().getGeneration() !=
>> +        AMDGPUSubtarget::SEA_ISLANDS)
>> +      return BB;
>> +
>> +    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
>> +    unsigned NumRecords =
>> MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
>> +
>> +    // XXX should we save and restore the SCC register?
>> +    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
>> NumRecords)
>> +        .addReg(MI->getOperand(1).getReg(), 0, AMDGPU::sub2);
>> +    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_CMPK_EQ_U32),
>> AMDGPU::SCC)
>> +        .addReg(NumRecords)
>> +        .addImm(0);
>> +    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_CBRANCH_SCC1))
>> +        .addImm(1)
>> +        .addReg(AMDGPU::SCC);
>> +    return BB;
>> +  }
>>       switch (MI->getOpcode()) {
>>     default:
>>       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
>>     case AMDGPU::BRANCH: return BB;
>>     case AMDGPU::SI_ADDR64_RSRC: {
>> -    const SIInstrInfo *TII =
>> -      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
>>       MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
>>       unsigned SuperReg = MI->getOperand(0).getReg();
>>       unsigned SubRegLo =
>> MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
>> @@ -336,8 +360,6 @@ MachineBasicBlock *
>> SITargetLowering::EmitInstrWithCustomInserter(
>>       break;
>>     }
>>     case AMDGPU::V_SUB_F64: {
>> -    const SIInstrInfo *TII =
>> -      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
>>       BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
>>               MI->getOperand(0).getReg())
>>               .addReg(MI->getOperand(1).getReg())
>> diff --git a/lib/Target/R600/SIInsertWaits.cpp
>> b/lib/Target/R600/SIInsertWaits.cpp
>> index 7e42fb7..2e47346 100644
>> --- a/lib/Target/R600/SIInsertWaits.cpp
>> +++ b/lib/Target/R600/SIInsertWaits.cpp
>> @@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock
>> &MBB,
>>     if (Counts.Named.EXP == 0)
>>       ExpInstrTypesSeen = 0;
>>   +  // Ensure S_WAITCNT is inserted before S_CBRANCH.
>> +  MachineBasicBlock::iterator beforeI = I;
>> +  --beforeI;
>> +  if (beforeI->getOpcode() == AMDGPU::S_CBRANCH_SCC1)
>> +    I = beforeI;
>> +
>>     // Build the wait instruction
>>     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
>>             .addImm((Counts.Named.VM & 0xF) |
>> diff --git a/lib/Target/R600/SIInstrInfo.td
>> b/lib/Target/R600/SIInstrInfo.td
>> index ed42a2a..9567879 100644
>> --- a/lib/Target/R600/SIInstrInfo.td
>> +++ b/lib/Target/R600/SIInstrInfo.td
>> @@ -177,6 +177,11 @@ class SOPC_32 <bits<7> op, string opName, list<dag>
>> pattern> : SOPC <
>>     opName#" $dst, $src0, $src1", pattern
>>   >;
>>   +class SOPCK_32 <bits<7> op, string opName, list<dag> pattern> : SOPC <
>> +  op, (outs SCCReg:$dst), (ins SReg_32:$src0, i16imm:$src1),
>> +  opName#" $dst, $src0, $src1", pattern
>> +>;
>> +
>>   class SOPC_64 <bits<7> op, string opName, list<dag> pattern> : SOPC <
>>     op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
>>     opName#" $dst, $src0, $src1", pattern
>> diff --git a/lib/Target/R600/SIInstructions.td
>> b/lib/Target/R600/SIInstructions.td
>> index 048c157..1b275a7 100644
>> --- a/lib/Target/R600/SIInstructions.td
>> +++ b/lib/Target/R600/SIInstructions.td
>> @@ -115,17 +115,17 @@ def S_CMPK_EQ_I32 : SOPK <
>>   */
>>     let isCompare = 1 in {
>> -def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
>> -def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
>> -def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
>> -def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>;
>> -def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>;
>> -def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>;
>> -def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>;
>> -def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
>> -def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
>> -def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
>> -def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
>> +def S_CMPK_LG_I32 : SOPCK_32 <0x00000004, "S_CMPK_LG_I32", []>;
>> +def S_CMPK_GT_I32 : SOPCK_32 <0x00000005, "S_CMPK_GT_I32", []>;
>> +def S_CMPK_GE_I32 : SOPCK_32 <0x00000006, "S_CMPK_GE_I32", []>;
>> +def S_CMPK_LT_I32 : SOPCK_32 <0x00000007, "S_CMPK_LT_I32", []>;
>> +def S_CMPK_LE_I32 : SOPCK_32 <0x00000008, "S_CMPK_LE_I32", []>;
>> +def S_CMPK_EQ_U32 : SOPCK_32 <0x00000009, "S_CMPK_EQ_U32", []>;
>> +def S_CMPK_LG_U32 : SOPCK_32 <0x0000000a, "S_CMPK_LG_U32", []>;
>> +def S_CMPK_GT_U32 : SOPCK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
>> +def S_CMPK_GE_U32 : SOPCK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
>> +def S_CMPK_LT_U32 : SOPCK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
>> +def S_CMPK_LE_U32 : SOPCK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
>>   } // End isCompare = 1
>>     def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
>> @@ -492,6 +492,8 @@ defm S_LOAD_DWORDX4 : SMRD_Helper <0x02,
>> "S_LOAD_DWORDX4", SReg_64, SReg_128>;
>>   defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64,
>> SReg_256>;
>>   defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64,
>> SReg_512>;
>>   +let usesCustomInserter = 1 in {
>> +
>>   defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
>>     0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SReg_32
>>   >;
>> @@ -512,6 +514,8 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
>>     0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
>>   >;
>>   +} // usesCustomInserter = 1
>> +
>>   } // mayLoad = 1
>>     //def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
>
>