[PATCH] R600/SI: Re-initialize the m0 register after using it for indirect addressing

Thu Jun 12 17:10:21 PDT 2014

On 06/12/2014 04:47 PM, Tom Stellard wrote:
> We need to store a value greater than or equal to the number of LDS
> bytes allocated by the shader in the m0 register in order for LDS
> instructions to work correctly.
>
> We always initialize m0 at the beginning of a shader, but this register
> is also used for indirect addressing offsets, so we need to
> re-initialize it any time we use indirect addressing.
> ---
>   lib/Target/R600/SILowerControlFlow.cpp | 86 +++++++++++++++++++---------------
>   1 file changed, 49 insertions(+), 37 deletions(-)
>
> diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
> index 6601f2a..09030a2 100644
> --- a/lib/Target/R600/SILowerControlFlow.cpp
> +++ b/lib/Target/R600/SILowerControlFlow.cpp
> @@ -86,6 +86,7 @@ private:
>     void Kill(MachineInstr &MI);
>     void Branch(MachineInstr &MI);
>   
> +  void InitM0ForLDS(MachineBasicBlock::iterator MI);
>     void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
>     void IndirectSrc(MachineInstr &MI);
>     void IndirectDst(MachineInstr &MI);
> @@ -320,6 +321,14 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
>     MI.eraseFromParent();
>   }
>   
> +/// The m0 register stores the maximum allowable address for LDS reads and
> +/// writes.  Its value must be at least the size in bytes of LDS allocated by
> +/// the shader.  For simplicity, we set it to the maximum possible value.
> +void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
> +    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),  TII->get(AMDGPU::S_MOV_B32),
> +            AMDGPU::M0).addImm(0xffffffff);
> +}
> +
>   void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
>   
>     MachineBasicBlock &MBB = *MI.getParent();
> @@ -333,52 +342,56 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
>       BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
>               .addReg(Idx);
>       MBB.insert(I, MovRel);
> -    MI.eraseFromParent();
> -    return;
> -  }
> +  } else {
>   
> -  assert(AMDGPU::SReg_64RegClass.contains(Save));
> -  assert(AMDGPU::VReg_32RegClass.contains(Idx));
> +    assert(AMDGPU::SReg_64RegClass.contains(Save));
> +    assert(AMDGPU::VReg_32RegClass.contains(Idx));
>   
> -  // Save the EXEC mask
> -  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
> -          .addReg(AMDGPU::EXEC);
> +    // Save the EXEC mask
> +    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
> +            .addReg(AMDGPU::EXEC);
>   
> -  // Read the next variant into VCC (lower 32 bits) <- also loop target
> -  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
> -          AMDGPU::VCC_LO)
> -          .addReg(Idx);
> +    // Read the next variant into VCC (lower 32 bits) <- also loop target
> +    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
> +            AMDGPU::VCC_LO)
> +            .addReg(Idx);
>   
> -  // Move index from VCC into M0
> -  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
> -          .addReg(AMDGPU::VCC_LO);
> +    // Move index from VCC into M0
> +    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
> +            .addReg(AMDGPU::VCC_LO);
>   
> -  // Compare the just read M0 value to all possible Idx values
> -  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
> -          .addReg(AMDGPU::M0)
> -          .addReg(Idx);
> +    // Compare the just read M0 value to all possible Idx values
> +    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
> +            .addReg(AMDGPU::M0)
> +            .addReg(Idx);
>   
> -  // Update EXEC, save the original EXEC value to VCC
> -  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
> -          .addReg(AMDGPU::VCC);
> +    // Update EXEC, save the original EXEC value to VCC
> +    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
> +            .addReg(AMDGPU::VCC);
>   
> -  // Do the actual move
> -  MBB.insert(I, MovRel);
> +    // Do the actual move
> +    MBB.insert(I, MovRel);
>   
> -  // Update EXEC, switch all done bits to 0 and all todo bits to 1
> -  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
> -          .addReg(AMDGPU::EXEC)
> -          .addReg(AMDGPU::VCC);
> +    // Update EXEC, switch all done bits to 0 and all todo bits to 1
> +    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
> +            .addReg(AMDGPU::EXEC)
> +            .addReg(AMDGPU::VCC);
>   
> -  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
> -  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
> -          .addImm(-7)
> -          .addReg(AMDGPU::EXEC);
> +    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
> +    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
> +            .addImm(-7)
> +            .addReg(AMDGPU::EXEC);
>   
> -  // Restore EXEC
> -  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
> -          .addReg(Save);
> +    // Restore EXEC
> +    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
> +            .addReg(Save);
>   
> +  }
> +  // FIXME: Are there any values other than the LDS address clamp that need to
> +  // be stored in the m0 register and may be live for more than a few
> +  // instructions?  If so, we should save the m0 register at the beginning
> +  // of this function and restore it here.
I think you need to set m0 up differently when doing direct buffer to 
LDS loads

> +  InitM0ForLDS(&MI);
>     MI.eraseFromParent();
>   }
>   
> @@ -523,8 +536,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
>       MachineBasicBlock &MBB = MF.front();
>       // Initialize M0 to a value that won't cause LDS access to be discarded
>       // due to offset clamping
> -    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32),
> -            AMDGPU::M0).addImm(0xffffffff);
> +    InitM0ForLDS(MBB.getFirstNonPHI());
>     }
>   
>     if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) {