[PATCHES] R600/SI: Small VI improvements

Fri Mar 6 09:50:21 PST 2015

On Fri, Mar 06, 2015 at 06:31:04PM +0100, Marek Olšák wrote:
> Thanks Tom for fixing the scheduler weirdness. The attached patch only
> adds the fix for the elf test.
> 
> Marek
> 
> On Fri, Mar 6, 2015 at 2:45 PM, Marek Olšák <maraeo at gmail.com> wrote:
> > Well, the patch breaks a lot of tests due to some instructions being
> > scheduled differently for some reason. I'm fixing the tests.
> > Apparently, the reserved registers have an effect on the scheduler.
> >
> > Marek
> >
> > On Fri, Mar 6, 2015 at 1:24 PM, Marek Olšák <maraeo at gmail.com> wrote:
> >> An updated patch is attached. Please review.
> >>
> >> Marek
> >>
> >> On Wed, Mar 4, 2015 at 8:25 PM, Tom Stellard <tom at stellard.net> wrote:
> >>> On Wed, Mar 04, 2015 at 06:41:19PM +0100, Marek Olšák wrote:
> >>>> Please review.
> >>>>
> >>>> I'm not sure how important the second patch is.
> >>>>
> >>>> Marek
> >>>
> >>>> From c89e3bcc8475b3519967e1b34187164a20080250 Mon Sep 17 00:00:00 2001
> >>>> From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak at amd.com>
> >>>> Date: Wed, 4 Mar 2015 15:40:53 +0100
> >>>> Subject: [PATCH 1/2] R600/SI: Limit SGPRs to 80 on Tonga and Iceland
> >>>>
> >>>> This is a candidate for stable.
> >>>> ---
> >>>>  lib/Target/R600/SIRegisterInfo.cpp | 8 ++++++++
> >>>>  1 file changed, 8 insertions(+)
> >>>>
> >>>> diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
> >>>> index e2138d2..4b9bee3 100644
> >>>> --- a/lib/Target/R600/SIRegisterInfo.cpp
> >>>> +++ b/lib/Target/R600/SIRegisterInfo.cpp
> >>>> @@ -47,6 +47,14 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
> >>>>    Reserved.set(AMDGPU::VGPR255);
> >>>>    Reserved.set(AMDGPU::VGPR254);
> >>>>
> >>>> +  // Tonga and Iceland can only allocate 80 SGPRs due to a hw bug.
> >>>> +  // That's 74 SGPRs if all XNACK_MASK, FLAT_SCRATCH, and VCC are used.
> >>>> +  // For now, assume XNACK_MASK is unused.
> >>>
> >>> This should be added as a subtarget feature in AMDGPU.td / AMDGPUSubtarget.h
> >>> and applied to these GPUs in Processors.td
> >>>
> >>>> +  StringRef Cpu = ST.getTargetLowering()->getTargetMachine().getTargetCPU();
> >>>> +  if (Cpu == "tonga" || Cpu == "iceland")
> >>>> +    for (int i = AMDGPU::SGPR76; i <= AMDGPU::SGPR101; i++)
> >>>> +      Reserved.set(i);
> >>>> +
> >>>
> >>> You also need to reserve super registers.  Something like:
> >>>
> >>> for (unsigned i = 76, e = AMDGPU::SGPR_32RegClass.getReg(i); i !=e; ++i) {
> >>>   for (MCRegAliasIterator R = MCRegAliasIterator(i, this, true); R.isValid() ++R) {
> >>>     Reserved.set(*R);
> >>>   }
> >>> }
> >>>
> >>> You should also update AMDGPUAsmPrinter.cpp to always report at least 80 SGPRs
> >>> used.
> >>>
> >>>>    return Reserved;
> >>>>  }
> >>>>
> >>>> --
> >>>> 2.1.0
> >>>>
> >>>
> >>>> From 932eaffc444fa581008c5039eaa63a6727beb534 Mon Sep 17 00:00:00 2001
> >>>> From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak at amd.com>
> >>>> Date: Wed, 4 Mar 2015 17:59:50 +0100
> >>>> Subject: [PATCH 2/2] R600/SI: Fix getNumSGPRsAllowed for VI
> >>>>
> >>>
> >>> This function is only used by the scheduler and is important for getting
> >>> the best performance.
> >>>
> >>> LGTM.
> >>>
> >>>> ---
> >>>>  lib/Target/R600/SIRegisterInfo.cpp | 32 +++++++++++++++++++++-----------
> >>>>  lib/Target/R600/SIRegisterInfo.h   |  4 +++-
> >>>>  2 files changed, 24 insertions(+), 12 deletions(-)
> >>>>
> >>>> diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
> >>>> index 4b9bee3..14413e9 100644
> >>>> --- a/lib/Target/R600/SIRegisterInfo.cpp
> >>>> +++ b/lib/Target/R600/SIRegisterInfo.cpp
> >>>> @@ -14,7 +14,6 @@
> >>>>
> >>>>
> >>>>  #include "SIRegisterInfo.h"
> >>>> -#include "AMDGPUSubtarget.h"
> >>>>  #include "SIInstrInfo.h"
> >>>>  #include "SIMachineFunctionInfo.h"
> >>>>  #include "llvm/CodeGen/MachineFrameInfo.h"
> >>>> @@ -61,7 +60,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
> >>>>  unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
> >>>>
> >>>>    // FIXME: We should adjust the max number of waves based on LDS size.
> >>>> -  unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU());
> >>>> +  unsigned SGPRLimit = getNumSGPRsAllowed(ST.getGeneration(),
> >>>> +                                          ST.getMaxWavesPerCU());
> >>>>    unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU());
> >>>>
> >>>>    for (regclass_iterator I = regclass_begin(), E = regclass_end();
> >>>> @@ -502,14 +502,24 @@ unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
> >>>>    }
> >>>>  }
> >>>>
> >>>> -unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const {
> >>>> -  switch(WaveCount) {
> >>>> -    case 10: return 48;
> >>>> -    case 9:  return 56;
> >>>> -    case 8:  return 64;
> >>>> -    case 7:  return 72;
> >>>> -    case 6:  return 80;
> >>>> -    case 5:  return 96;
> >>>> -    default: return 103;
> >>>> +unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
> >>>> +                                            unsigned WaveCount) const {
> >>>> +  if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
> >>>> +    switch (WaveCount) {
> >>>> +      case 10: return 80;
> >>>> +      case 9:  return 80;
> >>>> +      case 8:  return 96;
> >>>> +      default: return 102;
> >>>> +    }
> >>>> +  } else {
> >>>> +    switch(WaveCount) {
> >>>> +      case 10: return 48;
> >>>> +      case 9:  return 56;
> >>>> +      case 8:  return 64;
> >>>> +      case 7:  return 72;
> >>>> +      case 6:  return 80;
> >>>> +      case 5:  return 96;
> >>>> +      default: return 103;
> >>>> +    }
> >>>>    }
> >>>>  }
> >>>> diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
> >>>> index d908ffd..1dfe530 100644
> >>>> --- a/lib/Target/R600/SIRegisterInfo.h
> >>>> +++ b/lib/Target/R600/SIRegisterInfo.h
> >>>> @@ -17,6 +17,7 @@
> >>>>  #define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
> >>>>
> >>>>  #include "AMDGPURegisterInfo.h"
> >>>> +#include "AMDGPUSubtarget.h"
> >>>>  #include "llvm/Support/Debug.h"
> >>>>
> >>>>  namespace llvm {
> >>>> @@ -111,7 +112,8 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
> >>>>
> >>>>    /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
> >>>>    ///        concurrent waves.
> >>>> -  unsigned getNumSGPRsAllowed(unsigned WaveCount) const;
> >>>> +  unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
> >>>> +                              unsigned WaveCount) const;
> >>>>
> >>>>    unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
> >>>>                                const TargetRegisterClass *RC) const;
> >>>> --
> >>>> 2.1.0
> >>>>
> >>>
> >>>> _______________________________________________
> >>>> llvm-commits mailing list
> >>>> llvm-commits at cs.uiuc.edu
> >>>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> >>>

> From d31e390cfe4c9adce557bea249d2afe80571074f Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak at amd.com>
> Date: Wed, 4 Mar 2015 15:40:53 +0100
> Subject: [PATCH] R600/SI: Limit SGPRs to 80 on Tonga and Iceland
> 
> This is a candidate for stable.

LGTM.  I'll make sure it gets into 3.6.1.

-Tom

> ---
>  lib/Target/R600/AMDGPU.td            |  5 +++++
>  lib/Target/R600/AMDGPUAsmPrinter.cpp |  7 +++++++
>  lib/Target/R600/AMDGPUSubtarget.cpp  |  2 +-
>  lib/Target/R600/AMDGPUSubtarget.h    |  9 +++++++++
>  lib/Target/R600/Processors.td        |  8 ++++++--
>  lib/Target/R600/SIRegisterInfo.cpp   | 17 +++++++++++++++++
>  test/CodeGen/R600/elf.ll             |  9 ++++++---
>  7 files changed, 51 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
> index a7d48b3..e5d5ce2 100644
> --- a/lib/Target/R600/AMDGPU.td
> +++ b/lib/Target/R600/AMDGPU.td
> @@ -103,6 +103,11 @@ def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
>          "true",
>          "Enable spilling of VGPRs to scratch memory">;
>  
> +def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
> +        "SGPRInitBug",
> +        "true",
> +        "VI SGPR initilization bug requiring a fixed SGPR allocation size">;
> +
>  class SubtargetFeatureFetchLimit <string Value> :
>                            SubtargetFeature <"fetch"#Value,
>          "TexVTXClauseSize",
> diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
> index 92bc314..5e1b6a3 100644
> --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
> +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
> @@ -339,6 +339,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
>    ProgInfo.NumVGPR = MaxVGPR + 1;
>    ProgInfo.NumSGPR = MaxSGPR + 1;
>  
> +  if (STM.hasSGPRInitBug()) {
> +    if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG)
> +      llvm_unreachable("Too many SGPRs used with the SGPR init bug");
> +
> +    ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
> +  }
> +
>    ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
>    ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
>    // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
> diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
> index 70c8525..0ead652 100644
> --- a/lib/Target/R600/AMDGPUSubtarget.cpp
> +++ b/lib/Target/R600/AMDGPUSubtarget.cpp
> @@ -70,7 +70,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS,
>        CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
>        EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
>        WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
> -      EnableVGPRSpilling(false),
> +      EnableVGPRSpilling(false), SGPRInitBug(false),
>        FrameLowering(TargetFrameLowering::StackGrowsUp,
>                      64 * 16, // Maximum stack alignment (long16)
>                      0),
> diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
> index 1b0122c..403a3e4 100644
> --- a/lib/Target/R600/AMDGPUSubtarget.h
> +++ b/lib/Target/R600/AMDGPUSubtarget.h
> @@ -44,6 +44,10 @@ public:
>      VOLCANIC_ISLANDS,
>    };
>  
> +  enum {
> +    FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
> +  };
> +
>  private:
>    std::string DevName;
>    bool Is64bit;
> @@ -66,6 +70,7 @@ private:
>    bool CFALUBug;
>    int LocalMemorySize;
>    bool EnableVGPRSpilling;
> +  bool SGPRInitBug;
>  
>    AMDGPUFrameLowering FrameLowering;
>    std::unique_ptr<AMDGPUTargetLowering> TLInfo;
> @@ -206,6 +211,10 @@ public:
>      return LocalMemorySize;
>    }
>  
> +  bool hasSGPRInitBug() const {
> +    return SGPRInitBug;
> +  }
> +
>    unsigned getAmdKernelCodeChipID() const;
>  
>    bool enableMachineScheduler() const override {
> diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
> index fb5aa61..82c6d13 100644
> --- a/lib/Target/R600/Processors.td
> +++ b/lib/Target/R600/Processors.td
> @@ -119,8 +119,12 @@ def : ProcessorModel<"mullins",    SIQuarterSpeedModel, [FeatureSeaIslands]>;
>  // Volcanic Islands
>  //===----------------------------------------------------------------------===//
>  
> -def : ProcessorModel<"tonga",   SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
> +def : ProcessorModel<"tonga",   SIQuarterSpeedModel,
> +  [FeatureVolcanicIslands, FeatureSGPRInitBug]
> +>;
>  
> -def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
> +def : ProcessorModel<"iceland", SIQuarterSpeedModel,
> +  [FeatureVolcanicIslands, FeatureSGPRInitBug]
> +>;
>  
>  def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
> diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
> index f3a4282..2571472 100644
> --- a/lib/Target/R600/SIRegisterInfo.cpp
> +++ b/lib/Target/R600/SIRegisterInfo.cpp
> @@ -46,6 +46,23 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
>    Reserved.set(AMDGPU::VGPR255);
>    Reserved.set(AMDGPU::VGPR254);
>  
> +  // Tonga and Iceland can only allocate a fixed number of SGPRs due
> +  // to a hw bug.
> +  if (ST.hasSGPRInitBug()) {
> +    unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
> +    // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs).
> +    // Assume XNACK_MASK is unused.
> +    unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4;
> +
> +    for (unsigned i = Limit; i < NumSGPRs; ++i) {
> +      unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
> +      MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true);
> +
> +      for (; R.isValid(); ++R)
> +        Reserved.set(*R);
> +    }
> +  }
> +
>    return Reserved;
>  }
>  
> diff --git a/test/CodeGen/R600/elf.ll b/test/CodeGen/R600/elf.ll
> index aca3109..f801b3f 100644
> --- a/test/CodeGen/R600/elf.ll
> +++ b/test/CodeGen/R600/elf.ll
> @@ -1,7 +1,9 @@
>  ; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
> -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG %s
> +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
>  ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
> -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG %s
> +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s
> +; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
> +; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
>  
>  ; ELF: Format: ELF32
>  ; ELF: Name: .AMDGPU.config
> @@ -15,7 +17,8 @@
>  ; CONFIG: test:
>  ; CONFIG: .section .AMDGPU.config
>  ; CONFIG-NEXT: .long   45096
> -; CONFIG-NEXT: .long   0
> +; TYPICAL-NEXT: .long   0
> +; TONGA-NEXT: .long   576
>  define void @test(i32 %p) #0 {
>     %i = add i32 %p, 2
>     %r = bitcast i32 %i to float
> -- 
> 2.1.0
>