[PATCH] R600: Increase nearby load scheduling threshold.
Tom Stellard
tom at stellard.net
Tue Jul 29 17:00:23 PDT 2014
On Tue, Jul 29, 2014 at 09:48:01PM +0000, Matt Arsenault wrote:
> This partially fixes the weird-looking load scheduling
> in the memcpy test. The load clustering doesn't seem
> particularly smart, but this method seems to be partially
> deprecated, so it might not be worth trying to fix.
>
LGTM.
> http://reviews.llvm.org/D4713
>
> Files:
> lib/Target/R600/AMDGPUInstrInfo.cpp
> test/CodeGen/R600/llvm.memcpy.ll
> Index: lib/Target/R600/AMDGPUInstrInfo.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUInstrInfo.cpp
> +++ lib/Target/R600/AMDGPUInstrInfo.cpp
> @@ -218,15 +218,26 @@
> return true;
> }
>
> -bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
> - int64_t Offset1, int64_t Offset2,
> - unsigned NumLoads) const {
> - assert(Offset2 > Offset1
> - && "Second offset should be larger than first offset!");
> - // If we have less than 16 loads in a row, and the offsets are within 16,
> - // then schedule together.
> - // TODO: Make the loads schedule near if it fits in a cacheline
> - return (NumLoads < 16 && (Offset2 - Offset1) < 16);
> +// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
> +// the first 16 loads will be interleaved with the stores, and the next 16 will
> +// be clustered as expected. It should really split into two batches of 16 stores.
> +//
> +// Loads are clustered until this returns false, rather than trying to schedule
> +// groups of stores. This also means this hook has to decide whether loads from
> +// different address spaces should be clustered, and whether to cluster loads
> +// which might cause bank conflicts.
> +//
> +// This hook might be deprecated, so it might not be worth much effort to fix.
> +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
> + int64_t Offset0, int64_t Offset1,
> + unsigned NumLoads) const {
> + assert(Offset1 > Offset0 &&
> + "Second offset should be larger than first offset!");
> + // If we have 16 or fewer loads in a row, and the offsets are within 64
> + // bytes, then schedule together.
> +
> + // A cacheline is 64 bytes (for global memory).
> + return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
> }
>
> bool
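
For concreteness, here is a small standalone sketch (not the in-tree hook; the helper names are made up for illustration) comparing the old and new heuristics. With the old 16-byte threshold, two adjacent 16-byte loads at offsets 0 and 16 were not clustered, since 16 - 0 < 16 fails; with the new 64-byte (cacheline-sized) threshold they are, which is presumably what reorders the DWORDX4 checks in the test below:

#include <cassert>
#include <cstdint>

// Old heuristic: fewer than 16 loads, offsets within 16 bytes.
static bool shouldClusterOld(int64_t Offset0, int64_t Offset1,
                             unsigned NumLoads) {
  return NumLoads < 16 && (Offset1 - Offset0) < 16;
}

// New heuristic: 16 or fewer loads, offsets within one 64-byte
// global-memory cacheline.
static bool shouldClusterNew(int64_t Offset0, int64_t Offset1,
                             unsigned NumLoads) {
  return NumLoads <= 16 && (Offset1 - Offset0) < 64;
}

int main() {
  // Two BUFFER_LOAD_DWORDX4s 16 bytes apart: only the new heuristic
  // clusters them.
  assert(!shouldClusterOld(0, 16, 2));
  assert(shouldClusterNew(0, 16, 2));

  // Offsets a full cacheline apart are still not clustered.
  assert(!shouldClusterNew(0, 64, 2));

  // The load-count limit is now inclusive at 16.
  assert(shouldClusterNew(0, 4, 16));
  assert(!shouldClusterNew(0, 4, 17));
  return 0;
}
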
> Index: test/CodeGen/R600/llvm.memcpy.ll
> ===================================================================
> --- test/CodeGen/R600/llvm.memcpy.ll
> +++ test/CodeGen/R600/llvm.memcpy.ll
> @@ -15,17 +15,18 @@
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> +
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> -
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> +
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> @@ -35,9 +36,8 @@
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> -; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> -; SI: DS_WRITE_B8
> +
>
> ; SI: DS_READ_U8
> ; SI: DS_READ_U8
> @@ -47,6 +47,7 @@
> ; SI: DS_READ_U8
> ; SI: DS_READ_U8
> ; SI: DS_READ_U8
> +
> ; SI: DS_READ_U8
> ; SI: DS_READ_U8
> ; SI: DS_READ_U8
> @@ -65,6 +66,9 @@
> ; SI: DS_WRITE_B8
> ; SI: DS_WRITE_B8
> ; SI: DS_WRITE_B8
> +
> +; SI: DS_WRITE_B8
> +; SI: DS_WRITE_B8
> ; SI: DS_WRITE_B8
> ; SI: DS_WRITE_B8
> ; SI: DS_WRITE_B8
> @@ -83,21 +87,13 @@
>
> ; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align2
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
>
> ; SI: DS_READ_U16
> ; SI: DS_READ_U16
> @@ -117,6 +113,15 @@
> ; SI: DS_WRITE_B16
> ; SI: DS_WRITE_B16
>
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +
> ; SI: S_ENDPGM
> define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
> %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
> @@ -278,37 +283,37 @@
>
> ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align2
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> -
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> +
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_STORE_SHORT
>
> ; SI: S_ENDPGM
> @@ -321,9 +326,9 @@
>
> ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align4
> ; SI: BUFFER_LOAD_DWORDX4
> -; SI: BUFFER_STORE_DWORDX4
> ; SI: BUFFER_LOAD_DWORDX4
> ; SI: BUFFER_STORE_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> ; SI: S_ENDPGM
> define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
> @@ -334,9 +339,9 @@
>
> ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align8
> ; SI: BUFFER_LOAD_DWORDX4
> -; SI: BUFFER_STORE_DWORDX4
> ; SI: BUFFER_LOAD_DWORDX4
> ; SI: BUFFER_STORE_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> ; SI: S_ENDPGM
> define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
> @@ -347,9 +352,9 @@
>
> ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align16
> ; SI: BUFFER_LOAD_DWORDX4
> -; SI: BUFFER_STORE_DWORDX4
> ; SI: BUFFER_LOAD_DWORDX4
> ; SI: BUFFER_STORE_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> ; SI: S_ENDPGM
> define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*