[PATCH] R600: Increase nearby load scheduling threshold.
Tom Stellard
tom at stellard.net
Tue Jul 29 17:00:23 PDT 2014
On Tue, Jul 29, 2014 at 09:48:01PM +0000, Matt Arsenault wrote:
> This partially fixes the weird-looking load scheduling
> in the memcpy test. The load clustering doesn't seem
> particularly smart, but this method seems to be partially
> deprecated, so it might not be worth trying to fix.
>
LGTM.
> http://reviews.llvm.org/D4713
>
> Files:
> lib/Target/R600/AMDGPUInstrInfo.cpp
> test/CodeGen/R600/llvm.memcpy.ll
> Index: lib/Target/R600/AMDGPUInstrInfo.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUInstrInfo.cpp
> +++ lib/Target/R600/AMDGPUInstrInfo.cpp
> @@ -218,15 +218,26 @@
> return true;
> }
>
> -bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
> - int64_t Offset1, int64_t Offset2,
> - unsigned NumLoads) const {
> - assert(Offset2 > Offset1
> - && "Second offset should be larger than first offset!");
> - // If we have less than 16 loads in a row, and the offsets are within 16,
> - // then schedule together.
> - // TODO: Make the loads schedule near if it fits in a cacheline
> - return (NumLoads < 16 && (Offset2 - Offset1) < 16);
> +// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
> +// the first 16 loads will be interleaved with the stores, and the next 16 will
> +// be clustered as expected. It should really split into two batches of 16 stores.
> +//
> +// Loads are clustered until this returns false, rather than trying to schedule
> +// groups of stores. This also means this hook has to decide whether loads from
> +// different address spaces should be clustered, and whether to cluster loads
> +// which might cause bank conflicts.
> +//
> +// This hook might be deprecated, so it might not be worth much effort to fix.
> +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
> + int64_t Offset0, int64_t Offset1,
> + unsigned NumLoads) const {
> + assert(Offset1 > Offset0 &&
> + "Second offset should be larger than first offset!");
> + // If we have 16 or fewer loads in a row, and the offsets are within 64
> + // bytes, then schedule together.
> +
> + // A cacheline is 64 bytes (for global memory).
> + return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
> }
>
> bool
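
For concreteness, here is a small standalone sketch (not the in-tree hook; the helper names are made up for illustration) comparing the old and new heuristics. With the old 16-byte threshold, two adjacent 16-byte loads at offsets 0 and 16 were not clustered, since 16 - 0 < 16 fails; with the new 64-byte (cacheline-sized) threshold they are, which is presumably what reorders the DWORDX4 checks in the test below:

#include <cassert>
#include <cstdint>

// Old heuristic: fewer than 16 loads, offsets within 16 bytes.
static bool shouldClusterOld(int64_t Offset0, int64_t Offset1,
                             unsigned NumLoads) {
  return NumLoads < 16 && (Offset1 - Offset0) < 16;
}

// New heuristic: 16 or fewer loads, offsets within one 64-byte
// global-memory cacheline.
static bool shouldClusterNew(int64_t Offset0, int64_t Offset1,
                             unsigned NumLoads) {
  return NumLoads <= 16 && (Offset1 - Offset0) < 64;
}

int main() {
  // Two BUFFER_LOAD_DWORDX4s 16 bytes apart: only the new heuristic
  // clusters them.
  assert(!shouldClusterOld(0, 16, 2));
  assert(shouldClusterNew(0, 16, 2));

  // Offsets a full cacheline apart are still not clustered.
  assert(!shouldClusterNew(0, 64, 2));

  // The load-count limit is now inclusive at 16.
  assert(shouldClusterNew(0, 4, 16));
  assert(!shouldClusterNew(0, 4, 17));
  return 0;
}
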
> Index: test/CodeGen/R600/llvm.memcpy.ll
> ===================================================================
> --- test/CodeGen/R600/llvm.memcpy.ll
> +++ test/CodeGen/R600/llvm.memcpy.ll
> @@ -15,17 +15,18 @@
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> +
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> -
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> +
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> @@ -35,9 +36,8 @@
> ; SI: DS_READ_U8
> ; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> -; SI: DS_WRITE_B8
> ; SI: DS_READ_U8
> -; SI: DS_WRITE_B8
> +
>
> ; SI: DS_READ_U8
> ; SI: DS_READ_U8
> @@ -47,6 +47,7 @@
> ; SI: DS_READ_U8
> ; SI: DS_READ_U8
> ; SI: DS_READ_U8
> +
> ; SI: DS_READ_U8
> ; SI: DS_READ_U8
> ; SI: DS_READ_U8
> @@ -65,6 +66,9 @@
> ; SI: DS_WRITE_B8
> ; SI: DS_WRITE_B8
> ; SI: DS_WRITE_B8
> +
> +; SI: DS_WRITE_B8
> +; SI: DS_WRITE_B8
> ; SI: DS_WRITE_B8
> ; SI: DS_WRITE_B8
> ; SI: DS_WRITE_B8
> @@ -83,21 +87,13 @@
>
> ; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align2
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
> ; SI: DS_READ_U16
> -; SI: DS_WRITE_B16
>
> ; SI: DS_READ_U16
> ; SI: DS_READ_U16
> @@ -117,6 +113,15 @@
> ; SI: DS_WRITE_B16
> ; SI: DS_WRITE_B16
>
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +; SI: DS_WRITE_B16
> +
> ; SI: S_ENDPGM
> define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
> %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
> @@ -278,37 +283,37 @@
>
> ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align2
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> -
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> -; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_LOAD_USHORT
> +
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> ; SI-DAG: BUFFER_STORE_SHORT
>
> ; SI: S_ENDPGM
> @@ -321,9 +326,9 @@
>
> ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align4
> ; SI: BUFFER_LOAD_DWORDX4
> -; SI: BUFFER_STORE_DWORDX4
> ; SI: BUFFER_LOAD_DWORDX4
> ; SI: BUFFER_STORE_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> ; SI: S_ENDPGM
> define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
> @@ -334,9 +339,9 @@
>
> ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align8
> ; SI: BUFFER_LOAD_DWORDX4
> -; SI: BUFFER_STORE_DWORDX4
> ; SI: BUFFER_LOAD_DWORDX4
> ; SI: BUFFER_STORE_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> ; SI: S_ENDPGM
> define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
> @@ -347,9 +352,9 @@
>
> ; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align16
> ; SI: BUFFER_LOAD_DWORDX4
> -; SI: BUFFER_STORE_DWORDX4
> ; SI: BUFFER_LOAD_DWORDX4
> ; SI: BUFFER_STORE_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> ; SI: S_ENDPGM
> define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*