[PATCH] R600/SI: Implement getOptimalMemOpType

Tom Stellard tom at stellard.net
Mon Jul 28 07:04:57 PDT 2014


On Sat, Jul 26, 2014 at 09:19:36PM +0000, Matt Arsenault wrote:
> The default guess uses i32. This really needs an address space argument to do the right thing in all cases.
> 
> http://reviews.llvm.org/D4683
> 
> Files:
>   lib/Target/R600/SIISelLowering.cpp
>   lib/Target/R600/SIISelLowering.h
>   test/CodeGen/R600/llvm.memcpy.ll

This patch LGTM, but I've disabled memcpy for R600, so clang and the
optimization passes shouldn't be generating it anyway.
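
Regarding the address space point: below is a rough sketch of what an
address-space-aware version might look like. The extra DstAS parameter is
hypothetical (the current hook doesn't take one), and the LDS thresholds are
just guesses, so treat it as illustration rather than a proposal:

EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                          unsigned SrcAlign, bool IsMemset,
                                          bool ZeroMemset, bool MemcpyStrSrc,
                                          unsigned DstAS, // hypothetical arg
                                          MachineFunction &MF) const {
  // Assumes AMDGPUAS::LOCAL_ADDRESS is the backend's LDS address space.
  if (DstAS == AMDGPUAS::LOCAL_ADDRESS) {
    // A single DS access tops out at 64 bits, so don't request v4i32 here.
    if (Size >= 8 && DstAlign >= 8)
      return MVT::i64;
    if (Size >= 4 && DstAlign >= 4)
      return MVT::i32;
    return MVT::Other;
  }

  // Global memory can use the 128-bit buffer instructions.
  if (Size >= 16 && DstAlign >= 4)
    return MVT::v4i32;
  if (Size >= 8 && DstAlign >= 4)
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}

With a split like that, the v4i32 path would only fire for global copies,
which is what the XXX comment in the patch is getting at.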

-Tom

> Index: lib/Target/R600/SIISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/SIISelLowering.cpp
> +++ lib/Target/R600/SIISelLowering.cpp
> @@ -272,6 +272,26 @@
>    return VT.bitsGT(MVT::i32);
>  }
>  
> +EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
> +                                          unsigned SrcAlign, bool IsMemset,
> +                                          bool ZeroMemset,
> +                                          bool MemcpyStrSrc,
> +                                          MachineFunction &MF) const {
> +  // FIXME: Should account for address space here.
> +
> +  // The default fallback uses the private pointer size as a guess for a type to
> +  // use. Make sure we switch these to 64-bit accesses.
> +
> +  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
> +    return MVT::v4i32;
> +
> +  if (Size >= 8 && DstAlign >= 4)
> +    return MVT::v2i32;
> +
> +  // Use the default.
> +  return MVT::Other;
> +}
> +
>  TargetLoweringBase::LegalizeTypeAction
>  SITargetLowering::getPreferredVectorAction(EVT VT) const {
>    if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
> Index: lib/Target/R600/SIISelLowering.h
> ===================================================================
> --- lib/Target/R600/SIISelLowering.h
> +++ lib/Target/R600/SIISelLowering.h
> @@ -62,6 +62,12 @@
>    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
>                                       bool *IsFast) const override;
>  
> +  EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
> +                          unsigned SrcAlign, bool IsMemset,
> +                          bool ZeroMemset,
> +                          bool MemcpyStrSrc,
> +                          MachineFunction &MF) const override;
> +
>    TargetLoweringBase::LegalizeTypeAction
>    getPreferredVectorAction(EVT VT) const override;
>  
> Index: test/CodeGen/R600/llvm.memcpy.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/llvm.memcpy.ll
> @@ -0,0 +1,358 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +
> +declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind
> +declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
> +
> +
> +; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align1
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +; SI: DS_READ_U8
> +; SI: DS_WRITE_B8
> +
> +; SI: S_ENDPGM
> +define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
> +  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
> +  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
> +  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align2
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +; SI: DS_READ_U16
> +; SI: DS_WRITE_B16
> +
> +; SI: S_ENDPGM
> +define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
> +  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
> +  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
> +  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align4
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI: S_ENDPGM
> +define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
> +  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
> +  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
> +  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind
> +  ret void
> +}
> +
> +; FIXME: Use 64-bit ops
> +; FUNC-LABEL: @test_small_memcpy_i64_lds_to_lds_align8
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: DS_READ_B32
> +; SI-DAG: DS_WRITE_B32
> +
> +; SI-DAG: S_ENDPGM
> +define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
> +  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
> +  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
> +  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align1
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +; SI-DAG: BUFFER_LOAD_UBYTE
> +; SI-DAG: BUFFER_STORE_BYTE
> +
> +; SI: S_ENDPGM
> +define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> +  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
> +  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
> +  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align2
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +; SI-DAG: BUFFER_LOAD_USHORT
> +; SI-DAG: BUFFER_STORE_SHORT
> +
> +; SI: S_ENDPGM
> +define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> +  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
> +  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
> +  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align4
> +; SI: BUFFER_LOAD_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> +; SI: BUFFER_LOAD_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> +; SI: S_ENDPGM
> +define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> +  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
> +  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
> +  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align8
> +; SI: BUFFER_LOAD_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> +; SI: BUFFER_LOAD_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> +; SI: S_ENDPGM
> +define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> +  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
> +  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
> +  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind
> +  ret void
> +}
> +
> +; FUNC-LABEL: @test_small_memcpy_i64_global_to_global_align16
> +; SI: BUFFER_LOAD_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> +; SI: BUFFER_LOAD_DWORDX4
> +; SI: BUFFER_STORE_DWORDX4
> +; SI: S_ENDPGM
> +define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
> +  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
> +  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
> +  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind
> +  ret void
> +}
