[PATCH] Make CodeGenPrepare pass address space to isLegalAddressingMode

Thu Jun 4 08:50:06 PDT 2015

On Wed, Jun 03, 2015 at 12:49:58PM -0700, Matt Arsenault wrote:
> Update the easiest to fix uses of isLegalAddressingMode.
> 

> From dcdc1148faec5ad3bb02da00bcff9d5cbfadeac6 Mon Sep 17 00:00:00 2001
> From: Matt Arsenault <Matthew.Arsenault at amd.com>
> Date: Sun, 31 May 2015 21:20:41 -0700
> Subject: [PATCH 3/4] R600/SI: Reimplement isLegalAddressingMode
> 
> Now that we sometimes know the address space, this can
> theoretically do a better job.
> 
> This needs better test coverage, but this mostly depends on
> first updating the loop optimizatiosn to provide the address
> space.
> ---
>  lib/Target/R600/AMDGPU.h                  |   5 +-
>  lib/Target/R600/SIISelLowering.cpp        |  96 ++++++++----
>  test/CodeGen/R600/cgp-addressing-modes.ll | 242 ++++++++++++++++++++++++++++++
>  3 files changed, 312 insertions(+), 31 deletions(-)
>  create mode 100644 test/CodeGen/R600/cgp-addressing-modes.ll
> 
> diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
> index 9b36063..f014d7a 100644
> --- a/lib/Target/R600/AMDGPU.h
> +++ b/lib/Target/R600/AMDGPU.h
> @@ -137,7 +137,10 @@ enum AddressSpaces {
>    CONSTANT_BUFFER_14 = 22,
>    CONSTANT_BUFFER_15 = 23,
>    ADDRESS_NONE = 24, ///< Address space for unknown memory.
> -  LAST_ADDRESS = ADDRESS_NONE
> +  LAST_ADDRESS = ADDRESS_NONE,
> +
> +  // Some places use this if the address space can't be determined.
> +  UNKNOWN_ADDRESS_SPACE = ~0u
>  };
>  
>  } // namespace AMDGPUAS
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index c564bff..8f5b222 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -251,47 +251,83 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
>    return false;
>  }
>  
> -// FIXME: This really needs an address space argument. The immediate offset
> -// size is different for different sets of memory instruction sets.
> -
> -// The single offset DS instructions have a 16-bit unsigned byte offset.
> -//
> -// MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r +
> -// r + i with addr64. 32-bit has more addressing mode options. Depending on the
> -// resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i).
> -//
> -// SMRD instructions have an 8-bit, dword offset.
> -//
>  bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
>                                               Type *Ty, unsigned AS) const {
>    // No global is ever allowed as a base.
>    if (AM.BaseGV)
>      return false;
>  
> -  // Allow a 16-bit unsigned immediate field, since this is what DS instructions
> -  // use.
> -  if (!isUInt<16>(AM.BaseOffs))
> -    return false;
> +  switch (AS) {
> +  case AMDGPUAS::GLOBAL_ADDRESS:
> +  case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?

I'm not sure, but I think what you have is fine.  I think in order to
implement this correctly, we may need to pass the load/store instruction
to this function.  This way we could look at the convergent attribute
and the alignment as well, but I'm not sure if this is really feasible.

This patch LGTM.

-Tom
> +  case AMDGPUAS::PRIVATE_ADDRESS:
> +  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
> +    // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
> +    // additionally can do r + r + i with addr64. 32-bit has more addressing
> +    // mode options. Depending on the resource constant, it can also do
> +    // (i64 r0) + (i32 r1) * (i14 i).
> +    //
> +    // SMRD instructions have an 8-bit, dword offset.
> +    //
> +    // Assume nonunifom access, since the address space isn't enough to know
> +    // what instruction we will use, and since we don't know if this is a load
> +    // or store and scalar stores are only available on VI.
> +    //
> +    // We also know if we are doing an extload, we can't do a scalar load.
> +    //
> +    // Private arrays end up using a scratch buffer most of the time, so also
> +    // assume those use MUBUF instructions. Scratch loads / stores are currently
> +    // implemented as mubuf instructions with offen bit set, so slightly
> +    // different than the normal addr64.
> +    if (!isUInt<12>(AM.BaseOffs))
> +      return false;
>  
> -  // Only support r+r,
> -  switch (AM.Scale) {
> -  case 0:  // "r+i" or just "i", depending on HasBaseReg.
> -    break;
> -  case 1:
> -    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
> +    // FIXME: Since we can split immediate into soffset and immediate offset,
> +    // would it make sense to allow any immediate?
> +
> +    switch (AM.Scale) {
> +    case 0: // r + i or just i, depending on HasBaseReg.
> +      return true;
> +    case 1:
> +      return true; // We have r + r or r + i.
> +    case 2:
> +      if (AM.HasBaseReg) {
> +        // Reject 2 * r + r.
> +        return false;
> +      }
> +
> +      // Allow 2 * r as r + r
> +      // Or  2 * r + i is allowed as r + r + i.
> +      return true;
> +    default: // Don't allow n * r
>        return false;
> -    // Otherwise we have r+r or r+i.
> -    break;
> -  case 2:
> -    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
> +    }
> +  }
> +  case AMDGPUAS::LOCAL_ADDRESS:
> +  case AMDGPUAS::REGION_ADDRESS: {
> +    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
> +    // field.
> +    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
> +    // an 8-bit dword offset but we don't know the alignment here.
> +    if (!isUInt<16>(AM.BaseOffs))
>        return false;
> -    // Allow 2*r as r+r.
> -    break;
> -  default: // Don't allow n * r
> +
> +    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
> +      return true;
> +
> +    if (AM.Scale == 1 && AM.HasBaseReg)
> +      return true;
> +
>      return false;
>    }
> -
> -  return true;
> +  case AMDGPUAS::FLAT_ADDRESS: {
> +    // Flat instructions do not have offsets, and only have the register
> +    // address.
> +    return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
> +  }
> +  default:
> +    llvm_unreachable("unhandled address space");
> +  }
>  }
>  
>  bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
> diff --git a/test/CodeGen/R600/cgp-addressing-modes.ll b/test/CodeGen/R600/cgp-addressing-modes.ll
> new file mode 100644
> index 0000000..3d36bd1
> --- /dev/null
> +++ b/test/CodeGen/R600/cgp-addressing-modes.ll
> @@ -0,0 +1,242 @@
> +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=OPT %s
> +; RUN: llc -march=amdgcn -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s
> +
> +declare i32 @llvm.r600.read.tidig.x() #0
> +
> +; OPT-LABEL: @test_sink_global_small_offset_i32(
> +; OPT-NOT: getelementptr i32, i32 addrspace(1)* %in
> +; OPT: br i1
> +; OPT: ptrtoint
> +
> +; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
> +; GCN: {{^}}BB0_2:
> +define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(1)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
> +; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
> +; OPT: br i1
> +
> +; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
> +; GCN: s_and_saveexec_b64
> +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
> +; GCN: {{^}}BB1_2:
> +; GCN: s_or_b64 exec
> +define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
> +  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i8, i8 addrspace(1)* %in.gep
> +  %tmp2 = sext i8 %tmp1 to i32
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
> +; GCN: s_and_saveexec_b64
> +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
> +; GCN: {{^}}BB2_2:
> +; GCN: s_or_b64 exec
> +define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
> +  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i8, i8 addrspace(1)* %in.gep
> +  %tmp2 = sext i8 %tmp1 to i32
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
> +; GCN: s_and_saveexec_b64
> +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
> +; GCN: {{^}}BB3_2:
> +; GCN: s_or_b64 exec
> +define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
> +  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i8, i8 addrspace(1)* %in.gep
> +  %tmp2 = sext i8 %tmp1 to i32
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
> +; OPT: getelementptr i32, i32 addrspace(4)* %in
> +; OPT: br i1
> +; OPT-NOT: ptrtoint
> +
> +; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
> +; GCN: flat_load_dword
> +; GCN: {{^}}BB4_2:
> +
> +define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
> +entry:
> +  %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(4)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(4)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; OPT-LABEL: @test_sink_scratch_small_offset_i32(
> +; OPT-NOT:  getelementptr [512 x i32]
> +; OPT: br i1
> +; OPT: ptrtoint
> +
> +; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
> +; GCN: s_and_saveexec_b64
> +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
> +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
> +; GCN: {{^}}BB5_2:
> +define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
> +entry:
> +  %alloca = alloca [512 x i32], align 4
> +  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
> +  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %add.arg = add i32 %arg, 8
> +  %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  store volatile i32 123, i32* %alloca.gep
> +  %tmp1 = load volatile i32, i32* %alloca.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep.0
> +  %load = load volatile i32, i32* %alloca.gep
> +  store i32 %load, i32 addrspace(1)* %out.gep.1
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
> +; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024
> +; OPT: br i1
> +; OPT-NOT: ptrtoint
> +
> +; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
> +; GCN: s_and_saveexec_b64
> +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
> +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
> +; GCN: {{^}}BB6_2:
> +define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
> +entry:
> +  %alloca = alloca [512 x i32], align 4
> +  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
> +  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %add.arg = add i32 %arg, 8
> +  %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  store volatile i32 123, i32* %alloca.gep
> +  %tmp1 = load volatile i32, i32* %alloca.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep.0
> +  %load = load volatile i32, i32* %alloca.gep
> +  store i32 %load, i32 addrspace(1)* %out.gep.1
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
> +; GCN: s_and_saveexec_b64
> +; GCN: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
> +; GCN: {{^}}BB7_2:
> +define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) {
> +entry:
> +  %offset.ext = zext i32 %offset to i64
> +  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
> +  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
> +  %tmp0 = icmp eq i32 %cond, 0
> +  br i1 %tmp0, label %endif, label %if
> +
> +if:
> +  %tmp1 = load i32, i32 addrspace(1)* %in.gep
> +  br label %endif
> +
> +endif:
> +  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
> +  store i32 %x, i32 addrspace(1)* %out.gep
> +  br label %done
> +
> +done:
> +  ret void
> +}
> +
> +attributes #0 = { nounwind readnone }
> +attributes #1 = { nounwind }
> -- 
> 2.2.1
> 

> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits