[PATCH] R600/SI: Add support for 64-bit LDS loads and stores

Wed Mar 12 12:29:08 PDT 2014

On Wed, Mar 12, 2014 at 11:24:49AM -0700, Matt Arsenault wrote:
> Hi,
> 
> This set of patches adds the 64-bit LDS load and store instruction, and matches the i16 constant offset available for addressing.
> 

These look good, thanks for doing this.  I just have one comment on
patch 4.  I will also need to test the patches before they can be committed.

Patch 1: LGTM

> From 39dfdfd220eb5b8c5b6aa4d559ba8ab27aa5009c Mon Sep 17 00:00:00 2001
> From: Matt Arsenault <Matthew.Arsenault at amd.com>
> Date: Tue, 11 Mar 2014 20:15:47 -0700
> Subject: [PATCH 2/7] R600/SI: Don't display the GDS bit.
> 
> It isn't actually used now, and probably never will be, plus it makes
> tests less annoying. I also think SC prints GDS instructions as a
> separate instruction name.

My idea for this was to print 'L' if it was cleared and 'G' if it
was set and print it right before the instruction name, which would give you
LDS_READ_B32 and
GDS_READ_B32

However, this is not critical and could be a follow on patch.  I am OK
with not printing this field.

Patch 2: LGTM

Patch 3: LGTM

> From f74df43c9cf6377346d9ba0b8be84f64766d115c Mon Sep 17 00:00:00 2001
> From: Matt Arsenault <Matthew.Arsenault at amd.com>
> Date: Tue, 11 Mar 2014 20:31:23 -0700
> Subject: [PATCH 4/7] R600/SI: Match i16 immediate offset of LDS instructions.
> 
> ---
>  lib/Target/R600/SIInstrInfo.td                  | 33 ++++++++++++++
>  lib/Target/R600/SIInstructions.td               | 50 ++++++++++++---------
>  test/CodeGen/R600/32-bit-local-address-space.ll | 58 +++++++++++++++++++++++--
>  test/CodeGen/R600/address-space.ll              |  9 ++--
>  test/CodeGen/R600/gep-address-space.ll          | 12 ++++-
>  test/CodeGen/R600/local-memory-two-objects.ll   |  7 +--
>  6 files changed, 138 insertions(+), 31 deletions(-)
> 
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 8973f28..befe93c 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -55,6 +55,20 @@ def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
>  def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
>  def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
>  
> +// Transformation function, extract the lower 8 bits of a 16-bit
> +// immediate.
> +def LO8_16 : SDNodeXForm<imm, [{
> +  APInt V = N->getAPIntValue().trunc(8);
> +  return CurDAG->getTargetConstant(V, MVT::i8);
> +}]>;
> +
> +// Transformation function, extract the upper 8 bits of a 16-bit
> +// immediate.
> +def HI8_16 : SDNodeXForm<imm, [{
> +  APInt V = N->getAPIntValue().trunc(16).shr(8);
> +  return CurDAG->getTargetConstant(V, MVT::i8);
> +}]>;
> +
>  // Transformation function, extract the lower 32bit of a 64bit immediate
>  def LO32 : SDNodeXForm<imm, [{
>    return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
> @@ -95,14 +109,32 @@ def as_i16imm : SDNodeXForm<imm, [{
>    return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i16);
>  }]>;
>  
> +def as_i8imm_low16 : SDNodeXForm<imm, [{
> +  return CurDAG->getTargetConstant(N->getSExtValue() & 0x00ff,
> +                                   MVT::i16);
> +}]>;
> +
> +def as_i8imm_high16 : SDNodeXForm<imm, [{
> +  return CurDAG->getTargetConstant(N->getSExtValue() & 0xff00,
> +                                   MVT::i16);
> +}]>;
> +
>  def as_i32imm: SDNodeXForm<imm, [{
>    return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
>  }]>;
>  
> +def IMM8bit : PatLeaf <(imm),
> +  [{return isUInt<8>(N->getZExtValue());}]
> +>;
> +
>  def IMM12bit : PatLeaf <(imm),
>    [{return isUInt<12>(N->getZExtValue());}]
>  >;
> 

It looks like most of these new patterns aren't used anywhere.

> +def IMM16bit : PatLeaf <(imm),
> +  [{return isUInt<16>(N->getZExtValue());}]
> +>;
> +
>  def mubuf_vaddr_offset : PatFrag<
>    (ops node:$ptr, node:$offset, node:$imm_offset),
>    (add (add node:$ptr, node:$offset), node:$imm_offset)
> @@ -387,6 +419,7 @@ class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> :
>      DS <op, outs, ins, asm, pat> {
>    bits<16> offset;
>  
> +  // Single load interpret the 2 i8imm operands as a single i16 offset.
>    let offset0 = offset{7-0};
>    let offset1 = offset{15-8};
>  }
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 1124bd0..82acd38 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1898,29 +1898,39 @@ def : Pat <
>  /**********   Load/Store Patterns   **********/
>  /********** ======================= **********/
>  
> -class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
> -  (frag i32:$src0),
> -  (vt (inst 0, $src0, 0))
> ->;
> +multiclass DSReadPat <DS inst, ValueType vt, PatFrag frag> {
> +  def : Pat <
> +    (vt (frag (add i32:$ptr, (i32 IMM16bit:$offset)))),
> +    (inst (i1 0), $ptr, (as_i16imm $offset))
> +  >;
>  
> -def : DSReadPat <DS_READ_I8,  i32, sextloadi8_local>;
> -def : DSReadPat <DS_READ_U8,  i32, az_extloadi8_local>;
> -def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
> -def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
> -def : DSReadPat <DS_READ_B32, i32, local_load>;
> -def : Pat <
> -    (local_load i32:$src0),
> -    (i32 (DS_READ_B32 0, $src0, 0))
> ->;
> +  def : Pat <
> +    (frag i32:$src0),
> +    (vt (inst 0, $src0, 0))
> +  >;
> +}
>  
> -class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
> -  (frag i32:$src1, i32:$src0),
> -  (inst 0, $src0, $src1, 0)
> ->;
> +defm : DSReadPat <DS_READ_I8,  i32, sextloadi8_local>;
> +defm : DSReadPat <DS_READ_U8,  i32, az_extloadi8_local>;
> +defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
> +defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
> +defm : DSReadPat <DS_READ_B32, i32, local_load>;
> +
> +multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
> +  def : Pat <
> +    (frag vt:$value, (add i32:$ptr, (i32 IMM16bit:$offset))),
> +    (inst (i1 0), $ptr, $value, (as_i16imm $offset))
> +  >;
> +
> +  def : Pat <
> +    (frag i32:$src1, i32:$src0),
> +    (inst 0, $src0, $src1, 0)
> +  >;
> +}
>  
> -def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
> -def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
> -def : DSWritePat <DS_WRITE_B32, i32, local_store>;
> +defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
> +defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
> +defm : DSWritePat <DS_WRITE_B32, i32, local_store>;
>  
>  def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
>             (DS_ADD_U32_RTN 0, $ptr, $val, 0)>;
> diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll
> index 8c31e01..fffaefe 100644
> --- a/test/CodeGen/R600/32-bit-local-address-space.ll
> +++ b/test/CodeGen/R600/32-bit-local-address-space.ll
> @@ -11,7 +11,7 @@
>  
>  ; CHECK-LABEL: @local_address_load
>  ; CHECK: V_MOV_B32_e{{32|64}} [[PTR:v[0-9]]]
> -; CHECK: DS_READ_B32 [[PTR]]
> +; CHECK: DS_READ_B32 v{{[0-9]+}}, [[PTR]]
>  define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
>  entry:
>    %0 = load i32 addrspace(3)* %in
> @@ -32,9 +32,8 @@ entry:
>  }
>  
>  ; CHECK-LABEL: @local_address_gep_const_offset
> -; CHECK: S_ADD_I32 [[SPTR:s[0-9]]]
> -; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
> -; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VPTR]]
> +; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
> +; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VPTR]], 4,
>  define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
>  entry:
>    %0 = getelementptr i32 addrspace(3)* %in, i32 1
> @@ -43,6 +42,19 @@ entry:
>    ret void
>  }
>  
> +; Offset too large, can't fold into 16-bit immediate offset.
> +; CHECK-LABEL: @local_address_gep_large_const_offset
> +; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 65540
> +; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
> +; CHECK: DS_READ_B32 [[VPTR]]
> +define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
> +entry:
> +  %0 = getelementptr i32 addrspace(3)* %in, i32 16385
> +  %1 = load i32 addrspace(3)* %0
> +  store i32 %1, i32 addrspace(1)* %out
> +  ret void
> +}
> +
>  ; CHECK-LABEL: @null_32bit_lds_ptr:
>  ; CHECK: V_CMP_NE_I32
>  ; CHECK-NOT: V_CMP_NE_I32
> @@ -86,3 +98,41 @@ define void @global_ptr() nounwind {
>    store i32 addrspace(3)* getelementptr ([16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
>    ret void
>  }
> +
> +; CHECK-LABEL: @local_address_store
> +; CHECK: DS_WRITE_B32
> +define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
> +  store i32 %val, i32 addrspace(3)* %out
> +  ret void
> +}
> +
> +; CHECK-LABEL: @local_address_gep_store
> +; CHECK: S_ADD_I32 [[SADDR:s[0-9]+]],
> +; CHECK: V_MOV_B32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
> +; CHECK: DS_WRITE_B32 [[ADDR]], v{{[0-9]+}},
> +define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
> +  %gep = getelementptr i32 addrspace(3)* %out, i32 %offset
> +  store i32 %val, i32 addrspace(3)* %gep, align 4
> +  ret void
> +}
> +
> +; CHECK-LABEL: @local_address_gep_const_offset_store
> +; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
> +; CHECK: V_MOV_B32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
> +; CHECK: DS_WRITE_B32 [[VPTR]], [[VAL]], 4
> +define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
> +  %gep = getelementptr i32 addrspace(3)* %out, i32 1
> +  store i32 %val, i32 addrspace(3)* %gep, align 4
> +  ret void
> +}
> +
> +; Offset too large, can't fold into 16-bit immediate offset.
> +; CHECK-LABEL: @local_address_gep_large_const_offset_store
> +; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 65540
> +; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
> +; CHECK: DS_WRITE_B32 [[VPTR]], v{{[0-9]+}}, 0
> +define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
> +  %gep = getelementptr i32 addrspace(3)* %out, i32 16385
> +  store i32 %val, i32 addrspace(3)* %gep, align 4
> +  ret void
> +}
> diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll
> index 1fc616a..15d2ed2 100644
> --- a/test/CodeGen/R600/address-space.ll
> +++ b/test/CodeGen/R600/address-space.ll
> @@ -4,11 +4,14 @@
>  
>  %struct.foo = type { [3 x float], [3 x float] }
>  
> +; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is
> +; already in a VGPR after the first read.
> +
>  ; CHECK-LABEL: @do_as_ptr_calcs:
> -; CHECK: S_ADD_I32 {{s[0-9]+}},
> -; CHECK: S_ADD_I32 [[SREG1:s[0-9]+]],
> +; CHECK: S_LOAD_DWORD [[SREG1:s[0-9]+]],
>  ; CHECK: V_MOV_B32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
> -; CHECK: DS_READ_B32 [[VREG1]],
> +; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 20
> +; CHECK: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 12
>  define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
>  entry:
>    %x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
> diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll
> index 494b815..ee914fa 100644
> --- a/test/CodeGen/R600/gep-address-space.ll
> +++ b/test/CodeGen/R600/gep-address-space.ll
> @@ -2,12 +2,22 @@
>  
>  define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
>  ; CHECK-LABEL: @use_gep_address_space:
> -; CHECK: S_ADD_I32
> +; CHECK: V_MOV_B32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
> +; CHECK: DS_WRITE_B32 [[PTR]], v{{[0-9]+}}, 64
>    %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16
>    store i32 99, i32 addrspace(3)* %p
>    ret void
>  }
>  
> +define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
> +; CHECK-LABEL: @use_gep_address_space_large_offset:
> +; CHECK: S_ADD_I32
> +; CHECK: DS_WRITE_B32
> +  %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
> +  store i32 99, i32 addrspace(3)* %p
> +  ret void
> +}
> +
>  define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
>  ; CHECK-LABEL: @gep_as_vector_v4:
>  ; CHECK: S_ADD_I32
> diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
> index fdb2a5d..616000d 100644
> --- a/test/CodeGen/R600/local-memory-two-objects.ll
> +++ b/test/CodeGen/R600/local-memory-two-objects.ll
> @@ -24,11 +24,12 @@
>  ; EG-CHECK: GROUP_BARRIER
>  ; EG-CHECK-NEXT: ALU clause
>  
> -; Make sure the lds reads are using different addresses.
> +; Make sure the lds reads are using different addresses, at different
> +; constant offsets.
>  ; EG-CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
>  ; EG-CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
> -; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]]
> -; SI-CHECK-NOT: DS_READ_B32 {{v[0-9]+}}, [[ADDRR]]
> +; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]], 16
> +; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR]], 0,
>  
>  define void @local_memory_two_objects(i32 addrspace(1)* %out) {
>  entry:
> -- 
> 1.8.4.3
> 

Patch 5: LGTM

Patch 6: LGTM

Patch 7: LGTM