[Libclc-dev] [PATCH] libclc: Revert generic vload/vstore to pure CLC and move asm to R600

Mon Jul 8 10:38:37 PDT 2013

On Mon, Jul 01, 2013 at 06:06:39PM -0500, Aaron Watry wrote:
> The assembly optimizations were making unsafe assumptions about which address
> spaces had which identifiers.
> 
> Also, fix vload/vstore with 64-bit pointers. This was broken previously on
> Radeon SI.
> 
> Signed-off-by: Aaron Watry <awatry at gmail.com>
> ---
>  generic/lib/SOURCES               |   4 --
>  generic/lib/shared/vload.cl       |  54 +------------------
>  generic/lib/shared/vload_if.ll    |  60 ---------------------
>  generic/lib/shared/vload_impl.ll  |  49 -----------------
>  generic/lib/shared/vstore.cl      |  58 +-------------------
>  generic/lib/shared/vstore_if.ll   |  59 ---------------------
>  generic/lib/shared/vstore_impl.ll |  50 ------------------
>  r600/lib/SOURCES                  |   6 +++
>  r600/lib/shared/vload.cl          |  99 ++++++++++++++++++++++++++++++++++
>  r600/lib/shared/vload_if.ll       |  60 +++++++++++++++++++++
>  r600/lib/shared/vload_impl.ll     |  44 ++++++++++++++++
>  r600/lib/shared/vstore.cl         | 108 ++++++++++++++++++++++++++++++++++++++
>  r600/lib/shared/vstore_if.ll      |  59 +++++++++++++++++++++
>  r600/lib/shared/vstore_impl.ll    |  45 ++++++++++++++++
>  14 files changed, 425 insertions(+), 330 deletions(-)
>  delete mode 100644 generic/lib/shared/vload_if.ll
>  delete mode 100644 generic/lib/shared/vload_impl.ll
>  delete mode 100644 generic/lib/shared/vstore_if.ll
>  delete mode 100644 generic/lib/shared/vstore_impl.ll
>  create mode 100644 r600/lib/shared/vload.cl
>  create mode 100644 r600/lib/shared/vload_if.ll
>  create mode 100644 r600/lib/shared/vload_impl.ll
>  create mode 100644 r600/lib/shared/vstore.cl
>  create mode 100644 r600/lib/shared/vstore_if.ll
>  create mode 100644 r600/lib/shared/vstore_impl.ll
> 
> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> index 8cda14a..50cc9bd 100644
> --- a/generic/lib/SOURCES
> +++ b/generic/lib/SOURCES
> @@ -24,10 +24,6 @@ shared/clamp.cl
>  shared/max.cl
>  shared/min.cl
>  shared/vload.cl
> -shared/vload_if.ll
> -shared/vload_impl.ll
>  shared/vstore.cl
> -shared/vstore_if.ll
> -shared/vstore_impl.ll
>  workitem/get_global_id.cl
>  workitem/get_global_size.cl
> diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
> index f6ebd37..e8439e7 100644
> --- a/generic/lib/shared/vload.cl
> +++ b/generic/lib/shared/vload.cl
> @@ -27,12 +27,13 @@
>      VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
>      VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
>  
> -//int/uint are special... see below
>  #define VLOAD_TYPES() \
>      VLOAD_ADDR_SPACES(char) \
>      VLOAD_ADDR_SPACES(uchar) \
>      VLOAD_ADDR_SPACES(short) \
>      VLOAD_ADDR_SPACES(ushort) \
> +    VLOAD_ADDR_SPACES(int) \
> +    VLOAD_ADDR_SPACES(uint) \
>      VLOAD_ADDR_SPACES(long) \
>      VLOAD_ADDR_SPACES(ulong) \
>      VLOAD_ADDR_SPACES(float) \
> @@ -43,54 +44,3 @@ VLOAD_TYPES()
>  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
>      VLOAD_ADDR_SPACES(double)
>  #endif
> -
> -VLOAD_VECTORIZE(int, __private)
> -VLOAD_VECTORIZE(int, __local)
> -VLOAD_VECTORIZE(int, __constant)
> -VLOAD_VECTORIZE(uint, __private)
> -VLOAD_VECTORIZE(uint, __local)
> -VLOAD_VECTORIZE(uint, __constant)
> -
> -_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
> -  return (int2)(x[offset] , x[offset+1]);
> -}
> -_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
> -  return (int3)(vload2(offset, x), x[offset+2]);
> -}
> -_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
> -  return (uint2)(x[offset] , x[offset+1]);
> -}
> -_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
> -  return (uint3)(vload2(offset, x), x[offset+2]);
> -}
> -        
> -/*Note: It is known that R600 doesn't support load <2 x ?> and <3 x ?>... so
> - * they aren't actually overridden here
> - */
> -_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
> -_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
> -_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
> -
> -_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
> -  return __clc_vload4_int__global(offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
> -  return __clc_vload8_int__global(offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
> -  return __clc_vload16_int__global(offset, x);
> -}
> -
> -_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
> -_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
> -_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
> -
> -_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
> -  return __clc_vload4_uint__global(offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
> -  return __clc_vload8_uint__global(offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
> -  return __clc_vload16_uint__global(offset, x);
> -}
> \ No newline at end of file
> diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll
> deleted file mode 100644
> index 2634d37..0000000
> --- a/generic/lib/shared/vload_if.ll
> +++ /dev/null
> @@ -1,60 +0,0 @@
> -;Start int global vload
> -
> -declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
> -declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
> -declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
> -declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
> -declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
> -
> -define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> -  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
> -  ret <2 x i32> %call
> -}
> -
> -define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> -  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
> -  ret <3 x i32> %call
> -}
> -
> -define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> -  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
> -  ret <4 x i32> %call
> -}
> -
> -define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> -  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
> -  ret <8 x i32> %call
> -}
> -
> -define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> -  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
> -  ret <16 x i32> %call
> -}
> -
> -
> -;Start uint global vload
> -
> -define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> -  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
> -  ret <2 x i32> %call
> -}
> -
> -define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> -  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
> -  ret <3 x i32> %call
> -}
> -
> -define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> -  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
> -  ret <4 x i32> %call
> -}
> -
> -define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> -  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
> -  ret <8 x i32> %call
> -}
> -
> -define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> -  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
> -  ret <16 x i32> %call
> -}
> diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll
> deleted file mode 100644
> index ae719e0..0000000
> --- a/generic/lib/shared/vload_impl.ll
> +++ /dev/null
> @@ -1,49 +0,0 @@
> -; This provides optimized implementations of vload4/8/16 for 32-bit int/uint
> -
> -define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> -  %2 = add i32 %1, %offset
> -  %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
> -  %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
> -  ret <2 x i32> %4
> -}
> -
> -define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> -  %2 = add i32 %1, %offset
> -  %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
> -  %4 = load <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
> -  ret <3 x i32> %4
> -}
> -
> -define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> -  %2 = add i32 %1, %offset
> -  %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
> -  %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
> -  ret <4 x i32> %4
> -}
> -
> -define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> -  %2 = add i32 %1, %offset
> -  %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
> -  %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
> -  ret <8 x i32> %4
> -}
> -
> -define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> -  %2 = add i32 %1, %offset
> -  %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
> -  %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
> -  ret <16 x i32> %4
> -}
> -
> -!1 = metadata !{metadata !"char", metadata !5}
> -!2 = metadata !{metadata !"short", metadata !5}
> -!3 = metadata !{metadata !"int", metadata !5}
> -!4 = metadata !{metadata !"long", metadata !5}
> -!5 = metadata !{metadata !"omnipotent char", metadata !6}
> -!6 = metadata !{metadata !"Simple C/C++ TBAA"}
> -
> diff --git a/generic/lib/shared/vstore.cl b/generic/lib/shared/vstore.cl
> index 5b84f47..38f4d37 100644
> --- a/generic/lib/shared/vstore.cl
> +++ b/generic/lib/shared/vstore.cl
> @@ -34,12 +34,13 @@
>      VSTORE_VECTORIZE(SCALAR_GENTYPE, __local) \
>      VSTORE_VECTORIZE(SCALAR_GENTYPE, __global) \
>  
> -//int/uint are special... see below
>  #define VSTORE_TYPES() \
>      VSTORE_ADDR_SPACES(char) \
>      VSTORE_ADDR_SPACES(uchar) \
>      VSTORE_ADDR_SPACES(short) \
>      VSTORE_ADDR_SPACES(ushort) \
> +    VSTORE_ADDR_SPACES(int) \
> +    VSTORE_ADDR_SPACES(uint) \
>      VSTORE_ADDR_SPACES(long) \
>      VSTORE_ADDR_SPACES(ulong) \
>      VSTORE_ADDR_SPACES(float) \
> @@ -50,58 +51,3 @@ VSTORE_TYPES()
>  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
>      VSTORE_ADDR_SPACES(double)
>  #endif
> -
> -VSTORE_VECTORIZE(int, __private)
> -VSTORE_VECTORIZE(int, __local)
> -VSTORE_VECTORIZE(uint, __private)
> -VSTORE_VECTORIZE(uint, __local)
> -
> -_CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset, global int *mem) {
> -    mem[offset] = vec.s0;
> -    mem[offset+1] = vec.s1;
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) {
> -    mem[offset] = vec.s0;
> -    mem[offset+1] = vec.s1;
> -    mem[offset+2] = vec.s2;
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore2(uint2 vec, size_t offset, global uint *mem) {
> -    mem[offset] = vec.s0;
> -    mem[offset+1] = vec.s1;
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) {
> -    mem[offset] = vec.s0;
> -    mem[offset+1] = vec.s1;
> -    mem[offset+2] = vec.s2;
> -}
> -
> -/*Note: R600 probably doesn't support store <2 x ?> and <3 x ?>... so
> - * they aren't actually overridden here... lowest-common-denominator
> - */
> -_CLC_DECL void __clc_vstore4_int__global(int4 vec, size_t offset, __global int *);
> -_CLC_DECL void __clc_vstore8_int__global(int8 vec, size_t offset, __global int *);
> -_CLC_DECL void __clc_vstore16_int__global(int16 vec, size_t offset, __global int *);
> -
> -_CLC_OVERLOAD _CLC_DEF void vstore4(int4 vec, size_t offset, global int *x) {
> -    __clc_vstore4_int__global(vec, offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore8(int8 vec, size_t offset, global int *x) {
> -    __clc_vstore8_int__global(vec, offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore16(int16 vec, size_t offset, global int *x) {
> -    __clc_vstore16_int__global(vec, offset, x);
> -}
> -
> -_CLC_DECL void __clc_vstore4_uint__global(uint4 vec, size_t offset, __global uint *);
> -_CLC_DECL void __clc_vstore8_uint__global(uint8 vec, size_t offset, __global uint *);
> -_CLC_DECL void __clc_vstore16_uint__global(uint16 vec, size_t offset, __global uint *);
> -
> -_CLC_OVERLOAD _CLC_DEF void vstore4(uint4 vec, size_t offset, global uint *x) {
> -    __clc_vstore4_uint__global(vec, offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore8(uint8 vec, size_t offset, global uint *x) {
> -    __clc_vstore8_uint__global(vec, offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore16(uint16 vec, size_t offset, global uint *x) {
> -    __clc_vstore16_uint__global(vec, offset, x);
> -}
> diff --git a/generic/lib/shared/vstore_if.ll b/generic/lib/shared/vstore_if.ll
> deleted file mode 100644
> index 30eb552..0000000
> --- a/generic/lib/shared/vstore_if.ll
> +++ /dev/null
> @@ -1,59 +0,0 @@
> -;Start int global vstore
> -
> -declare void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
> -declare void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
> -declare void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
> -declare void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
> -declare void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
> -
> -define void @__clc_vstore2_int__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> -  call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
> -  ret void
> -}
> -
> -define void @__clc_vstore3_int__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> -  call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
> -  ret void
> -}
> -
> -define void @__clc_vstore4_int__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> -  call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
> -  ret void
> -}
> -
> -define void @__clc_vstore8_int__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> -  call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
> -  ret void
> -}
> -
> -define void @__clc_vstore16_int__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> -  call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
> -  ret void
> -}
> -
> -
> -;Start uint global vstore
> -define void @__clc_vstore2_uint__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> -  call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
> -  ret void
> -}
> -
> -define void @__clc_vstore3_uint__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> -  call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
> -  ret void
> -}
> -
> -define void @__clc_vstore4_uint__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> -  call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
> -  ret void
> -}
> -
> -define void @__clc_vstore8_uint__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> -  call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
> -  ret void
> -}
> -
> -define void @__clc_vstore16_uint__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> -  call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
> -  ret void
> -}
> \ No newline at end of file
> diff --git a/generic/lib/shared/vstore_impl.ll b/generic/lib/shared/vstore_impl.ll
> deleted file mode 100644
> index 3baab5e..0000000
> --- a/generic/lib/shared/vstore_impl.ll
> +++ /dev/null
> @@ -1,50 +0,0 @@
> -; This provides optimized implementations of vstore4/8/16 for 32-bit int/uint
> -
> -define void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> -  %2 = add i32 %1, %offset
> -  %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
> -  store <2 x i32> %vec, <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
> -  ret void
> -}
> -
> -define void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> -  %2 = add i32 %1, %offset
> -  %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
> -  store <3 x i32> %vec, <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
> -  ret void
> -}
> -
> -define void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> -  %2 = add i32 %1, %offset
> -  %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
> -  store <4 x i32> %vec, <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
> -  ret void
> -}
> -
> -define void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> -  %2 = add i32 %1, %offset
> -  %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
> -  store <8 x i32> %vec, <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
> -  ret void
> -}
> -
> -define void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> -  %1 = ptrtoint i32 addrspace(1)* %addr to i32
> -  %2 = add i32 %1, %offset
> -  %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
> -  store <16 x i32> %vec, <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
> -  ret void
> -}
> -
> -
> -!1 = metadata !{metadata !"char", metadata !5}
> -!2 = metadata !{metadata !"short", metadata !5}
> -!3 = metadata !{metadata !"int", metadata !5}
> -!4 = metadata !{metadata !"long", metadata !5}
> -!5 = metadata !{metadata !"omnipotent char", metadata !6}
> -!6 = metadata !{metadata !"Simple C/C++ TBAA"}
> -
> diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
> index af8c8c8..0637632 100644
> --- a/r600/lib/SOURCES
> +++ b/r600/lib/SOURCES
> @@ -2,3 +2,9 @@ workitem/get_group_id.ll
>  workitem/get_local_size.ll
>  workitem/get_local_id.ll
>  workitem/get_global_size.ll
> +shared/vload.cl
> +shared/vload_if.ll
> +shared/vload_impl.ll
> +shared/vstore.cl
> +shared/vstore_if.ll
> +shared/vstore_impl.ll
> diff --git a/r600/lib/shared/vload.cl b/r600/lib/shared/vload.cl
> new file mode 100644
> index 0000000..dd28347
> --- /dev/null
> +++ b/r600/lib/shared/vload.cl
> @@ -0,0 +1,99 @@
> +#include <clc/clc.h>
> +
> +#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
> +  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
> +    return (PRIM_TYPE##2)(x[offset] , x[offset+1]); \
> +  } \
> +\
> +  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
> +    return (PRIM_TYPE##3)(x[offset] , x[offset+1], x[offset+2]); \
> +  } \
> +\
> +  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
> +    return (PRIM_TYPE##4)(x[offset], x[offset+1], x[offset+2], x[offset+3]); \
> +  } \
> +\
> +  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
> +    return (PRIM_TYPE##8)(vload4(offset, x), vload4(offset+4, x)); \
> +  } \
> +\
> +  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
> +    return (PRIM_TYPE##16)(vload8(offset, x), vload8(offset+8, x)); \
> +  } \
> +
> +#define VLOAD_ADDR_SPACES(SCALAR_GENTYPE) \
> +    VLOAD_VECTORIZE(SCALAR_GENTYPE, __private) \
> +    VLOAD_VECTORIZE(SCALAR_GENTYPE, __local) \
> +    VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
> +    VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
> +
> +//int/uint are special... see below
> +#define VLOAD_TYPES() \
> +    VLOAD_ADDR_SPACES(char) \
> +    VLOAD_ADDR_SPACES(uchar) \
> +    VLOAD_ADDR_SPACES(short) \
> +    VLOAD_ADDR_SPACES(ushort) \
> +    VLOAD_ADDR_SPACES(long) \
> +    VLOAD_ADDR_SPACES(ulong) \
> +    VLOAD_ADDR_SPACES(float) \
> +
> +VLOAD_TYPES()
> +
> +#ifdef cl_khr_fp64
> +#pragma OPENCL EXTENSION cl_khr_fp64 : enable
> +    VLOAD_ADDR_SPACES(double)
> +#endif
> +
> +VLOAD_VECTORIZE(int, __private)
> +VLOAD_VECTORIZE(int, __local)
> +VLOAD_VECTORIZE(int, __constant)
> +VLOAD_VECTORIZE(uint, __private)
> +VLOAD_VECTORIZE(uint, __local)
> +VLOAD_VECTORIZE(uint, __constant)
> +
> +_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
> +  return (int3)(vload2(offset, x), x[offset+2]);
> +}
> +_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
> +  return (uint3)(vload2(offset, x), x[offset+2]);
> +}
> +        
> +/*Note: R600 doesn't support load <3 x ?>... so
> + * those functions aren't actually overridden here
> + */
> +_CLC_DECL int2 __clc_vload2_int__global(size_t offset, const __global int *);
> +_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
> +_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
> +_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
> +
> +_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
> +  return __clc_vload2_int__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
> +  return __clc_vload4_int__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
> +  return __clc_vload8_int__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
> +  return __clc_vload16_int__global(offset, x);
> +}
> +
> +_CLC_DECL uint2 __clc_vload2_uint__global(size_t offset, const __global uint *);
> +_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
> +_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
> +_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
> +
> +_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
> +  return __clc_vload2_uint__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
> +  return __clc_vload4_uint__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
> +  return __clc_vload8_uint__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
> +  return __clc_vload16_uint__global(offset, x);
> +}
> +
> diff --git a/r600/lib/shared/vload_if.ll b/r600/lib/shared/vload_if.ll
> new file mode 100644
> index 0000000..bda592b
> --- /dev/null
> +++ b/r600/lib/shared/vload_if.ll

What is the purpose of the vload_if.ll file?  All it does is wrap calls to
the functions in vload_impl.ll, and I'm not sure why that extra layer is needed.

> @@ -0,0 +1,60 @@
> +;Start int global vload
> +
> +declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +
> +define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret <2 x i32> %call
> +}
> +
> +define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret <3 x i32> %call
> +}
> +
> +define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret <4 x i32> %call
> +}
> +
> +define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret <8 x i32> %call
> +}
> +
> +define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret <16 x i32> %call
> +}
> +
> +
> +;Start uint global vload
> +
> +define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret <2 x i32> %call
> +}
> +
> +define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret <3 x i32> %call
> +}
> +
> +define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret <4 x i32> %call
> +}
> +
> +define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret <8 x i32> %call
> +}
> +
> +define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret <16 x i32> %call
> +}
> diff --git a/r600/lib/shared/vload_impl.ll b/r600/lib/shared/vload_impl.ll
> new file mode 100644
> index 0000000..1333aac
> --- /dev/null
> +++ b/r600/lib/shared/vload_impl.ll
> @@ -0,0 +1,44 @@
> +; This provides optimized implementations of vload4/8/16 for 32-bit int/uint
> +
> +define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> +  %2 = bitcast i32 addrspace(1)* %1 to <2 x i32> addrspace(1)*
> +  %3 = load <2 x i32> addrspace(1)* %2, align 4, !tbaa !3
> +  ret <2 x i32> %3
> +}
> +
> +define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> +  %2 = bitcast i32 addrspace(1)* %1 to <3 x i32> addrspace(1)*
> +  %3 = load <3 x i32> addrspace(1)* %2, align 4, !tbaa !3
> +  ret <3 x i32> %3
> +}
> +
> +define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> +  %2 = bitcast i32 addrspace(1)* %1 to <4 x i32> addrspace(1)*
> +  %3 = load <4 x i32> addrspace(1)* %2, align 4, !tbaa !3
> +  ret <4 x i32> %3
> +}
> +
> +define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> +  %2 = bitcast i32 addrspace(1)* %1 to <8 x i32> addrspace(1)*
> +  %3 = load <8 x i32> addrspace(1)* %2, align 4, !tbaa !3
> +  ret <8 x i32> %3
> +}
> +
> +define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> +  %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> +  %2 = bitcast i32 addrspace(1)* %1 to <16 x i32> addrspace(1)*
> +  %3 = load <16 x i32> addrspace(1)* %2, align 4, !tbaa !3
> +  ret <16 x i32> %3
> +}
> +
> +!1 = metadata !{metadata !"char", metadata !5}
> +!2 = metadata !{metadata !"short", metadata !5}
> +!3 = metadata !{metadata !"int", metadata !5}
> +!4 = metadata !{metadata !"long", metadata !5}
> +!5 = metadata !{metadata !"omnipotent char", metadata !6}
> +!6 = metadata !{metadata !"Simple C/C++ TBAA"}
> +
> diff --git a/r600/lib/shared/vstore.cl b/r600/lib/shared/vstore.cl
> new file mode 100644
> index 0000000..c8b8cd5
> --- /dev/null
> +++ b/r600/lib/shared/vstore.cl
> @@ -0,0 +1,108 @@
> +#include <clc/clc.h>
> +
> +#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
> +
> +#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
> +  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
> +    mem[offset] = vec.s0; \
> +    mem[offset+1] = vec.s1; \
> +  } \
> +\
> +  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
> +    mem[offset] = vec.s0; \
> +    mem[offset+1] = vec.s1; \
> +    mem[offset+2] = vec.s2; \
> +  } \
> +\
> +  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
> +    vstore2(vec.lo, offset, mem); \
> +    vstore2(vec.hi, offset+2, mem); \
> +  } \
> +\
> +  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
> +    vstore4(vec.lo, offset, mem); \
> +    vstore4(vec.hi, offset+4, mem); \
> +  } \
> +\
> +  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
> +    vstore8(vec.lo, offset, mem); \
> +    vstore8(vec.hi, offset+8, mem); \
> +  } \
> +
> +#define VSTORE_ADDR_SPACES(SCALAR_GENTYPE) \
> +    VSTORE_VECTORIZE(SCALAR_GENTYPE, __private) \
> +    VSTORE_VECTORIZE(SCALAR_GENTYPE, __local) \
> +    VSTORE_VECTORIZE(SCALAR_GENTYPE, __global) \
> +
> +//int/uint are special... see below
> +#define VSTORE_TYPES() \
> +    VSTORE_ADDR_SPACES(char) \
> +    VSTORE_ADDR_SPACES(uchar) \
> +    VSTORE_ADDR_SPACES(short) \
> +    VSTORE_ADDR_SPACES(ushort) \
> +    VSTORE_ADDR_SPACES(long) \
> +    VSTORE_ADDR_SPACES(ulong) \
> +    VSTORE_ADDR_SPACES(float) \
> +
> +VSTORE_TYPES()
> +
> +#ifdef cl_khr_fp64
> +#pragma OPENCL EXTENSION cl_khr_fp64 : enable
> +    VSTORE_ADDR_SPACES(double)
> +#endif
> +
> +VSTORE_VECTORIZE(int, __private)
> +VSTORE_VECTORIZE(int, __local)
> +VSTORE_VECTORIZE(uint, __private)
> +VSTORE_VECTORIZE(uint, __local)
> +
> +_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) {
> +    mem[offset] = vec.s0;
> +    mem[offset+1] = vec.s1;
> +    mem[offset+2] = vec.s2;
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) {
> +    mem[offset] = vec.s0;
> +    mem[offset+1] = vec.s1;
> +    mem[offset+2] = vec.s2;
> +}
> +
> +/*Note: R600 doesn't support store <3 x ?>... so
> + * those functions aren't actually overridden here... lowest-common-denominator
> + */
> +_CLC_DECL void __clc_vstore2_int__global(int2 vec, size_t offset, __global int *);
> +_CLC_DECL void __clc_vstore4_int__global(int4 vec, size_t offset, __global int *);
> +_CLC_DECL void __clc_vstore8_int__global(int8 vec, size_t offset, __global int *);
> +_CLC_DECL void __clc_vstore16_int__global(int16 vec, size_t offset, __global int *);
> +
> +_CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset, global int *x) {
> +    __clc_vstore2_int__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore4(int4 vec, size_t offset, global int *x) {
> +    __clc_vstore4_int__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore8(int8 vec, size_t offset, global int *x) {
> +    __clc_vstore8_int__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore16(int16 vec, size_t offset, global int *x) {
> +    __clc_vstore16_int__global(vec, offset, x);
> +}
> +
> +_CLC_DECL void __clc_vstore2_uint__global(uint2 vec, size_t offset, __global uint *);
> +_CLC_DECL void __clc_vstore4_uint__global(uint4 vec, size_t offset, __global uint *);
> +_CLC_DECL void __clc_vstore8_uint__global(uint8 vec, size_t offset, __global uint *);
> +_CLC_DECL void __clc_vstore16_uint__global(uint16 vec, size_t offset, __global uint *);
> +
> +_CLC_OVERLOAD _CLC_DEF void vstore2(uint2 vec, size_t offset, global uint *x) {
> +    __clc_vstore2_uint__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore4(uint4 vec, size_t offset, global uint *x) {
> +    __clc_vstore4_uint__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore8(uint8 vec, size_t offset, global uint *x) {
> +    __clc_vstore8_uint__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore16(uint16 vec, size_t offset, global uint *x) {
> +    __clc_vstore16_uint__global(vec, offset, x);
> +}
> +
> diff --git a/r600/lib/shared/vstore_if.ll b/r600/lib/shared/vstore_if.ll
> new file mode 100644
> index 0000000..382a8a8
> --- /dev/null
> +++ b/r600/lib/shared/vstore_if.ll
> @@ -0,0 +1,59 @@
> +;Start int global vstore
> +
> +declare void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +
> +define void @__clc_vstore2_int__global(<2 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret void
> +}
> +
> +define void @__clc_vstore3_int__global(<3 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret void
> +}
> +
> +define void @__clc_vstore4_int__global(<4 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret void
> +}
> +
> +define void @__clc_vstore8_int__global(<8 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret void
> +}
> +
> +define void @__clc_vstore16_int__global(<16 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret void
> +}
> +
> +
> +;Start uint global vstore
> +define void @__clc_vstore2_uint__global(<2 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret void
> +}
> +
> +define void @__clc_vstore3_uint__global(<3 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret void
> +}
> +
> +define void @__clc_vstore4_uint__global(<4 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret void
> +}
> +
> +define void @__clc_vstore8_uint__global(<8 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret void
> +}
> +
> +define void @__clc_vstore16_uint__global(<16 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +  ret void
> +}
> \ No newline at end of file
> diff --git a/r600/lib/shared/vstore_impl.ll b/r600/lib/shared/vstore_impl.ll
> new file mode 100644
> index 0000000..8790a8f
> --- /dev/null
> +++ b/r600/lib/shared/vstore_impl.ll
> @@ -0,0 +1,45 @@
> +; This provides optimized implementations of vstore2/3/4/8/16 for 32-bit int/uint in the global address space
> +
> +define void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> +  %2 = bitcast i32 addrspace(1)* %1 to <2 x i32> addrspace(1)*
> +  store <2 x i32> %vec, <2 x i32> addrspace(1)* %2, align 4, !tbaa !3
> +  ret void
> +}
> +
> +define void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> +  %2 = bitcast i32 addrspace(1)* %1 to <3 x i32> addrspace(1)*
> +  store <3 x i32> %vec, <3 x i32> addrspace(1)* %2, align 4, !tbaa !3
> +  ret void
> +}
> +
> +define void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> +  %2 = bitcast i32 addrspace(1)* %1 to <4 x i32> addrspace(1)*
> +  store <4 x i32> %vec, <4 x i32> addrspace(1)* %2, align 4, !tbaa !3
> +  ret void
> +}
> +
> +define void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> +  %2 = bitcast i32 addrspace(1)* %1 to <8 x i32> addrspace(1)*
> +  store <8 x i32> %vec, <8 x i32> addrspace(1)* %2, align 4, !tbaa !3
> +  ret void
> +}
> +
> +define void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %offset,  i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> +  %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> +  %2 = bitcast i32 addrspace(1)* %1 to <16 x i32> addrspace(1)*
> +  store <16 x i32> %vec, <16 x i32> addrspace(1)* %2, align 4, !tbaa !3
> +  ret void
> +}
> +
> +
> +!1 = metadata !{metadata !"char", metadata !5}
> +!2 = metadata !{metadata !"short", metadata !5}
> +!3 = metadata !{metadata !"int", metadata !5}
> +!4 = metadata !{metadata !"long", metadata !5}
> +!5 = metadata !{metadata !"omnipotent char", metadata !6}
> +!6 = metadata !{metadata !"Simple C/C++ TBAA"}
> +
> -- 
> 1.8.1.2
> 
> 
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at pcc.me.uk
> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev