[Libclc-dev] [PATCH] libclc: Revert generic vload/vstore to pure CLC and move asm to R600
Tom Stellard
tom at stellard.net
Mon Jul 8 10:38:37 PDT 2013
On Mon, Jul 01, 2013 at 06:06:39PM -0500, Aaron Watry wrote:
> The assembly optimizations were making unsafe assumptions about which address
> spaces had which identifiers.
>
> Also, fix vload/vstore with 64-bit pointers. This was broken previously on
> Radeon SI.
>
> Signed-off-by: Aaron Watry <awatry at gmail.com>
> ---
> generic/lib/SOURCES | 4 --
> generic/lib/shared/vload.cl | 54 +------------------
> generic/lib/shared/vload_if.ll | 60 ---------------------
> generic/lib/shared/vload_impl.ll | 49 -----------------
> generic/lib/shared/vstore.cl | 58 +-------------------
> generic/lib/shared/vstore_if.ll | 59 ---------------------
> generic/lib/shared/vstore_impl.ll | 50 ------------------
> r600/lib/SOURCES | 6 +++
> r600/lib/shared/vload.cl | 99 ++++++++++++++++++++++++++++++++++
> r600/lib/shared/vload_if.ll | 60 +++++++++++++++++++++
> r600/lib/shared/vload_impl.ll | 44 ++++++++++++++++
> r600/lib/shared/vstore.cl | 108 ++++++++++++++++++++++++++++++++++++++
> r600/lib/shared/vstore_if.ll | 59 +++++++++++++++++++++
> r600/lib/shared/vstore_impl.ll | 45 ++++++++++++++++
> 14 files changed, 425 insertions(+), 330 deletions(-)
> delete mode 100644 generic/lib/shared/vload_if.ll
> delete mode 100644 generic/lib/shared/vload_impl.ll
> delete mode 100644 generic/lib/shared/vstore_if.ll
> delete mode 100644 generic/lib/shared/vstore_impl.ll
> create mode 100644 r600/lib/shared/vload.cl
> create mode 100644 r600/lib/shared/vload_if.ll
> create mode 100644 r600/lib/shared/vload_impl.ll
> create mode 100644 r600/lib/shared/vstore.cl
> create mode 100644 r600/lib/shared/vstore_if.ll
> create mode 100644 r600/lib/shared/vstore_impl.ll
>
> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
> index 8cda14a..50cc9bd 100644
> --- a/generic/lib/SOURCES
> +++ b/generic/lib/SOURCES
> @@ -24,10 +24,6 @@ shared/clamp.cl
> shared/max.cl
> shared/min.cl
> shared/vload.cl
> -shared/vload_if.ll
> -shared/vload_impl.ll
> shared/vstore.cl
> -shared/vstore_if.ll
> -shared/vstore_impl.ll
> workitem/get_global_id.cl
> workitem/get_global_size.cl
> diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl
> index f6ebd37..e8439e7 100644
> --- a/generic/lib/shared/vload.cl
> +++ b/generic/lib/shared/vload.cl
> @@ -27,12 +27,13 @@
> VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
> VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
>
> -//int/uint are special... see below
> #define VLOAD_TYPES() \
> VLOAD_ADDR_SPACES(char) \
> VLOAD_ADDR_SPACES(uchar) \
> VLOAD_ADDR_SPACES(short) \
> VLOAD_ADDR_SPACES(ushort) \
> + VLOAD_ADDR_SPACES(int) \
> + VLOAD_ADDR_SPACES(uint) \
> VLOAD_ADDR_SPACES(long) \
> VLOAD_ADDR_SPACES(ulong) \
> VLOAD_ADDR_SPACES(float) \
> @@ -43,54 +44,3 @@ VLOAD_TYPES()
> #pragma OPENCL EXTENSION cl_khr_fp64 : enable
> VLOAD_ADDR_SPACES(double)
> #endif
> -
> -VLOAD_VECTORIZE(int, __private)
> -VLOAD_VECTORIZE(int, __local)
> -VLOAD_VECTORIZE(int, __constant)
> -VLOAD_VECTORIZE(uint, __private)
> -VLOAD_VECTORIZE(uint, __local)
> -VLOAD_VECTORIZE(uint, __constant)
> -
> -_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
> - return (int2)(x[offset] , x[offset+1]);
> -}
> -_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
> - return (int3)(vload2(offset, x), x[offset+2]);
> -}
> -_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
> - return (uint2)(x[offset] , x[offset+1]);
> -}
> -_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
> - return (uint3)(vload2(offset, x), x[offset+2]);
> -}
> -
> -/*Note: It is known that R600 doesn't support load <2 x ?> and <3 x ?>... so
> - * they aren't actually overridden here
> - */
> -_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
> -_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
> -_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
> -
> -_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
> - return __clc_vload4_int__global(offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
> - return __clc_vload8_int__global(offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
> - return __clc_vload16_int__global(offset, x);
> -}
> -
> -_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
> -_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
> -_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
> -
> -_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
> - return __clc_vload4_uint__global(offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
> - return __clc_vload8_uint__global(offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
> - return __clc_vload16_uint__global(offset, x);
> -}
> \ No newline at end of file
> diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll
> deleted file mode 100644
> index 2634d37..0000000
> --- a/generic/lib/shared/vload_if.ll
> +++ /dev/null
> @@ -1,60 +0,0 @@
> -;Start int global vload
> -
> -declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
> -declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
> -declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
> -declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
> -declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
> -
> -define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> - %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
> - ret <2 x i32> %call
> -}
> -
> -define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> - %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
> - ret <3 x i32> %call
> -}
> -
> -define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> - %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
> - ret <4 x i32> %call
> -}
> -
> -define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> - %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
> - ret <8 x i32> %call
> -}
> -
> -define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> - %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
> - ret <16 x i32> %call
> -}
> -
> -
> -;Start uint global vload
> -
> -define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> - %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y)
> - ret <2 x i32> %call
> -}
> -
> -define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> - %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y)
> - ret <3 x i32> %call
> -}
> -
> -define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> - %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y)
> - ret <4 x i32> %call
> -}
> -
> -define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> - %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y)
> - ret <8 x i32> %call
> -}
> -
> -define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline {
> - %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y)
> - ret <16 x i32> %call
> -}
> diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll
> deleted file mode 100644
> index ae719e0..0000000
> --- a/generic/lib/shared/vload_impl.ll
> +++ /dev/null
> @@ -1,49 +0,0 @@
> -; This provides optimized implementations of vload4/8/16 for 32-bit int/uint
> -
> -define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> - %1 = ptrtoint i32 addrspace(1)* %addr to i32
> - %2 = add i32 %1, %offset
> - %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
> - %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
> - ret <2 x i32> %4
> -}
> -
> -define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> - %1 = ptrtoint i32 addrspace(1)* %addr to i32
> - %2 = add i32 %1, %offset
> - %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
> - %4 = load <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
> - ret <3 x i32> %4
> -}
> -
> -define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> - %1 = ptrtoint i32 addrspace(1)* %addr to i32
> - %2 = add i32 %1, %offset
> - %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
> - %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
> - ret <4 x i32> %4
> -}
> -
> -define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> - %1 = ptrtoint i32 addrspace(1)* %addr to i32
> - %2 = add i32 %1, %offset
> - %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
> - %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
> - ret <8 x i32> %4
> -}
> -
> -define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> - %1 = ptrtoint i32 addrspace(1)* %addr to i32
> - %2 = add i32 %1, %offset
> - %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
> - %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
> - ret <16 x i32> %4
> -}
> -
> -!1 = metadata !{metadata !"char", metadata !5}
> -!2 = metadata !{metadata !"short", metadata !5}
> -!3 = metadata !{metadata !"int", metadata !5}
> -!4 = metadata !{metadata !"long", metadata !5}
> -!5 = metadata !{metadata !"omnipotent char", metadata !6}
> -!6 = metadata !{metadata !"Simple C/C++ TBAA"}
> -
> diff --git a/generic/lib/shared/vstore.cl b/generic/lib/shared/vstore.cl
> index 5b84f47..38f4d37 100644
> --- a/generic/lib/shared/vstore.cl
> +++ b/generic/lib/shared/vstore.cl
> @@ -34,12 +34,13 @@
> VSTORE_VECTORIZE(SCALAR_GENTYPE, __local) \
> VSTORE_VECTORIZE(SCALAR_GENTYPE, __global) \
>
> -//int/uint are special... see below
> #define VSTORE_TYPES() \
> VSTORE_ADDR_SPACES(char) \
> VSTORE_ADDR_SPACES(uchar) \
> VSTORE_ADDR_SPACES(short) \
> VSTORE_ADDR_SPACES(ushort) \
> + VSTORE_ADDR_SPACES(int) \
> + VSTORE_ADDR_SPACES(uint) \
> VSTORE_ADDR_SPACES(long) \
> VSTORE_ADDR_SPACES(ulong) \
> VSTORE_ADDR_SPACES(float) \
> @@ -50,58 +51,3 @@ VSTORE_TYPES()
> #pragma OPENCL EXTENSION cl_khr_fp64 : enable
> VSTORE_ADDR_SPACES(double)
> #endif
> -
> -VSTORE_VECTORIZE(int, __private)
> -VSTORE_VECTORIZE(int, __local)
> -VSTORE_VECTORIZE(uint, __private)
> -VSTORE_VECTORIZE(uint, __local)
> -
> -_CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset, global int *mem) {
> - mem[offset] = vec.s0;
> - mem[offset+1] = vec.s1;
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) {
> - mem[offset] = vec.s0;
> - mem[offset+1] = vec.s1;
> - mem[offset+2] = vec.s2;
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore2(uint2 vec, size_t offset, global uint *mem) {
> - mem[offset] = vec.s0;
> - mem[offset+1] = vec.s1;
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) {
> - mem[offset] = vec.s0;
> - mem[offset+1] = vec.s1;
> - mem[offset+2] = vec.s2;
> -}
> -
> -/*Note: R600 probably doesn't support store <2 x ?> and <3 x ?>... so
> - * they aren't actually overridden here... lowest-common-denominator
> - */
> -_CLC_DECL void __clc_vstore4_int__global(int4 vec, size_t offset, __global int *);
> -_CLC_DECL void __clc_vstore8_int__global(int8 vec, size_t offset, __global int *);
> -_CLC_DECL void __clc_vstore16_int__global(int16 vec, size_t offset, __global int *);
> -
> -_CLC_OVERLOAD _CLC_DEF void vstore4(int4 vec, size_t offset, global int *x) {
> - __clc_vstore4_int__global(vec, offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore8(int8 vec, size_t offset, global int *x) {
> - __clc_vstore8_int__global(vec, offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore16(int16 vec, size_t offset, global int *x) {
> - __clc_vstore16_int__global(vec, offset, x);
> -}
> -
> -_CLC_DECL void __clc_vstore4_uint__global(uint4 vec, size_t offset, __global uint *);
> -_CLC_DECL void __clc_vstore8_uint__global(uint8 vec, size_t offset, __global uint *);
> -_CLC_DECL void __clc_vstore16_uint__global(uint16 vec, size_t offset, __global uint *);
> -
> -_CLC_OVERLOAD _CLC_DEF void vstore4(uint4 vec, size_t offset, global uint *x) {
> - __clc_vstore4_uint__global(vec, offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore8(uint8 vec, size_t offset, global uint *x) {
> - __clc_vstore8_uint__global(vec, offset, x);
> -}
> -_CLC_OVERLOAD _CLC_DEF void vstore16(uint16 vec, size_t offset, global uint *x) {
> - __clc_vstore16_uint__global(vec, offset, x);
> -}
> diff --git a/generic/lib/shared/vstore_if.ll b/generic/lib/shared/vstore_if.ll
> deleted file mode 100644
> index 30eb552..0000000
> --- a/generic/lib/shared/vstore_if.ll
> +++ /dev/null
> @@ -1,59 +0,0 @@
> -;Start int global vstore
> -
> -declare void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
> -declare void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
> -declare void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
> -declare void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
> -declare void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
> -
> -define void @__clc_vstore2_int__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> - call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
> - ret void
> -}
> -
> -define void @__clc_vstore3_int__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> - call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
> - ret void
> -}
> -
> -define void @__clc_vstore4_int__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> - call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
> - ret void
> -}
> -
> -define void @__clc_vstore8_int__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> - call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
> - ret void
> -}
> -
> -define void @__clc_vstore16_int__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> - call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
> - ret void
> -}
> -
> -
> -;Start uint global vstore
> -define void @__clc_vstore2_uint__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> - call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y)
> - ret void
> -}
> -
> -define void @__clc_vstore3_uint__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> - call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y)
> - ret void
> -}
> -
> -define void @__clc_vstore4_uint__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> - call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y)
> - ret void
> -}
> -
> -define void @__clc_vstore8_uint__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> - call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y)
> - ret void
> -}
> -
> -define void @__clc_vstore16_uint__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline {
> - call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y)
> - ret void
> -}
> \ No newline at end of file
> diff --git a/generic/lib/shared/vstore_impl.ll b/generic/lib/shared/vstore_impl.ll
> deleted file mode 100644
> index 3baab5e..0000000
> --- a/generic/lib/shared/vstore_impl.ll
> +++ /dev/null
> @@ -1,50 +0,0 @@
> -; This provides optimized implementations of vstore4/8/16 for 32-bit int/uint
> -
> -define void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> - %1 = ptrtoint i32 addrspace(1)* %addr to i32
> - %2 = add i32 %1, %offset
> - %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)*
> - store <2 x i32> %vec, <2 x i32> addrspace(1)* %3, align 4, !tbaa !3
> - ret void
> -}
> -
> -define void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> - %1 = ptrtoint i32 addrspace(1)* %addr to i32
> - %2 = add i32 %1, %offset
> - %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)*
> - store <3 x i32> %vec, <3 x i32> addrspace(1)* %3, align 4, !tbaa !3
> - ret void
> -}
> -
> -define void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> - %1 = ptrtoint i32 addrspace(1)* %addr to i32
> - %2 = add i32 %1, %offset
> - %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)*
> - store <4 x i32> %vec, <4 x i32> addrspace(1)* %3, align 4, !tbaa !3
> - ret void
> -}
> -
> -define void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> - %1 = ptrtoint i32 addrspace(1)* %addr to i32
> - %2 = add i32 %1, %offset
> - %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)*
> - store <8 x i32> %vec, <8 x i32> addrspace(1)* %3, align 4, !tbaa !3
> - ret void
> -}
> -
> -define void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> - %1 = ptrtoint i32 addrspace(1)* %addr to i32
> - %2 = add i32 %1, %offset
> - %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)*
> - store <16 x i32> %vec, <16 x i32> addrspace(1)* %3, align 4, !tbaa !3
> - ret void
> -}
> -
> -
> -!1 = metadata !{metadata !"char", metadata !5}
> -!2 = metadata !{metadata !"short", metadata !5}
> -!3 = metadata !{metadata !"int", metadata !5}
> -!4 = metadata !{metadata !"long", metadata !5}
> -!5 = metadata !{metadata !"omnipotent char", metadata !6}
> -!6 = metadata !{metadata !"Simple C/C++ TBAA"}
> -
> diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
> index af8c8c8..0637632 100644
> --- a/r600/lib/SOURCES
> +++ b/r600/lib/SOURCES
> @@ -2,3 +2,9 @@ workitem/get_group_id.ll
> workitem/get_local_size.ll
> workitem/get_local_id.ll
> workitem/get_global_size.ll
> +shared/vload.cl
> +shared/vload_if.ll
> +shared/vload_impl.ll
> +shared/vstore.cl
> +shared/vstore_if.ll
> +shared/vstore_impl.ll
> diff --git a/r600/lib/shared/vload.cl b/r600/lib/shared/vload.cl
> new file mode 100644
> index 0000000..dd28347
> --- /dev/null
> +++ b/r600/lib/shared/vload.cl
> @@ -0,0 +1,99 @@
> +#include <clc/clc.h>
> +
> +#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
> + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
> + return (PRIM_TYPE##2)(x[offset] , x[offset+1]); \
> + } \
> +\
> + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
> + return (PRIM_TYPE##3)(x[offset] , x[offset+1], x[offset+2]); \
> + } \
> +\
> + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
> + return (PRIM_TYPE##4)(x[offset], x[offset+1], x[offset+2], x[offset+3]); \
> + } \
> +\
> + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
> + return (PRIM_TYPE##8)(vload4(offset, x), vload4(offset+4, x)); \
> + } \
> +\
> + _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
> + return (PRIM_TYPE##16)(vload8(offset, x), vload8(offset+8, x)); \
> + } \
> +
> +#define VLOAD_ADDR_SPACES(SCALAR_GENTYPE) \
> + VLOAD_VECTORIZE(SCALAR_GENTYPE, __private) \
> + VLOAD_VECTORIZE(SCALAR_GENTYPE, __local) \
> + VLOAD_VECTORIZE(SCALAR_GENTYPE, __constant) \
> + VLOAD_VECTORIZE(SCALAR_GENTYPE, __global) \
> +
> +//int/uint are special... see below
> +#define VLOAD_TYPES() \
> + VLOAD_ADDR_SPACES(char) \
> + VLOAD_ADDR_SPACES(uchar) \
> + VLOAD_ADDR_SPACES(short) \
> + VLOAD_ADDR_SPACES(ushort) \
> + VLOAD_ADDR_SPACES(long) \
> + VLOAD_ADDR_SPACES(ulong) \
> + VLOAD_ADDR_SPACES(float) \
> +
> +VLOAD_TYPES()
> +
> +#ifdef cl_khr_fp64
> +#pragma OPENCL EXTENSION cl_khr_fp64 : enable
> + VLOAD_ADDR_SPACES(double)
> +#endif
> +
> +VLOAD_VECTORIZE(int, __private)
> +VLOAD_VECTORIZE(int, __local)
> +VLOAD_VECTORIZE(int, __constant)
> +VLOAD_VECTORIZE(uint, __private)
> +VLOAD_VECTORIZE(uint, __local)
> +VLOAD_VECTORIZE(uint, __constant)
> +
> +_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) {
> + return (int3)(vload2(offset, x), x[offset+2]);
> +}
> +_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) {
> + return (uint3)(vload2(offset, x), x[offset+2]);
> +}
> +
> +/*Note: R600 doesn't support load <3 x ?>... so
> + * those functions aren't actually overridden here
> + */
> +_CLC_DECL int2 __clc_vload2_int__global(size_t offset, const __global int *);
> +_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *);
> +_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *);
> +_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *);
> +
> +_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) {
> + return __clc_vload2_int__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) {
> + return __clc_vload4_int__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) {
> + return __clc_vload8_int__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) {
> + return __clc_vload16_int__global(offset, x);
> +}
> +
> +_CLC_DECL uint2 __clc_vload2_uint__global(size_t offset, const __global uint *);
> +_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *);
> +_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *);
> +_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *);
> +
> +_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) {
> + return __clc_vload2_uint__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) {
> + return __clc_vload4_uint__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) {
> + return __clc_vload8_uint__global(offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) {
> + return __clc_vload16_uint__global(offset, x);
> +}
> +
> diff --git a/r600/lib/shared/vload_if.ll b/r600/lib/shared/vload_if.ll
> new file mode 100644
> index 0000000..bda592b
> --- /dev/null
> +++ b/r600/lib/shared/vload_if.ll
What is the purpose of the vload_if.ll file? All it does is wrap calls
to the functions in vload_impl.ll, and I'm not sure why that extra layer of indirection is needed.
> @@ -0,0 +1,60 @@
> +;Start int global vload
> +
> +declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> +
> +define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret <2 x i32> %call
> +}
> +
> +define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret <3 x i32> %call
> +}
> +
> +define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret <4 x i32> %call
> +}
> +
> +define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret <8 x i32> %call
> +}
> +
> +define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret <16 x i32> %call
> +}
> +
> +
> +;Start uint global vload
> +
> +define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret <2 x i32> %call
> +}
> +
> +define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret <3 x i32> %call
> +}
> +
> +define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret <4 x i32> %call
> +}
> +
> +define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret <8 x i32> %call
> +}
> +
> +define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret <16 x i32> %call
> +}
> diff --git a/r600/lib/shared/vload_impl.ll b/r600/lib/shared/vload_impl.ll
> new file mode 100644
> index 0000000..1333aac
> --- /dev/null
> +++ b/r600/lib/shared/vload_impl.ll
> @@ -0,0 +1,44 @@
> +; This provides optimized implementations of vload4/8/16 for 32-bit int/uint
> +
> +define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> + %2 = bitcast i32 addrspace(1)* %1 to <2 x i32> addrspace(1)*
> + %3 = load <2 x i32> addrspace(1)* %2, align 4, !tbaa !3
> + ret <2 x i32> %3
> +}
> +
> +define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> + %2 = bitcast i32 addrspace(1)* %1 to <3 x i32> addrspace(1)*
> + %3 = load <3 x i32> addrspace(1)* %2, align 4, !tbaa !3
> + ret <3 x i32> %3
> +}
> +
> +define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> + %2 = bitcast i32 addrspace(1)* %1 to <4 x i32> addrspace(1)*
> + %3 = load <4 x i32> addrspace(1)* %2, align 4, !tbaa !3
> + ret <4 x i32> %3
> +}
> +
> +define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> + %2 = bitcast i32 addrspace(1)* %1 to <8 x i32> addrspace(1)*
> + %3 = load <8 x i32> addrspace(1)* %2, align 4, !tbaa !3
> + ret <8 x i32> %3
> +}
> +
> +define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline {
> + %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> + %2 = bitcast i32 addrspace(1)* %1 to <16 x i32> addrspace(1)*
> + %3 = load <16 x i32> addrspace(1)* %2, align 4, !tbaa !3
> + ret <16 x i32> %3
> +}
> +
> +!1 = metadata !{metadata !"char", metadata !5}
> +!2 = metadata !{metadata !"short", metadata !5}
> +!3 = metadata !{metadata !"int", metadata !5}
> +!4 = metadata !{metadata !"long", metadata !5}
> +!5 = metadata !{metadata !"omnipotent char", metadata !6}
> +!6 = metadata !{metadata !"Simple C/C++ TBAA"}
> +
> diff --git a/r600/lib/shared/vstore.cl b/r600/lib/shared/vstore.cl
> new file mode 100644
> index 0000000..c8b8cd5
> --- /dev/null
> +++ b/r600/lib/shared/vstore.cl
> @@ -0,0 +1,108 @@
> +#include <clc/clc.h>
> +
> +#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
> +
> +#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
> + _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
> + mem[offset] = vec.s0; \
> + mem[offset+1] = vec.s1; \
> + } \
> +\
> + _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
> + mem[offset] = vec.s0; \
> + mem[offset+1] = vec.s1; \
> + mem[offset+2] = vec.s2; \
> + } \
> +\
> + _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
> + vstore2(vec.lo, offset, mem); \
> + vstore2(vec.hi, offset+2, mem); \
> + } \
> +\
> + _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
> + vstore4(vec.lo, offset, mem); \
> + vstore4(vec.hi, offset+4, mem); \
> + } \
> +\
> + _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
> + vstore8(vec.lo, offset, mem); \
> + vstore8(vec.hi, offset+8, mem); \
> + } \
> +
> +#define VSTORE_ADDR_SPACES(SCALAR_GENTYPE) \
> + VSTORE_VECTORIZE(SCALAR_GENTYPE, __private) \
> + VSTORE_VECTORIZE(SCALAR_GENTYPE, __local) \
> + VSTORE_VECTORIZE(SCALAR_GENTYPE, __global) \
> +
> +//int/uint are special... see below
> +#define VSTORE_TYPES() \
> + VSTORE_ADDR_SPACES(char) \
> + VSTORE_ADDR_SPACES(uchar) \
> + VSTORE_ADDR_SPACES(short) \
> + VSTORE_ADDR_SPACES(ushort) \
> + VSTORE_ADDR_SPACES(long) \
> + VSTORE_ADDR_SPACES(ulong) \
> + VSTORE_ADDR_SPACES(float) \
> +
> +VSTORE_TYPES()
> +
> +#ifdef cl_khr_fp64
> +#pragma OPENCL EXTENSION cl_khr_fp64 : enable
> + VSTORE_ADDR_SPACES(double)
> +#endif
> +
> +VSTORE_VECTORIZE(int, __private)
> +VSTORE_VECTORIZE(int, __local)
> +VSTORE_VECTORIZE(uint, __private)
> +VSTORE_VECTORIZE(uint, __local)
> +
> +_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) {
> + mem[offset] = vec.s0;
> + mem[offset+1] = vec.s1;
> + mem[offset+2] = vec.s2;
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) {
> + mem[offset] = vec.s0;
> + mem[offset+1] = vec.s1;
> + mem[offset+2] = vec.s2;
> +}
> +
> +/*Note: R600 doesn't support store <3 x ?>... so
> + * those functions aren't actually overridden here... lowest-common-denominator
> + */
> +_CLC_DECL void __clc_vstore2_int__global(int2 vec, size_t offset, __global int *);
> +_CLC_DECL void __clc_vstore4_int__global(int4 vec, size_t offset, __global int *);
> +_CLC_DECL void __clc_vstore8_int__global(int8 vec, size_t offset, __global int *);
> +_CLC_DECL void __clc_vstore16_int__global(int16 vec, size_t offset, __global int *);
> +
> +_CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset, global int *x) {
> + __clc_vstore2_int__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore4(int4 vec, size_t offset, global int *x) {
> + __clc_vstore4_int__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore8(int8 vec, size_t offset, global int *x) {
> + __clc_vstore8_int__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore16(int16 vec, size_t offset, global int *x) {
> + __clc_vstore16_int__global(vec, offset, x);
> +}
> +
> +_CLC_DECL void __clc_vstore2_uint__global(uint2 vec, size_t offset, __global uint *);
> +_CLC_DECL void __clc_vstore4_uint__global(uint4 vec, size_t offset, __global uint *);
> +_CLC_DECL void __clc_vstore8_uint__global(uint8 vec, size_t offset, __global uint *);
> +_CLC_DECL void __clc_vstore16_uint__global(uint16 vec, size_t offset, __global uint *);
> +
> +_CLC_OVERLOAD _CLC_DEF void vstore2(uint2 vec, size_t offset, global uint *x) {
> + __clc_vstore2_uint__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore4(uint4 vec, size_t offset, global uint *x) {
> + __clc_vstore4_uint__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore8(uint8 vec, size_t offset, global uint *x) {
> + __clc_vstore8_uint__global(vec, offset, x);
> +}
> +_CLC_OVERLOAD _CLC_DEF void vstore16(uint16 vec, size_t offset, global uint *x) {
> + __clc_vstore16_uint__global(vec, offset, x);
> +}
> +
> diff --git a/r600/lib/shared/vstore_if.ll b/r600/lib/shared/vstore_if.ll
> new file mode 100644
> index 0000000..382a8a8
> --- /dev/null
> +++ b/r600/lib/shared/vstore_if.ll
> @@ -0,0 +1,59 @@
> +;Start int global vstore
> +
> +declare void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +declare void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> +
> +define void @__clc_vstore2_int__global(<2 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret void
> +}
> +
> +define void @__clc_vstore3_int__global(<3 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret void
> +}
> +
> +define void @__clc_vstore4_int__global(<4 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret void
> +}
> +
> +define void @__clc_vstore8_int__global(<8 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret void
> +}
> +
> +define void @__clc_vstore16_int__global(<16 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret void
> +}
> +
> +
> +;Start uint global vstore
> +define void @__clc_vstore2_uint__global(<2 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret void
> +}
> +
> +define void @__clc_vstore3_uint__global(<3 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret void
> +}
> +
> +define void @__clc_vstore4_uint__global(<4 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret void
> +}
> +
> +define void @__clc_vstore8_uint__global(<8 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret void
> +}
> +
> +define void @__clc_vstore16_uint__global(<16 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 addrspace(1)* nocapture %addr)
> + ret void
> +}
> \ No newline at end of file
> diff --git a/r600/lib/shared/vstore_impl.ll b/r600/lib/shared/vstore_impl.ll
> new file mode 100644
> index 0000000..8790a8f
> --- /dev/null
> +++ b/r600/lib/shared/vstore_impl.ll
> @@ -0,0 +1,45 @@
> +; This provides optimized implementations of vstore2/3/4/8/16 for 32-bit int/uint
> +
> +define void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> + %2 = bitcast i32 addrspace(1)* %1 to <2 x i32> addrspace(1)*
> + store <2 x i32> %vec, <2 x i32> addrspace(1)* %2, align 4, !tbaa !3
> + ret void
> +}
> +
> +define void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> + %2 = bitcast i32 addrspace(1)* %1 to <3 x i32> addrspace(1)*
> + store <3 x i32> %vec, <3 x i32> addrspace(1)* %2, align 4, !tbaa !3
> + ret void
> +}
> +
> +define void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> + %2 = bitcast i32 addrspace(1)* %1 to <4 x i32> addrspace(1)*
> + store <4 x i32> %vec, <4 x i32> addrspace(1)* %2, align 4, !tbaa !3
> + ret void
> +}
> +
> +define void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> + %2 = bitcast i32 addrspace(1)* %1 to <8 x i32> addrspace(1)*
> + store <8 x i32> %vec, <8 x i32> addrspace(1)* %2, align 4, !tbaa !3
> + ret void
> +}
> +
> +define void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline {
> + %1 = getelementptr i32 addrspace(1)* %addr, i32 %offset
> + %2 = bitcast i32 addrspace(1)* %1 to <16 x i32> addrspace(1)*
> + store <16 x i32> %vec, <16 x i32> addrspace(1)* %2, align 4, !tbaa !3
> + ret void
> +}
> +
> +
> +!1 = metadata !{metadata !"char", metadata !5}
> +!2 = metadata !{metadata !"short", metadata !5}
> +!3 = metadata !{metadata !"int", metadata !5}
> +!4 = metadata !{metadata !"long", metadata !5}
> +!5 = metadata !{metadata !"omnipotent char", metadata !6}
> +!6 = metadata !{metadata !"Simple C/C++ TBAA"}
> +
> --
> 1.8.1.2
>
>
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at pcc.me.uk
> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev
More information about the Libclc-dev mailing list