[Libclc-dev] [PATCH] relational: Implement signbit

Wed Jun 25 11:05:44 PDT 2014

Yeah, I screwed that up, and the piglit tests didn't catch it because
the auto-generated tests use the same input value for all vector
elements.

In the definition of _CLC_DEFINE_RELATIONAL_UNARY_VEC3:
return (RET_TYPE)((FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2)) !=
(RET_TYPE)0); \

Should be:
return (RET_TYPE)( (RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1),
FUNCTION(x.s2)} != (RET_TYPE)0); \

Big difference between parentheses around the vector components and braces.

This was messed up for float3, float4, float8, and float16, but float2
was correct.

With the above change and the following kernel
kernel void test_3_signbit_float(global int* out, global float* in0){
    vstore3(signbit(vload3(0, in0)), 0, out);
}

I get the following bitcode:
define void @test_3_signbit_float(i32 addrspace(1)* nocapture %out,
float addrspace(1)* nocapture readonly %in0) #0 {
  %1 = load float addrspace(1)* %in0, align 4, !tbaa !2
  %2 = getelementptr inbounds float addrspace(1)* %in0, i32 1
  %3 = load float addrspace(1)* %2, align 4, !tbaa !2
  %4 = getelementptr inbounds float addrspace(1)* %in0, i32 2
  %5 = load float addrspace(1)* %4, align 4, !tbaa !2
  %6 = bitcast float %1 to i32
  %.lobit.i.i = lshr i32 %6, 31
  %7 = insertelement <3 x i32> undef, i32 %.lobit.i.i, i32 0
  %8 = bitcast float %3 to i32
  %.lobit.i3.i = lshr i32 %8, 31
  %9 = insertelement <3 x i32> %7, i32 %.lobit.i3.i, i32 1
  %10 = bitcast float %5 to i32
  %.lobit.i2.i = lshr i32 %10, 31
  %11 = insertelement <3 x i32> %9, i32 %.lobit.i2.i, i32 2
  %12 = icmp ne <3 x i32> %11, zeroinitializer
  %13 = sext <3 x i1> %12 to <3 x i32>
  %14 = extractelement <3 x i32> %13, i32 0
  store i32 %14, i32 addrspace(1)* %out, align 4, !tbaa !6
  %15 = extractelement <3 x i32> %13, i32 1
  %16 = getelementptr inbounds i32 addrspace(1)* %out, i32 1
  store i32 %15, i32 addrspace(1)* %16, align 4, !tbaa !6
  %17 = extractelement <3 x i32> %13, i32 2
  %18 = getelementptr inbounds i32 addrspace(1)* %out, i32 2
  store i32 %17, i32 addrspace(1)* %18, align 4, !tbaa !6
  ret void
}

Does that look better?

--Aaron

On Wed, Jun 25, 2014 at 9:47 AM, Jeroen Ketema <j.ketema at imperial.ac.uk> wrote:
>
> Hi,
>
> I hadn’t looked at the patch before, but I’m wondering if it does the right thing. Consider the following kernel (I’m compiling to the NVPTX target and llvm 3.4):
>
> #define id (get_group_id(0) * get_local_size(0) + get_local_id(0))
>
> __kernel void foo(__global float* p, __global float3 *q, __global int3 *n)
> {
>   n[get_global_id(0)] = signbit(q[get_global_id(0)]);
> }
>
> I’m getting the following bit code:
>
> define void @foo(float addrspace(1)* %p, <3 x float> addrspace(1)* %q, <3 x i32> addrspace(1)* %n) #2 {
>   call void @llvm.dbg.value(metadata !{float addrspace(1)* %p}, i64 0, metadata !41), !dbg !42
>   call void @llvm.dbg.value(metadata !{<3 x float> addrspace(1)* %q}, i64 0, metadata !43), !dbg !44
>   call void @llvm.dbg.value(metadata !{<3 x i32> addrspace(1)* %n}, i64 0, metadata !45), !dbg !46
>   %1 = call i32 @get_group_id(i32 0) #5
>   %2 = call i32 @get_local_size(i32 0) #5
>   %3 = mul i32 %2, %1
>   %4 = call i32 @get_local_id(i32 0) #5
>   %5 = add i32 %3, %4
>   %6 = getelementptr inbounds <3 x float> addrspace(1)* %q, i32 %5, !dbg !47
>   %7 = bitcast <3 x float> addrspace(1)* %6 to <4 x float> addrspace(1)*, !dbg !47
>   %8 = load <4 x float> addrspace(1)* %7, !dbg !47
>   %9 = shufflevector <4 x float> %8, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>, !dbg !47
>   %10 = extractelement <3 x float> %9, i32 2
>   %11 = bitcast float %10 to i32
>   %.lobit.i.i = lshr i32 %11, 31
>   %12 = insertelement <3 x i32> undef, i32 %.lobit.i.i, i32 0
>   %13 = shufflevector <3 x i32> %12, <3 x i32> undef, <3 x i32> zeroinitializer
>   %14 = icmp ne <3 x i32> %13, zeroinitializer
>   %15 = sext <3 x i1> %14 to <3 x i32>
>   %16 = call i32 @get_group_id(i32 0) #5
>   %17 = call i32 @get_local_size(i32 0) #5
>   %18 = mul i32 %17, %16
>   %19 = call i32 @get_local_id(i32 0) #5
>   %20 = add i32 %18, %19
>   %21 = getelementptr inbounds <3 x i32> addrspace(1)* %n, i32 %20, !dbg !49
>   %22 = shufflevector <3 x i32> %15, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>, !dbg !49
>   %23 = bitcast <3 x i32> addrspace(1)* %21 to <4 x i32> addrspace(1)*, !dbg !49
>   store <4 x i32> %22, <4 x i32> addrspace(1)* %23, align 16, !dbg !49
>   ret void, !dbg !50
> }
>
> The builtin is apparently turned into a lshr. However, I see only one of these and not the three I expecting since I’m working over vectors of length 3. And, I get the impression that all vector elements should be considered separately.
>
> There’s also seems to be a deeper problem with this bitcode: The vector of length 3 is written as a vector of length 4. Hence, although the kernel is data race free the bitcode isn’t if the vectors are tightly packed in the array, which seems to be the case given the way the getelementptr is used.
>
> On 25 Jun 2014, at 14:40, Aaron Watry <awatry at gmail.com> wrote:
>
>> Committed.
>>
>> I'm going to send v2 of the other 3 patches of this series after I
>> whip up the unary/binary relational macro changes to simplify the
>> implementations.
>>
>> --Aaron
>>
>> On Fri, Jun 20, 2014 at 7:41 PM, Aaron Watry <awatry at gmail.com> wrote:
>>> v2 Changes:
>>>   - use __builtin_signbit instead of shifting by hand
>>>   - significantly improve vector shuffling
>>>   - Works correctly now for signbit(float16) on radeonsi
>>>
>>> Signed-off-by: Aaron Watry <awatry at gmail.com>
>>> ---
>>> generic/include/clc/clc.h                |  1 +
>>> generic/include/clc/relational/signbit.h | 18 +++++++
>>> generic/lib/SOURCES                      |  1 +
>>> generic/lib/relational/signbit.cl        | 87 ++++++++++++++++++++++++++++++++
>>> 4 files changed, 107 insertions(+)
>>> create mode 100644 generic/include/clc/relational/signbit.h
>>> create mode 100644 generic/lib/relational/signbit.cl
>>>
>>> diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
>>> index 7702e68..1c68bd5 100644
>>> --- a/generic/include/clc/clc.h
>>> +++ b/generic/include/clc/clc.h
>>> @@ -113,6 +113,7 @@
>>> #include <clc/relational/isnan.h>
>>> #include <clc/relational/isnotequal.h>
>>> #include <clc/relational/select.h>
>>> +#include <clc/relational/signbit.h>
>>>
>>> /* 6.11.8 Synchronization Functions */
>>> #include <clc/synchronization/cl_mem_fence_flags.h>
>>> diff --git a/generic/include/clc/relational/signbit.h b/generic/include/clc/relational/signbit.h
>>> new file mode 100644
>>> index 0000000..774d6e0
>>> --- /dev/null
>>> +++ b/generic/include/clc/relational/signbit.h
>>> @@ -0,0 +1,18 @@
>>> +
>>> +#define _CLC_SIGNBIT_DECL(TYPE, RETTYPE) \
>>> +  _CLC_OVERLOAD _CLC_DECL RETTYPE signbit(TYPE x);
>>> +
>>> +#define _CLC_VECTOR_SIGNBIT_DECL(TYPE, RETTYPE) \
>>> +  _CLC_SIGNBIT_DECL(TYPE##2, RETTYPE##2)  \
>>> +  _CLC_SIGNBIT_DECL(TYPE##3, RETTYPE##3)  \
>>> +  _CLC_SIGNBIT_DECL(TYPE##4, RETTYPE##4)  \
>>> +  _CLC_SIGNBIT_DECL(TYPE##8, RETTYPE##8)  \
>>> +  _CLC_SIGNBIT_DECL(TYPE##16, RETTYPE##16)
>>> +
>>> +_CLC_SIGNBIT_DECL(float, int)
>>> +_CLC_VECTOR_SIGNBIT_DECL(float, int)
>>> +
>>> +#ifdef cl_khr_fp64
>>> +_CLC_SIGNBIT_DECL(double, int)
>>> +_CLC_VECTOR_SIGNBIT_DECL(double, long)
>>> +#endif
>>> \ No newline at end of file
>>> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
>>> index 611966f..aa638d8 100644
>>> --- a/generic/lib/SOURCES
>>> +++ b/generic/lib/SOURCES
>>> @@ -44,6 +44,7 @@ relational/isgreater.cl
>>> relational/isgreaterequal.cl
>>> relational/isnotequal.cl
>>> relational/isnan.cl
>>> +relational/signbit.cl
>>> shared/clamp.cl
>>> shared/max.cl
>>> shared/min.cl
>>> diff --git a/generic/lib/relational/signbit.cl b/generic/lib/relational/signbit.cl
>>> new file mode 100644
>>> index 0000000..1f496d9
>>> --- /dev/null
>>> +++ b/generic/lib/relational/signbit.cl
>>> @@ -0,0 +1,87 @@
>>> +#include <clc/clc.h>
>>> +
>>> +#define _CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, ARG_TYPE) \
>>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x){ \
>>> +       return BUILTIN_NAME(x); \
>>> +} \
>>> +
>>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC(RET_TYPE, FUNCTION, ARG_TYPE) \
>>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>>> +  return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo), FUNCTION(x.hi)} != (RET_TYPE)0); \
>>> +} \
>>> +
>>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE, FUNCTION, ARG_TYPE) \
>>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>>> +  return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo), FUNCTION(x.hi)} != (RET_TYPE)0); \
>>> +} \
>>> +
>>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE, FUNCTION, ARG_TYPE) \
>>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>>> +  return (RET_TYPE)((FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2)) != (RET_TYPE)0); \
>>> +} \
>>> +
>>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE, FUNCTION, ARG_TYPE) \
>>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>>> +  return (RET_TYPE)( \
>>> +       ( \
>>> +               FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3) \
>>> +       ) != (RET_TYPE)0); \
>>> +} \
>>> +
>>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE, FUNCTION, ARG_TYPE) \
>>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>>> +  return (RET_TYPE)( \
>>> +       ( \
>>> +               FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \
>>> +               FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7) \
>>> +       ) != (RET_TYPE)0); \
>>> +} \
>>> +
>>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE, FUNCTION, ARG_TYPE) \
>>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>>> +  return (RET_TYPE)( \
>>> +       ( \
>>> +               FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \
>>> +               FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7), \
>>> +               FUNCTION(x.s8), FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb), \
>>> +               FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se), FUNCTION(x.sf) \
>>> +       ) != (RET_TYPE)0); \
>>> +} \
>>> +
>>> +
>>> +#define _CLC_DEFINE_RELATIONAL_UNARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG_TYPE) \
>>> +_CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG_TYPE) \
>>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE##2, FUNCTION, ARG_TYPE##2) \
>>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE##3, FUNCTION, ARG_TYPE##3) \
>>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE##4, FUNCTION, ARG_TYPE##4) \
>>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE##8, FUNCTION, ARG_TYPE##8) \
>>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE##16, FUNCTION, ARG_TYPE##16) \
>>> +
>>> +_CLC_DEFINE_RELATIONAL_UNARY(int, signbit, __builtin_signbitf, float)
>>> +
>>> +#ifdef cl_khr_fp64
>>> +
>>> +#pragma OPENCL EXTENSION cl_khr_fp64 : enable
>>> +
>>> +// The scalar version of signbit(double) returns an int, but the vector versions
>>> +// return long.
>>> +
>>> +_CLC_DEF _CLC_OVERLOAD int signbit(double x){
>>> +       return __builtin_signbit(x);
>>> +}
>>> +
>>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC2(long2, signbit, double2)
>>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC3(long3, signbit, double3)
>>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC4(long4, signbit, double4)
>>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC8(long8, signbit, double8)
>>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC16(long16, signbit, double16)
>>> +
>>> +#endif
>>> +
>>> +#undef _CLC_DEFINE_RELATIONAL_UNARY
>>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_SCALAR
>>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_VEC2
>>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_VEC3
>>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_VEC4
>>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_VEC8
>>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_VEC16
>>> \ No newline at end of file
>>> --
>>> 1.9.1
>>>
>>
>> _______________________________________________
>> Libclc-dev mailing list
>> Libclc-dev at pcc.me.uk
>> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev
>