[Libclc-dev] [PATCH] relational: Implement signbit

Wed Jun 25 07:47:44 PDT 2014

Hi,

I hadn’t looked at the patch before, but I’m wondering whether it does the right thing. Consider the following kernel (I’m compiling for the NVPTX target with LLVM 3.4):

#define id (get_group_id(0) * get_local_size(0) + get_local_id(0))

__kernel void foo(__global float* p, __global float3 *q, __global int3 *n)
{
  n[get_global_id(0)] = signbit(q[get_global_id(0)]);
}

I’m getting the following bitcode:

define void @foo(float addrspace(1)* %p, <3 x float> addrspace(1)* %q, <3 x i32> addrspace(1)* %n) #2 {
  call void @llvm.dbg.value(metadata !{float addrspace(1)* %p}, i64 0, metadata !41), !dbg !42
  call void @llvm.dbg.value(metadata !{<3 x float> addrspace(1)* %q}, i64 0, metadata !43), !dbg !44
  call void @llvm.dbg.value(metadata !{<3 x i32> addrspace(1)* %n}, i64 0, metadata !45), !dbg !46
  %1 = call i32 @get_group_id(i32 0) #5
  %2 = call i32 @get_local_size(i32 0) #5
  %3 = mul i32 %2, %1
  %4 = call i32 @get_local_id(i32 0) #5
  %5 = add i32 %3, %4
  %6 = getelementptr inbounds <3 x float> addrspace(1)* %q, i32 %5, !dbg !47
  %7 = bitcast <3 x float> addrspace(1)* %6 to <4 x float> addrspace(1)*, !dbg !47
  %8 = load <4 x float> addrspace(1)* %7, !dbg !47
  %9 = shufflevector <4 x float> %8, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>, !dbg !47
  %10 = extractelement <3 x float> %9, i32 2
  %11 = bitcast float %10 to i32
  %.lobit.i.i = lshr i32 %11, 31
  %12 = insertelement <3 x i32> undef, i32 %.lobit.i.i, i32 0
  %13 = shufflevector <3 x i32> %12, <3 x i32> undef, <3 x i32> zeroinitializer
  %14 = icmp ne <3 x i32> %13, zeroinitializer
  %15 = sext <3 x i1> %14 to <3 x i32>
  %16 = call i32 @get_group_id(i32 0) #5
  %17 = call i32 @get_local_size(i32 0) #5
  %18 = mul i32 %17, %16
  %19 = call i32 @get_local_id(i32 0) #5
  %20 = add i32 %18, %19
  %21 = getelementptr inbounds <3 x i32> addrspace(1)* %n, i32 %20, !dbg !49
  %22 = shufflevector <3 x i32> %15, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>, !dbg !49
  %23 = bitcast <3 x i32> addrspace(1)* %21 to <4 x i32> addrspace(1)*, !dbg !49
  store <4 x i32> %22, <4 x i32> addrspace(1)* %23, align 16, !dbg !49
  ret void, !dbg !50
}

The builtin is apparently turned into an lshr. However, I see only one of these, not the three I was expecting, since I’m working over vectors of length 3. I get the impression that all vector elements should be considered separately.

There also seems to be a deeper problem with this bitcode: the vector of length 3 is written as a vector of length 4. Hence, although the kernel is data-race free, the bitcode isn’t if the vectors are tightly packed in the array, which seems to be the case given the way the getelementptr is used.

On 25 Jun 2014, at 14:40, Aaron Watry <awatry at gmail.com> wrote:

> Committed.
> 
> I'm going to send v2 of the other 3 patches of this series after I
> whip up the unary/binary relational macro changes to simplify the
> implementations.
> 
> --Aaron
> 
> On Fri, Jun 20, 2014 at 7:41 PM, Aaron Watry <awatry at gmail.com> wrote:
>> v2 Changes:
>>   - use __builtin_signbit instead of shifting by hand
>>   - significantly improve vector shuffling
>>   - Works correctly now for signbit(float16) on radeonsi
>> 
>> Signed-off-by: Aaron Watry <awatry at gmail.com>
>> ---
>> generic/include/clc/clc.h                |  1 +
>> generic/include/clc/relational/signbit.h | 18 +++++++
>> generic/lib/SOURCES                      |  1 +
>> generic/lib/relational/signbit.cl        | 87 ++++++++++++++++++++++++++++++++
>> 4 files changed, 107 insertions(+)
>> create mode 100644 generic/include/clc/relational/signbit.h
>> create mode 100644 generic/lib/relational/signbit.cl
>> 
>> diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
>> index 7702e68..1c68bd5 100644
>> --- a/generic/include/clc/clc.h
>> +++ b/generic/include/clc/clc.h
>> @@ -113,6 +113,7 @@
>> #include <clc/relational/isnan.h>
>> #include <clc/relational/isnotequal.h>
>> #include <clc/relational/select.h>
>> +#include <clc/relational/signbit.h>
>> 
>> /* 6.11.8 Synchronization Functions */
>> #include <clc/synchronization/cl_mem_fence_flags.h>
>> diff --git a/generic/include/clc/relational/signbit.h b/generic/include/clc/relational/signbit.h
>> new file mode 100644
>> index 0000000..774d6e0
>> --- /dev/null
>> +++ b/generic/include/clc/relational/signbit.h
>> @@ -0,0 +1,18 @@
>> +
>> +#define _CLC_SIGNBIT_DECL(TYPE, RETTYPE) \
>> +  _CLC_OVERLOAD _CLC_DECL RETTYPE signbit(TYPE x);
>> +
>> +#define _CLC_VECTOR_SIGNBIT_DECL(TYPE, RETTYPE) \
>> +  _CLC_SIGNBIT_DECL(TYPE##2, RETTYPE##2)  \
>> +  _CLC_SIGNBIT_DECL(TYPE##3, RETTYPE##3)  \
>> +  _CLC_SIGNBIT_DECL(TYPE##4, RETTYPE##4)  \
>> +  _CLC_SIGNBIT_DECL(TYPE##8, RETTYPE##8)  \
>> +  _CLC_SIGNBIT_DECL(TYPE##16, RETTYPE##16)
>> +
>> +_CLC_SIGNBIT_DECL(float, int)
>> +_CLC_VECTOR_SIGNBIT_DECL(float, int)
>> +
>> +#ifdef cl_khr_fp64
>> +_CLC_SIGNBIT_DECL(double, int)
>> +_CLC_VECTOR_SIGNBIT_DECL(double, long)
>> +#endif
>> \ No newline at end of file
>> diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES
>> index 611966f..aa638d8 100644
>> --- a/generic/lib/SOURCES
>> +++ b/generic/lib/SOURCES
>> @@ -44,6 +44,7 @@ relational/isgreater.cl
>> relational/isgreaterequal.cl
>> relational/isnotequal.cl
>> relational/isnan.cl
>> +relational/signbit.cl
>> shared/clamp.cl
>> shared/max.cl
>> shared/min.cl
>> diff --git a/generic/lib/relational/signbit.cl b/generic/lib/relational/signbit.cl
>> new file mode 100644
>> index 0000000..1f496d9
>> --- /dev/null
>> +++ b/generic/lib/relational/signbit.cl
>> @@ -0,0 +1,87 @@
>> +#include <clc/clc.h>
>> +
>> +#define _CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, ARG_TYPE) \
>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x){ \
>> +       return BUILTIN_NAME(x); \
>> +} \
>> +
>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC(RET_TYPE, FUNCTION, ARG_TYPE) \
>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>> +  return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo), FUNCTION(x.hi)} != (RET_TYPE)0); \
>> +} \
>> +
>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE, FUNCTION, ARG_TYPE) \
>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>> +  return (RET_TYPE)( (RET_TYPE){FUNCTION(x.lo), FUNCTION(x.hi)} != (RET_TYPE)0); \
>> +} \
>> +
>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE, FUNCTION, ARG_TYPE) \
>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>> +  return (RET_TYPE)((FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2)) != (RET_TYPE)0); \
>> +} \
>> +
>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE, FUNCTION, ARG_TYPE) \
>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>> +  return (RET_TYPE)( \
>> +       ( \
>> +               FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3) \
>> +       ) != (RET_TYPE)0); \
>> +} \
>> +
>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE, FUNCTION, ARG_TYPE) \
>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>> +  return (RET_TYPE)( \
>> +       ( \
>> +               FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \
>> +               FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7) \
>> +       ) != (RET_TYPE)0); \
>> +} \
>> +
>> +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE, FUNCTION, ARG_TYPE) \
>> +_CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \
>> +  return (RET_TYPE)( \
>> +       ( \
>> +               FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \
>> +               FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7), \
>> +               FUNCTION(x.s8), FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb), \
>> +               FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se), FUNCTION(x.sf) \
>> +       ) != (RET_TYPE)0); \
>> +} \
>> +
>> +
>> +#define _CLC_DEFINE_RELATIONAL_UNARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG_TYPE) \
>> +_CLC_DEFINE_RELATIONAL_UNARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, ARG_TYPE) \
>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE##2, FUNCTION, ARG_TYPE##2) \
>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE##3, FUNCTION, ARG_TYPE##3) \
>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE##4, FUNCTION, ARG_TYPE##4) \
>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE##8, FUNCTION, ARG_TYPE##8) \
>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE##16, FUNCTION, ARG_TYPE##16) \
>> +
>> +_CLC_DEFINE_RELATIONAL_UNARY(int, signbit, __builtin_signbitf, float)
>> +
>> +#ifdef cl_khr_fp64
>> +
>> +#pragma OPENCL EXTENSION cl_khr_fp64 : enable
>> +
>> +// The scalar version of signbit(double) returns an int, but the vector versions
>> +// return long.
>> +
>> +_CLC_DEF _CLC_OVERLOAD int signbit(double x){
>> +       return __builtin_signbit(x);
>> +}
>> +
>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC2(long2, signbit, double2)
>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC3(long3, signbit, double3)
>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC4(long4, signbit, double4)
>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC8(long8, signbit, double8)
>> +_CLC_DEFINE_RELATIONAL_UNARY_VEC16(long16, signbit, double16)
>> +
>> +#endif
>> +
>> +#undef _CLC_DEFINE_RELATIONAL_UNARY
>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_SCALAR
>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_VEC2
>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_VEC3
>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_VEC4
>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_VEC8
>> +#undef _CLC_DEFINE_RELATIONAL_UNARY_VEC16
>> \ No newline at end of file
>> --
>> 1.9.1
>> 
> 
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at pcc.me.uk
> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev