[Libclc-dev] Endless loop with git versions of mesa, libclc and llvm 3.9.0

Tom Stellard via Libclc-dev libclc-dev at lists.llvm.org
Fri Sep 16 10:11:47 PDT 2016


On Thu, Sep 15, 2016 at 10:25:56AM +0200, Ricardo Ribalda Delgado via Libclc-dev wrote:
> Hi
> 
> I am evaluating libclc as a replacement for fglrx opencl library. I
> have a pre souther island board:
> 

Unfortunately, pre-Southern Islands is not very well supported.

> 00:01.0 VGA compatible controller: Advanced Micro Devices, Inc.
> [AMD/ATI] Wrestler [Radeon HD 6320]
> 
> Using the git version of mesa and libclc in combination with llvm
> 3.9.0 I get into an endless loop when I run clpeak.
> 
> Any pointers about how to debug it furhter?
> 

Try to reduce the test case as much as you can.  If you have the ISA
dumps, I can try to take a look at them.

-Tom

> Thanks
> 
> root at qt5022-open:~# CLOVER_DEBUG=clc,llvm,native
> CLOVER_DEBUG_FILE=clover.dbg clpeak
> 
> Platform: Clover
>   Device: AMD PALM (DRM 2.45.0 / 4.7.0-qtec-standard, LLVM 3.9.0)
>     Driver version  : 12.1.0-devel (Linux x64)
>     Compute units   : 2
>     Clock frequency : 0 MHz
> 
> 
> root at qt5022-open:~# cat clover.dbg.cl
> // Options:  -cl-mad-enable
> 
> #undef FETCH_2
> #undef FETCH_8
> 
> #define FETCH_2(sum, id, A, jumpBy) sum += A[id]; id += jumpBy; sum +=
> A[id]; id += jumpBy;
> #define FETCH_4(sum, id, A, jumpBy) FETCH_2(sum, id, A, jumpBy);
> FETCH_2(sum, id, A, jumpBy);
> #define FETCH_8(sum, id, A, jumpBy) FETCH_4(sum, id, A, jumpBy);
> FETCH_4(sum, id, A, jumpBy);
> 
> 
> #define FETCH_PER_WI 16
>  __kernel void global_bandwidth_v1_local_offset(__global float *A,
> __global float *B) { int id = (get_group_id(0) * get_local_size(0) *
> FETCH_PER_WI) + get_local_id(0); float sum = 0; FETCH_8(sum, id, A,
> get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> B[get_global_id(0)] = sum; } __kernel void
> global_bandwidth_v2_local_offset(__global float2 *A, __global float
> *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
> get_local_id(0); float2 sum = 0; FETCH_8(sum, id, A,
> get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
> global_bandwidth_v4_local_offset(__global float4 *A, __global float
> *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
> get_local_id(0); float4 sum = 0; FETCH_8(sum, id, A,
> get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
> __kernel void global_bandwidth_v8_local_offset(__global float8 *A,
> __global float *B) { int id = (get_group_id(0) * get_local_size(0) *
> FETCH_PER_WI) + get_local_id(0); float8 sum = 0; FETCH_8(sum, id, A,
> get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) +
> (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); } __kernel void
> global_bandwidth_v16_local_offset(__global float16 *A, __global float
> *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
> get_local_id(0); float16 sum = 0; FETCH_8(sum, id, A,
> get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0)); float t =
> (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) +
> (sum.S6) + (sum.S7); t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) +
> (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); B[get_global_id(0)] = t; }
> __kernel void global_bandwidth_v1_global_offset(__global float *A,
> __global float *B) { int id = get_global_id(0); float sum = 0;
> FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> get_global_size(0)); B[get_global_id(0)] = sum; } __kernel void
> global_bandwidth_v2_global_offset(__global float2 *A, __global float
> *B) { int id = get_global_id(0); float2 sum = 0; FETCH_8(sum, id, A,
> get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
> B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
> global_bandwidth_v4_global_offset(__global float4 *A, __global float
> *B) { int id = get_global_id(0); float4 sum = 0; FETCH_8(sum, id, A,
> get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
> B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
> __kernel void global_bandwidth_v8_global_offset(__global float8 *A,
> __global float *B) { int id = get_global_id(0); float8 sum = 0;
> FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> get_global_size(0)); B[get_global_id(0)] = (sum.S0) + (sum.S1) +
> (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); }
> __kernel void global_bandwidth_v16_global_offset(__global float16 *A,
> __global float *B) { int id = get_global_id(0); float16 sum = 0;
> FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> get_global_size(0)); float t = (sum.S0) + (sum.S1) + (sum.S2) +
> (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); t += (sum.S8) +
> (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) +
> (sum.SF); B[get_global_id(0)] = t; }
> #undef MAD_4
> #undef MAD_16
> #undef MAD_64
> 
> #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x,
> y); y = mad(x, y, x);
> #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
>  __kernel void compute_sp_v1(__global float *ptr, float _A) { float x
> = _A; float y = (float)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> ptr[get_global_id(0)] = y; } __kernel void compute_sp_v2(__global
> float *ptr, float _A) { float2 x = (float2)(_A, (_A+1)); float2 y =
> (float2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> (y.S0) + (y.S1); } __kernel void compute_sp_v4(__global float *ptr,
> float _A) { float4 x = (float4)(_A, (_A+1), (_A+2), (_A+3)); float4 y
> = (float4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); } __kernel
> void compute_sp_v8(__global float *ptr, float _A) { float8 x =
> (float8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
> float8 y = (float8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) +
> (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
> compute_sp_v16(__global float *ptr, float _A) { float16 x =
> (float16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7),
> (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
> float16 y = (float16)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> float2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) +
> (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 + t.S1; }
> #if defined(cl_khr_fp64)
>  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
>  #define DOUBLE_AVAILABLE
> #elif defined(cl_amd_fp64)
>  #pragma OPENCL EXTENSION cl_amd_fp64 : enable
>  #define DOUBLE_AVAILABLE
> #endif
> #undef MAD_4
> #undef MAD_16
> #undef MAD_64
> 
> #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x,
> y); y = mad(x, y, x);
> #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
> 
> 
> #ifdef DOUBLE_AVAILABLE
>  __kernel void compute_dp_v1(__global double *ptr, double _A) { double
> x = _A; double y = (double)get_local_id(0); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); ptr[get_global_id(0)] = y; } __kernel void compute_dp_v2(__global
> double *ptr, double _A) { double2 x = (double2)(_A, (_A+1)); double2 y
> = (double2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> (y.S0) + (y.S1); } __kernel void compute_dp_v4(__global double *ptr,
> double _A) { double4 x = (double4)(_A, (_A+1), (_A+2), (_A+3));
> double4 y = (double4)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
> (y.S3); } __kernel void compute_dp_v8(__global double *ptr, double _A)
> { double8 x = (double8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
> (_A+6), (_A+7)); double8 y = (double8)get_local_id(0); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7);
> } __kernel void compute_dp_v16(__global double *ptr, double _A) {
> double16 x = (double16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
> (_A+6), (_A+7), (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13),
> (_A+14), (_A+15)); double16 y = (double16)get_local_id(0); MAD_64(x,
> y); MAD_64(x, y); double2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) +
> (y.S89) + (y.SAB) + (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 +
> t.S1; }
> 
> #endif
> 
> #undef MAD_4
> #undef MAD_16
> #undef MAD_64
> 
> #define MAD_4(x, y) x = (y*x) + y; y = (x*y) + x; x = (y*x) + y; y = (x*y) + x;
> #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
>  __kernel void compute_integer_v1(__global int *ptr, int _A) { int x =
> _A; int y = (int)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> ptr[get_global_id(0)] = y; } __kernel void compute_integer_v2(__global
> int *ptr, int _A) { int2 x = (int2)(_A, (_A+1)); int2 y =
> (int2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> (y.S0) + (y.S1); } __kernel void compute_integer_v4(__global int *ptr,
> int _A) { int4 x = (int4)(_A, (_A+1), (_A+2), (_A+3)); int4 y =
> (int4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); } __kernel
> void compute_integer_v8(__global int *ptr, int _A) { int8 x =
> (int8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
> int8 y = (int8)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
> (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
> compute_integer_v16(__global int *ptr, int _A) { int16 x = (int16)(_A,
> (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7), (_A+8),
> (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15)); int16 y
> = (int16)get_local_id(0); MAD_64(x, y); MAD_64(x, y); int2 t = (y.S01)
> + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) + (y.SEF);
> ptr[get_global_id(0)] = t.S0 + t.S1; }// Options:  -cl-mad-enable
> 
> #undef FETCH_2
> #undef FETCH_8
> 
> #define FETCH_2(sum, id, A, jumpBy) sum += A[id]; id += jumpBy; sum +=
> A[id]; id += jumpBy;
> #define FETCH_4(sum, id, A, jumpBy) FETCH_2(sum, id, A, jumpBy);
> FETCH_2(sum, id, A, jumpBy);
> #define FETCH_8(sum, id, A, jumpBy) FETCH_4(sum, id, A, jumpBy);
> FETCH_4(sum, id, A, jumpBy);
> 
> 
> #define FETCH_PER_WI 16
>  __kernel void global_bandwidth_v1_local_offset(__global float *A,
> __global float *B) { int id = (get_group_id(0) * get_local_size(0) *
> FETCH_PER_WI) + get_local_id(0); float sum = 0; FETCH_8(sum, id, A,
> get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> B[get_global_id(0)] = sum; } __kernel void
> global_bandwidth_v2_local_offset(__global float2 *A, __global float
> *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
> get_local_id(0); float2 sum = 0; FETCH_8(sum, id, A,
> get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
> global_bandwidth_v4_local_offset(__global float4 *A, __global float
> *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
> get_local_id(0); float4 sum = 0; FETCH_8(sum, id, A,
> get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
> __kernel void global_bandwidth_v8_local_offset(__global float8 *A,
> __global float *B) { int id = (get_group_id(0) * get_local_size(0) *
> FETCH_PER_WI) + get_local_id(0); float8 sum = 0; FETCH_8(sum, id, A,
> get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
> B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) +
> (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); } __kernel void
> global_bandwidth_v16_local_offset(__global float16 *A, __global float
> *B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
> get_local_id(0); float16 sum = 0; FETCH_8(sum, id, A,
> get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0)); float t =
> (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) +
> (sum.S6) + (sum.S7); t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) +
> (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); B[get_global_id(0)] = t; }
> __kernel void global_bandwidth_v1_global_offset(__global float *A,
> __global float *B) { int id = get_global_id(0); float sum = 0;
> FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> get_global_size(0)); B[get_global_id(0)] = sum; } __kernel void
> global_bandwidth_v2_global_offset(__global float2 *A, __global float
> *B) { int id = get_global_id(0); float2 sum = 0; FETCH_8(sum, id, A,
> get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
> B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
> global_bandwidth_v4_global_offset(__global float4 *A, __global float
> *B) { int id = get_global_id(0); float4 sum = 0; FETCH_8(sum, id, A,
> get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
> B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
> __kernel void global_bandwidth_v8_global_offset(__global float8 *A,
> __global float *B) { int id = get_global_id(0); float8 sum = 0;
> FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> get_global_size(0)); B[get_global_id(0)] = (sum.S0) + (sum.S1) +
> (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); }
> __kernel void global_bandwidth_v16_global_offset(__global float16 *A,
> __global float *B) { int id = get_global_id(0); float16 sum = 0;
> FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
> get_global_size(0)); float t = (sum.S0) + (sum.S1) + (sum.S2) +
> (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); t += (sum.S8) +
> (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) +
> (sum.SF); B[get_global_id(0)] = t; }
> #undef MAD_4
> #undef MAD_16
> #undef MAD_64
> 
> #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x,
> y); y = mad(x, y, x);
> #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
>  __kernel void compute_sp_v1(__global float *ptr, float _A) { float x
> = _A; float y = (float)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> ptr[get_global_id(0)] = y; } __kernel void compute_sp_v2(__global
> float *ptr, float _A) { float2 x = (float2)(_A, (_A+1)); float2 y =
> (float2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> (y.S0) + (y.S1); } __kernel void compute_sp_v4(__global float *ptr,
> float _A) { float4 x = (float4)(_A, (_A+1), (_A+2), (_A+3)); float4 y
> = (float4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); } __kernel
> void compute_sp_v8(__global float *ptr, float _A) { float8 x =
> (float8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
> float8 y = (float8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) +
> (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
> compute_sp_v16(__global float *ptr, float _A) { float16 x =
> (float16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7),
> (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
> float16 y = (float16)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> float2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) +
> (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 + t.S1; }
> #if defined(cl_khr_fp64)
>  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
>  #define DOUBLE_AVAILABLE
> #elif defined(cl_amd_fp64)
>  #pragma OPENCL EXTENSION cl_amd_fp64 : enable
>  #define DOUBLE_AVAILABLE
> #endif
> #undef MAD_4
> #undef MAD_16
> #undef MAD_64
> 
> #define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x,
> y); y = mad(x, y, x);
> #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
> 
> 
> #ifdef DOUBLE_AVAILABLE
>  __kernel void compute_dp_v1(__global double *ptr, double _A) { double
> x = _A; double y = (double)get_local_id(0); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); ptr[get_global_id(0)] = y; } __kernel void compute_dp_v2(__global
> double *ptr, double _A) { double2 x = (double2)(_A, (_A+1)); double2 y
> = (double2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> (y.S0) + (y.S1); } __kernel void compute_dp_v4(__global double *ptr,
> double _A) { double4 x = (double4)(_A, (_A+1), (_A+2), (_A+3));
> double4 y = (double4)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
> (y.S3); } __kernel void compute_dp_v8(__global double *ptr, double _A)
> { double8 x = (double8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
> (_A+6), (_A+7)); double8 y = (double8)get_local_id(0); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7);
> } __kernel void compute_dp_v16(__global double *ptr, double _A) {
> double16 x = (double16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
> (_A+6), (_A+7), (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13),
> (_A+14), (_A+15)); double16 y = (double16)get_local_id(0); MAD_64(x,
> y); MAD_64(x, y); double2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) +
> (y.S89) + (y.SAB) + (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 +
> t.S1; }
> 
> #endif
> 
> #undef MAD_4
> #undef MAD_16
> #undef MAD_64
> 
> #define MAD_4(x, y) x = (y*x) + y; y = (x*y) + x; x = (y*x) + y; y = (x*y) + x;
> #define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
> #define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
>  __kernel void compute_integer_v1(__global int *ptr, int _A) { int x =
> _A; int y = (int)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> ptr[get_global_id(0)] = y; } __kernel void compute_integer_v2(__global
> int *ptr, int _A) { int2 x = (int2)(_A, (_A+1)); int2 y =
> (int2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
> (y.S0) + (y.S1); } __kernel void compute_integer_v4(__global int *ptr,
> int _A) { int4 x = (int4)(_A, (_A+1), (_A+2), (_A+3)); int4 y =
> (int4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
> ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); } __kernel
> void compute_integer_v8(__global int *ptr, int _A) { int8 x =
> (int8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
> int8 y = (int8)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
> y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
> (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
> compute_integer_v16(__global int *ptr, int _A) { int16 x = (int16)(_A,
> (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7), (_A+8),
> (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15)); int16 y
> = (int16)get_local_id(0); MAD_64(x, y); MAD_64(x, y); int2 t = (y.S01)
> + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) + (y.SEF);
> ptr[get_global_id(0)] = t.S0 + t.S1; }
> 
> clover.dbg.ll in:
> 
> http://ge.tt/9cZmpQe2
> 
> Thanks for your help!
> 
> 
> 
> -- 
> Ricardo Ribalda
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/libclc-dev


More information about the Libclc-dev mailing list