[Libclc-dev] Endless loop with git versions of mesa, libclc and llvm 3.9.0

Ricardo Ribalda Delgado via Libclc-dev libclc-dev at lists.llvm.org
Thu Sep 15 01:25:56 PDT 2016


Hi

I am evaluating libclc as a replacement for fglrx opencl library. I
have a pre souther island board:

00:01.0 VGA compatible controller: Advanced Micro Devices, Inc.
[AMD/ATI] Wrestler [Radeon HD 6320]

Using the git version of mesa and libclc in combination with llvm
3.9.0 I get into an endless loop when I run clpeak.

Any pointers about how to debug it furhter?

Thanks

root at qt5022-open:~# CLOVER_DEBUG=clc,llvm,native
CLOVER_DEBUG_FILE=clover.dbg clpeak

Platform: Clover
  Device: AMD PALM (DRM 2.45.0 / 4.7.0-qtec-standard, LLVM 3.9.0)
    Driver version  : 12.1.0-devel (Linux x64)
    Compute units   : 2
    Clock frequency : 0 MHz


root at qt5022-open:~# cat clover.dbg.cl
// Options:  -cl-mad-enable

#undef FETCH_2
#undef FETCH_8

#define FETCH_2(sum, id, A, jumpBy) sum += A[id]; id += jumpBy; sum +=
A[id]; id += jumpBy;
#define FETCH_4(sum, id, A, jumpBy) FETCH_2(sum, id, A, jumpBy);
FETCH_2(sum, id, A, jumpBy);
#define FETCH_8(sum, id, A, jumpBy) FETCH_4(sum, id, A, jumpBy);
FETCH_4(sum, id, A, jumpBy);


#define FETCH_PER_WI 16
 __kernel void global_bandwidth_v1_local_offset(__global float *A,
__global float *B) { int id = (get_group_id(0) * get_local_size(0) *
FETCH_PER_WI) + get_local_id(0); float sum = 0; FETCH_8(sum, id, A,
get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
B[get_global_id(0)] = sum; } __kernel void
global_bandwidth_v2_local_offset(__global float2 *A, __global float
*B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
get_local_id(0); float2 sum = 0; FETCH_8(sum, id, A,
get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
global_bandwidth_v4_local_offset(__global float4 *A, __global float
*B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
get_local_id(0); float4 sum = 0; FETCH_8(sum, id, A,
get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
__kernel void global_bandwidth_v8_local_offset(__global float8 *A,
__global float *B) { int id = (get_group_id(0) * get_local_size(0) *
FETCH_PER_WI) + get_local_id(0); float8 sum = 0; FETCH_8(sum, id, A,
get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) +
(sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); } __kernel void
global_bandwidth_v16_local_offset(__global float16 *A, __global float
*B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
get_local_id(0); float16 sum = 0; FETCH_8(sum, id, A,
get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0)); float t =
(sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) +
(sum.S6) + (sum.S7); t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) +
(sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); B[get_global_id(0)] = t; }
__kernel void global_bandwidth_v1_global_offset(__global float *A,
__global float *B) { int id = get_global_id(0); float sum = 0;
FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
get_global_size(0)); B[get_global_id(0)] = sum; } __kernel void
global_bandwidth_v2_global_offset(__global float2 *A, __global float
*B) { int id = get_global_id(0); float2 sum = 0; FETCH_8(sum, id, A,
get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
global_bandwidth_v4_global_offset(__global float4 *A, __global float
*B) { int id = get_global_id(0); float4 sum = 0; FETCH_8(sum, id, A,
get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
__kernel void global_bandwidth_v8_global_offset(__global float8 *A,
__global float *B) { int id = get_global_id(0); float8 sum = 0;
FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
get_global_size(0)); B[get_global_id(0)] = (sum.S0) + (sum.S1) +
(sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); }
__kernel void global_bandwidth_v16_global_offset(__global float16 *A,
__global float *B) { int id = get_global_id(0); float16 sum = 0;
FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
get_global_size(0)); float t = (sum.S0) + (sum.S1) + (sum.S2) +
(sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); t += (sum.S8) +
(sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) +
(sum.SF); B[get_global_id(0)] = t; }
#undef MAD_4
#undef MAD_16
#undef MAD_64

#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x,
y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
 __kernel void compute_sp_v1(__global float *ptr, float _A) { float x
= _A; float y = (float)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
ptr[get_global_id(0)] = y; } __kernel void compute_sp_v2(__global
float *ptr, float _A) { float2 x = (float2)(_A, (_A+1)); float2 y =
(float2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
(y.S0) + (y.S1); } __kernel void compute_sp_v4(__global float *ptr,
float _A) { float4 x = (float4)(_A, (_A+1), (_A+2), (_A+3)); float4 y
= (float4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); } __kernel
void compute_sp_v8(__global float *ptr, float _A) { float8 x =
(float8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
float8 y = (float8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) +
(y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
compute_sp_v16(__global float *ptr, float _A) { float16 x =
(float16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7),
(_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
float16 y = (float16)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
float2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) +
(y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 + t.S1; }
#if defined(cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #define DOUBLE_AVAILABLE
#elif defined(cl_amd_fp64)
 #pragma OPENCL EXTENSION cl_amd_fp64 : enable
 #define DOUBLE_AVAILABLE
#endif
#undef MAD_4
#undef MAD_16
#undef MAD_64

#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x,
y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);


#ifdef DOUBLE_AVAILABLE
 __kernel void compute_dp_v1(__global double *ptr, double _A) { double
x = _A; double y = (double)get_local_id(0); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); ptr[get_global_id(0)] = y; } __kernel void compute_dp_v2(__global
double *ptr, double _A) { double2 x = (double2)(_A, (_A+1)); double2 y
= (double2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
(y.S0) + (y.S1); } __kernel void compute_dp_v4(__global double *ptr,
double _A) { double4 x = (double4)(_A, (_A+1), (_A+2), (_A+3));
double4 y = (double4)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
(y.S3); } __kernel void compute_dp_v8(__global double *ptr, double _A)
{ double8 x = (double8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
(_A+6), (_A+7)); double8 y = (double8)get_local_id(0); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
(y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7);
} __kernel void compute_dp_v16(__global double *ptr, double _A) {
double16 x = (double16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
(_A+6), (_A+7), (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13),
(_A+14), (_A+15)); double16 y = (double16)get_local_id(0); MAD_64(x,
y); MAD_64(x, y); double2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) +
(y.S89) + (y.SAB) + (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 +
t.S1; }

#endif

#undef MAD_4
#undef MAD_16
#undef MAD_64

#define MAD_4(x, y) x = (y*x) + y; y = (x*y) + x; x = (y*x) + y; y = (x*y) + x;
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
 __kernel void compute_integer_v1(__global int *ptr, int _A) { int x =
_A; int y = (int)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
ptr[get_global_id(0)] = y; } __kernel void compute_integer_v2(__global
int *ptr, int _A) { int2 x = (int2)(_A, (_A+1)); int2 y =
(int2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
(y.S0) + (y.S1); } __kernel void compute_integer_v4(__global int *ptr,
int _A) { int4 x = (int4)(_A, (_A+1), (_A+2), (_A+3)); int4 y =
(int4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); } __kernel
void compute_integer_v8(__global int *ptr, int _A) { int8 x =
(int8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
int8 y = (int8)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
(y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
compute_integer_v16(__global int *ptr, int _A) { int16 x = (int16)(_A,
(_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7), (_A+8),
(_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15)); int16 y
= (int16)get_local_id(0); MAD_64(x, y); MAD_64(x, y); int2 t = (y.S01)
+ (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) + (y.SEF);
ptr[get_global_id(0)] = t.S0 + t.S1; }// Options:  -cl-mad-enable

#undef FETCH_2
#undef FETCH_8

#define FETCH_2(sum, id, A, jumpBy) sum += A[id]; id += jumpBy; sum +=
A[id]; id += jumpBy;
#define FETCH_4(sum, id, A, jumpBy) FETCH_2(sum, id, A, jumpBy);
FETCH_2(sum, id, A, jumpBy);
#define FETCH_8(sum, id, A, jumpBy) FETCH_4(sum, id, A, jumpBy);
FETCH_4(sum, id, A, jumpBy);


#define FETCH_PER_WI 16
 __kernel void global_bandwidth_v1_local_offset(__global float *A,
__global float *B) { int id = (get_group_id(0) * get_local_size(0) *
FETCH_PER_WI) + get_local_id(0); float sum = 0; FETCH_8(sum, id, A,
get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
B[get_global_id(0)] = sum; } __kernel void
global_bandwidth_v2_local_offset(__global float2 *A, __global float
*B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
get_local_id(0); float2 sum = 0; FETCH_8(sum, id, A,
get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
global_bandwidth_v4_local_offset(__global float4 *A, __global float
*B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
get_local_id(0); float4 sum = 0; FETCH_8(sum, id, A,
get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
__kernel void global_bandwidth_v8_local_offset(__global float8 *A,
__global float *B) { int id = (get_group_id(0) * get_local_size(0) *
FETCH_PER_WI) + get_local_id(0); float8 sum = 0; FETCH_8(sum, id, A,
get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0));
B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) +
(sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); } __kernel void
global_bandwidth_v16_local_offset(__global float16 *A, __global float
*B) { int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) +
get_local_id(0); float16 sum = 0; FETCH_8(sum, id, A,
get_local_size(0)); FETCH_8(sum, id, A, get_local_size(0)); float t =
(sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) +
(sum.S6) + (sum.S7); t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) +
(sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); B[get_global_id(0)] = t; }
__kernel void global_bandwidth_v1_global_offset(__global float *A,
__global float *B) { int id = get_global_id(0); float sum = 0;
FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
get_global_size(0)); B[get_global_id(0)] = sum; } __kernel void
global_bandwidth_v2_global_offset(__global float2 *A, __global float
*B) { int id = get_global_id(0); float2 sum = 0; FETCH_8(sum, id, A,
get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
B[get_global_id(0)] = (sum.S0) + (sum.S1); } __kernel void
global_bandwidth_v4_global_offset(__global float4 *A, __global float
*B) { int id = get_global_id(0); float4 sum = 0; FETCH_8(sum, id, A,
get_global_size(0)); FETCH_8(sum, id, A, get_global_size(0));
B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); }
__kernel void global_bandwidth_v8_global_offset(__global float8 *A,
__global float *B) { int id = get_global_id(0); float8 sum = 0;
FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
get_global_size(0)); B[get_global_id(0)] = (sum.S0) + (sum.S1) +
(sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); }
__kernel void global_bandwidth_v16_global_offset(__global float16 *A,
__global float *B) { int id = get_global_id(0); float16 sum = 0;
FETCH_8(sum, id, A, get_global_size(0)); FETCH_8(sum, id, A,
get_global_size(0)); float t = (sum.S0) + (sum.S1) + (sum.S2) +
(sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); t += (sum.S8) +
(sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) +
(sum.SF); B[get_global_id(0)] = t; }
#undef MAD_4
#undef MAD_16
#undef MAD_64

#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x,
y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
 __kernel void compute_sp_v1(__global float *ptr, float _A) { float x
= _A; float y = (float)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
ptr[get_global_id(0)] = y; } __kernel void compute_sp_v2(__global
float *ptr, float _A) { float2 x = (float2)(_A, (_A+1)); float2 y =
(float2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
(y.S0) + (y.S1); } __kernel void compute_sp_v4(__global float *ptr,
float _A) { float4 x = (float4)(_A, (_A+1), (_A+2), (_A+3)); float4 y
= (float4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); } __kernel
void compute_sp_v8(__global float *ptr, float _A) { float8 x =
(float8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
float8 y = (float8)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) +
(y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
compute_sp_v16(__global float *ptr, float _A) { float16 x =
(float16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7),
(_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
float16 y = (float16)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
float2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) +
(y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 + t.S1; }
#if defined(cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #define DOUBLE_AVAILABLE
#elif defined(cl_amd_fp64)
 #pragma OPENCL EXTENSION cl_amd_fp64 : enable
 #define DOUBLE_AVAILABLE
#endif
#undef MAD_4
#undef MAD_16
#undef MAD_64

#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x,
y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);


#ifdef DOUBLE_AVAILABLE
 __kernel void compute_dp_v1(__global double *ptr, double _A) { double
x = _A; double y = (double)get_local_id(0); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); ptr[get_global_id(0)] = y; } __kernel void compute_dp_v2(__global
double *ptr, double _A) { double2 x = (double2)(_A, (_A+1)); double2 y
= (double2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
(y.S0) + (y.S1); } __kernel void compute_dp_v4(__global double *ptr,
double _A) { double4 x = (double4)(_A, (_A+1), (_A+2), (_A+3));
double4 y = (double4)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
(y.S3); } __kernel void compute_dp_v8(__global double *ptr, double _A)
{ double8 x = (double8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
(_A+6), (_A+7)); double8 y = (double8)get_local_id(0); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
(y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7);
} __kernel void compute_dp_v16(__global double *ptr, double _A) {
double16 x = (double16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5),
(_A+6), (_A+7), (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13),
(_A+14), (_A+15)); double16 y = (double16)get_local_id(0); MAD_64(x,
y); MAD_64(x, y); double2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) +
(y.S89) + (y.SAB) + (y.SCD) + (y.SEF); ptr[get_global_id(0)] = t.S0 +
t.S1; }

#endif

#undef MAD_4
#undef MAD_16
#undef MAD_64

#define MAD_4(x, y) x = (y*x) + y; y = (x*y) + x; x = (y*x) + y; y = (x*y) + x;
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
 __kernel void compute_integer_v1(__global int *ptr, int _A) { int x =
_A; int y = (int)get_local_id(0); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
ptr[get_global_id(0)] = y; } __kernel void compute_integer_v2(__global
int *ptr, int _A) { int2 x = (int2)(_A, (_A+1)); int2 y =
(int2)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); ptr[get_global_id(0)] =
(y.S0) + (y.S1); } __kernel void compute_integer_v4(__global int *ptr,
int _A) { int4 x = (int4)(_A, (_A+1), (_A+2), (_A+3)); int4 y =
(int4)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y); MAD_64(x, y);
ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); } __kernel
void compute_integer_v8(__global int *ptr, int _A) { int8 x =
(int8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
int8 y = (int8)get_local_id(0); MAD_64(x, y); MAD_64(x, y); MAD_64(x,
y); MAD_64(x, y); ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) +
(y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); } __kernel void
compute_integer_v16(__global int *ptr, int _A) { int16 x = (int16)(_A,
(_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7), (_A+8),
(_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15)); int16 y
= (int16)get_local_id(0); MAD_64(x, y); MAD_64(x, y); int2 t = (y.S01)
+ (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) + (y.SEF);
ptr[get_global_id(0)] = t.S0 + t.S1; }

clover.dbg.ll in:

http://ge.tt/9cZmpQe2

Thanks for your help!



-- 
Ricardo Ribalda


More information about the Libclc-dev mailing list