[libclc] r219217 - Add AMD OpenCL builtins
Tom Stellard
thomas.stellard at amd.com
Tue Oct 7 10:10:49 PDT 2014
Added: libclc/branches/amd-builtins/amd-builtins/media/media.h
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/media.h?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/media.h (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/media.h Tue Oct 7 12:10:46 2014
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+
+extern __attribute__((const)) uint __hsail_bitalign_b32(uint, uint, uint);
+
+extern __attribute__((const)) uint __hsail_bytealign_b32(uint, uint, uint);
+
+extern __attribute__((pure)) uint __hsail_packcvt_u8x4_f32(float,float,float,float);
+
+extern __attribute__((pure)) uint __hsail_lerp_u8x4(uint,uint,uint);
+
+extern __attribute__((pure)) uint __hsail_sad_u32_u8x4(uint,uint,uint);
+
+extern __attribute__((pure)) uint __hsail_sadhi_u16x2_u8x4(uint,uint,uint);
+
+extern __attribute__((pure)) float __hsail_unpackcvt_f32_u8x4(uint,uint);
+
+extern __attribute__((const)) uint __hsail_msad(uint,uint,uint);
+
+extern __attribute__((const)) uint __hsail_sadd(uint,uint,uint);
+
+extern __attribute__((const)) uint __hsail_sadw(uint,uint,uint);
+
+extern __attribute__((const)) uint __hsail_umin3(uint,uint,uint);
+
+extern __attribute__((const)) int __hsail_imin3(int,int,int);
+
+extern __attribute__((const)) uint __hsail_umax3(uint,uint,uint);
+
+extern __attribute__((const)) int __hsail_imax3(int,int,int);
+
+extern __attribute__((const)) uint __hsail_umedian3(uint,uint,uint);
+
+extern __attribute__((const)) int __hsail_imedian3(int,int,int);
+
+extern __attribute__((const)) uint __hsail_bfe(uint,uint,uint);
+
+extern __attribute__((const)) float __hsail_f32_min3(float,float,float);
+
+extern __attribute__((const)) float __hsail_f32_max3(float,float,float);
+
+extern __attribute__((const)) float __hsail_f32_median3(float,float,float);
+
+extern __attribute__((const)) ulong __hsail_mqsad(ulong,uint,ulong);
+
+extern __attribute__((const)) ulong __hsail_qsad(ulong,uint,ulong);
+
+extern __attribute__((const)) uint __hsail_bfm(uint,uint);
+
+extern __attribute__((const)) int __hsail_ibfe(int,uint,uint);
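For reference, a minimal usage sketch (not part of the committed diff): the __hsail_* declarations above are the HSAIL intrinsics that back the overloaded amd_* builtins from cl_amd_media_ops and cl_amd_media_ops2, which the .cl files below wrap. A hypothetical kernel using one of those wrappers, assuming a compiler that exposes the extension, could look like:

    #pragma OPENCL EXTENSION cl_amd_media_ops2 : enable

    /* Hypothetical demo kernel, not part of this commit: median3(lo, x, hi)
     * equals clamp(x, lo, hi) for lo <= hi and lowers to __hsail_imedian3
     * through the amd_median3 overloads added in median3.cl below. */
    __kernel void clamp_demo(__global int *data, int lo, int hi)
    {
        size_t i = get_global_id(0);
        data[i] = amd_median3(lo, data[i], hi);
    }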
Added: libclc/branches/amd-builtins/amd-builtins/media/median3.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/median3.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/median3.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/median3.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_median3(uint2 v1, uint2 v2, uint2 v3)
+{
+ uint2 ret;
+ ret.x = __hsail_umedian3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_umedian3(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_median3(uint3 v1, uint3 v2, uint3 v3)
+{
+ uint3 ret;
+ ret.x = __hsail_umedian3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_umedian3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_umedian3(v1.z,v2.z, v3.z);
+ return ret;
+}
+
+__attribute__((overloadable,always_inline,const)) uint4 amd_median3(uint4 v1, uint4 v2, uint4 v3)
+{
+ uint4 ret;
+ ret.x = __hsail_umedian3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_umedian3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_umedian3(v1.z,v2.z, v3.z);
+ ret.w = __hsail_umedian3(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_median3(uint8 v1, uint8 v2, uint8 v3)
+{
+ uint8 ret;
+ ret.s0 = __hsail_umedian3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_umedian3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_umedian3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_umedian3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_umedian3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_umedian3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_umedian3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_umedian3(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_median3(uint16 v1, uint16 v2, uint16 v3)
+{
+ uint16 ret;
+ ret.s0 = __hsail_umedian3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_umedian3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_umedian3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_umedian3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_umedian3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_umedian3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_umedian3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_umedian3(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_umedian3(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_umedian3(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_umedian3(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_umedian3(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_umedian3(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_umedian3(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_umedian3(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_umedian3(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_median3(uint v1, uint v2, uint v3)
+{
+ return __hsail_umedian3(v1,v2,v3);
+}
+__attribute__((overloadable,always_inline,const)) float2 amd_median3(float2 v1, float2 v2, float2 v3)
+{
+ float2 ret;
+ ret.x = __hsail_f32_median3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_f32_median3(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) float3 amd_median3(float3 v1, float3 v2, float3 v3)
+{
+ float3 ret;
+ ret.x = __hsail_f32_median3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_f32_median3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_f32_median3(v1.z,v2.z, v3.z);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) float4 amd_median3(float4 v1, float4 v2, float4 v3)
+{
+ float4 ret;
+ ret.x = __hsail_f32_median3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_f32_median3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_f32_median3(v1.z,v2.z, v3.z);
+ ret.w = __hsail_f32_median3(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) float8 amd_median3(float8 v1, float8 v2, float8 v3)
+{
+ float8 ret;
+ ret.s0 = __hsail_f32_median3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_f32_median3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_f32_median3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_f32_median3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_f32_median3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_f32_median3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_f32_median3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_f32_median3(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) float16 amd_median3(float16 v1, float16 v2, float16 v3)
+{
+ float16 ret;
+ ret.s0 = __hsail_f32_median3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_f32_median3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_f32_median3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_f32_median3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_f32_median3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_f32_median3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_f32_median3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_f32_median3(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_f32_median3(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_f32_median3(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_f32_median3(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_f32_median3(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_f32_median3(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_f32_median3(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_f32_median3(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_f32_median3(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) float amd_median3(float v1, float v2, float v3)
+{
+ return __hsail_f32_median3(v1,v2,v3);
+}
+__attribute__((overloadable,always_inline,const)) int2 amd_median3(int2 v1, int2 v2, int2 v3)
+{
+ int2 ret;
+ ret.x = __hsail_imedian3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_imedian3(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) int3 amd_median3(int3 v1, int3 v2, int3 v3)
+{
+ int3 ret;
+ ret.x = __hsail_imedian3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_imedian3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_imedian3(v1.z,v2.z, v3.z);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) int4 amd_median3(int4 v1, int4 v2, int4 v3)
+{
+ int4 ret;
+ ret.x = __hsail_imedian3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_imedian3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_imedian3(v1.z,v2.z, v3.z);
+ ret.w = __hsail_imedian3(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) int8 amd_median3(int8 v1, int8 v2, int8 v3)
+{
+ int8 ret;
+ ret.s0 = __hsail_imedian3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_imedian3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_imedian3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_imedian3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_imedian3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_imedian3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_imedian3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_imedian3(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) int16 amd_median3(int16 v1, int16 v2, int16 v3)
+{
+ int16 ret;
+ ret.s0 = __hsail_imedian3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_imedian3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_imedian3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_imedian3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_imedian3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_imedian3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_imedian3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_imedian3(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_imedian3(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_imedian3(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_imedian3(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_imedian3(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_imedian3(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_imedian3(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_imedian3(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_imedian3(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) int amd_median3(int v1, int v2, int v3)
+{
+ return __hsail_imedian3(v1,v2,v3);
+}
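For reference (not part of the committed diff), the scalar semantics behind these overloads, all of which reduce to the __hsail_*median3 intrinsics, can be sketched in plain OpenCL C:

    /* Reference-only scalar semantics using the standard min/max builtins:
     * the result is the middle value of the three operands. */
    int median3_ref(int a, int b, int c)
    {
        return max(min(a, b), min(max(a, b), c));
    }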
Added: libclc/branches/amd-builtins/amd-builtins/media/min3.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/min3.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/min3.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/min3.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_min3(uint2 v1, uint2 v2, uint2 v3)
+{
+ uint2 ret;
+ ret.x = __hsail_umin3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_umin3(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_min3(uint3 v1, uint3 v2, uint3 v3)
+{
+ uint3 ret;
+ ret.x = __hsail_umin3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_umin3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_umin3(v1.z,v2.z, v3.z);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_min3(uint4 v1, uint4 v2, uint4 v3)
+{
+ uint4 ret;
+ ret.x = __hsail_umin3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_umin3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_umin3(v1.z,v2.z, v3.z);
+ ret.w = __hsail_umin3(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_min3(uint8 v1, uint8 v2, uint8 v3)
+{
+ uint8 ret;
+ ret.s0 = __hsail_umin3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_umin3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_umin3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_umin3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_umin3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_umin3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_umin3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_umin3(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_min3(uint16 v1, uint16 v2, uint16 v3)
+{
+ uint16 ret;
+ ret.s0 = __hsail_umin3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_umin3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_umin3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_umin3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_umin3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_umin3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_umin3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_umin3(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_umin3(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_umin3(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_umin3(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_umin3(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_umin3(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_umin3(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_umin3(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_umin3(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_min3(uint v1, uint v2, uint v3)
+{
+ return __hsail_umin3(v1,v2,v3);
+}
+__attribute__((overloadable,always_inline,const)) float2 amd_min3(float2 v1, float2 v2, float2 v3)
+{
+ float2 ret;
+ ret.x = __hsail_f32_min3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_f32_min3(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) float3 amd_min3(float3 v1, float3 v2, float3 v3)
+{
+ float3 ret;
+ ret.x = __hsail_f32_min3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_f32_min3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_f32_min3(v1.z,v2.z, v3.z);
+ return ret;
+}
+
+__attribute__((overloadable,always_inline,const)) float4 amd_min3(float4 v1, float4 v2, float4 v3)
+{
+ float4 ret;
+ ret.x = __hsail_f32_min3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_f32_min3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_f32_min3(v1.z,v2.z, v3.z);
+ ret.w = __hsail_f32_min3(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) float8 amd_min3(float8 v1, float8 v2, float8 v3)
+{
+ float8 ret;
+ ret.s0 = __hsail_f32_min3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_f32_min3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_f32_min3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_f32_min3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_f32_min3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_f32_min3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_f32_min3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_f32_min3(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) float16 amd_min3(float16 v1, float16 v2, float16 v3)
+{
+ float16 ret;
+ ret.s0 = __hsail_f32_min3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_f32_min3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_f32_min3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_f32_min3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_f32_min3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_f32_min3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_f32_min3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_f32_min3(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_f32_min3(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_f32_min3(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_f32_min3(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_f32_min3(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_f32_min3(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_f32_min3(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_f32_min3(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_f32_min3(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) float amd_min3(float v1, float v2, float v3)
+{
+ return __hsail_f32_min3(v1,v2,v3);
+}
+__attribute__((overloadable,always_inline,const)) int2 amd_min3(int2 v1, int2 v2, int2 v3)
+{
+ int2 ret;
+ ret.x = __hsail_imin3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_imin3(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) int3 amd_min3(int3 v1, int3 v2, int3 v3)
+{
+ int3 ret;
+ ret.x = __hsail_imin3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_imin3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_imin3(v1.z,v2.z, v3.z);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) int4 amd_min3(int4 v1, int4 v2, int4 v3)
+{
+ int4 ret;
+ ret.x = __hsail_imin3(v1.x,v2.x, v3.x);
+ ret.y = __hsail_imin3(v1.y,v2.y,v3.y);
+ ret.z = __hsail_imin3(v1.z,v2.z, v3.z);
+ ret.w = __hsail_imin3(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) int8 amd_min3(int8 v1, int8 v2, int8 v3)
+{
+ int8 ret;
+ ret.s0 = __hsail_imin3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_imin3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_imin3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_imin3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_imin3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_imin3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_imin3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_imin3(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) int16 amd_min3(int16 v1, int16 v2, int16 v3)
+{
+ int16 ret;
+ ret.s0 = __hsail_imin3(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_imin3(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_imin3(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_imin3(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_imin3(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_imin3(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_imin3(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_imin3(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_imin3(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_imin3(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_imin3(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_imin3(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_imin3(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_imin3(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_imin3(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_imin3(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) int amd_min3(int v1, int v2, int v3)
+{
+ return __hsail_imin3(v1,v2,v3);
+}
Added: libclc/branches/amd-builtins/amd-builtins/media/mqsad.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/mqsad.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/mqsad.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/mqsad.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) ulong2 amd_mqsad(ulong2 v1, uint2 v2, ulong2 v3)
+{
+ ulong2 ret;
+ ret.x = __hsail_mqsad(v1.x,v2.x, v3.x);
+ ret.y = __hsail_mqsad(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong3 amd_mqsad(ulong3 v1, uint3 v2, ulong3 v3)
+{
+ ulong3 ret;
+ ret.x = __hsail_mqsad(v1.x,v2.x, v3.x);
+ ret.y = __hsail_mqsad(v1.y,v2.y,v3.y);
+ ret.z = __hsail_mqsad(v1.z,v2.z, v3.z);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong4 amd_mqsad(ulong4 v1, uint4 v2, ulong4 v3)
+{
+ ulong4 ret;
+ ret.x = __hsail_mqsad(v1.x,v2.x, v3.x);
+ ret.y = __hsail_mqsad(v1.y,v2.y,v3.y);
+ ret.z = __hsail_mqsad(v1.z,v2.z, v3.z);
+ ret.w = __hsail_mqsad(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong8 amd_mqsad(ulong8 v1, uint8 v2, ulong8 v3)
+{
+ ulong8 ret;
+ ret.s0 = __hsail_mqsad(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_mqsad(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_mqsad(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_mqsad(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_mqsad(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_mqsad(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_mqsad(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_mqsad(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong16 amd_mqsad(ulong16 v1, uint16 v2, ulong16 v3)
+{
+ ulong16 ret;
+ ret.s0 = __hsail_mqsad(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_mqsad(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_mqsad(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_mqsad(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_mqsad(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_mqsad(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_mqsad(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_mqsad(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_mqsad(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_mqsad(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_mqsad(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_mqsad(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_mqsad(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_mqsad(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_mqsad(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_mqsad(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong amd_mqsad(ulong v1, uint v2, ulong v3)
+{
+ return __hsail_mqsad(v1,v2,v3);
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/media/msad.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/msad.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/msad.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/msad.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_msad(uint2 v1, uint2 v2, uint2 v3)
+{
+ uint2 ret;
+ ret.x = __hsail_msad(v1.x,v2.x, v3.x);
+ ret.y = __hsail_msad(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_msad(uint3 v1, uint3 v2, uint3 v3)
+{
+ uint3 ret;
+ ret.x = __hsail_msad(v1.x,v2.x, v3.x);
+ ret.y = __hsail_msad(v1.y,v2.y,v3.y);
+ ret.z = __hsail_msad(v1.z,v2.z, v3.z);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_msad(uint4 v1, uint4 v2, uint4 v3)
+{
+ uint4 ret;
+ ret.x = __hsail_msad(v1.x,v2.x, v3.x);
+ ret.y = __hsail_msad(v1.y,v2.y,v3.y);
+ ret.z = __hsail_msad(v1.z,v2.z, v3.z);
+ ret.w = __hsail_msad(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_msad(uint8 v1, uint8 v2, uint8 v3)
+{
+ uint8 ret;
+ ret.s0 = __hsail_msad(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_msad(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_msad(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_msad(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_msad(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_msad(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_msad(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_msad(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_msad(uint16 v1, uint16 v2, uint16 v3)
+{
+ uint16 ret;
+ ret.s0 = __hsail_msad(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_msad(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_msad(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_msad(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_msad(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_msad(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_msad(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_msad(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_msad(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_msad(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_msad(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_msad(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_msad(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_msad(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_msad(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_msad(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_msad(uint v1, uint v2, uint v3)
+{
+ return __hsail_msad(v1,v2,v3);
+}
Added: libclc/branches/amd-builtins/amd-builtins/media/pack.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/pack.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/pack.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/pack.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
+#ifdef __clang__
+__attribute__((overloadable, always_inline))
+#else
+__attribute__((always_inline))
+#endif
+ uint amd_pack(float4 v)
+{
+ return __hsail_packcvt_u8x4_f32(v.s0,v.s1,v.s2,v.s3);
+}
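For reference, a rough usage sketch (not part of the committed diff). amd_pack converts the four float components to bytes via __hsail_packcvt_u8x4_f32 and packs them into a single 32-bit word; the exact rounding and saturation follow the HSAIL packcvt semantics:

    #pragma OPENCL EXTENSION cl_amd_media_ops : enable

    /* Hypothetical demo kernel: pack four float colour channels,
     * nominally in [0, 255], into one 32-bit word per pixel. */
    __kernel void pack_rgba(__global const float4 *in, __global uint *out)
    {
        size_t i = get_global_id(0);
        out[i] = amd_pack(in[i]);
    }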
Added: libclc/branches/amd-builtins/amd-builtins/media/qsad.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/qsad.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/qsad.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/qsad.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) ulong2 amd_qsad(ulong2 v1, uint2 v2, ulong2 v3)
+{
+ ulong2 ret;
+ ret.x = __hsail_qsad(v1.x,v2.x, v3.x);
+ ret.y = __hsail_qsad(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong3 amd_qsad(ulong3 v1, uint3 v2, ulong3 v3)
+{
+ ulong3 ret;
+ ret.x = __hsail_qsad(v1.x,v2.x, v3.x);
+ ret.y = __hsail_qsad(v1.y,v2.y,v3.y);
+ ret.z = __hsail_qsad(v1.z,v2.z, v3.z);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong4 amd_qsad(ulong4 v1, uint4 v2, ulong4 v3)
+{
+ ulong4 ret;
+ ret.x = __hsail_qsad(v1.x,v2.x, v3.x);
+ ret.y = __hsail_qsad(v1.y,v2.y,v3.y);
+ ret.z = __hsail_qsad(v1.z,v2.z, v3.z);
+ ret.w = __hsail_qsad(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong8 amd_qsad(ulong8 v1, uint8 v2, ulong8 v3)
+{
+ ulong8 ret;
+ ret.s0 = __hsail_qsad(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_qsad(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_qsad(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_qsad(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_qsad(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_qsad(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_qsad(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_qsad(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong16 amd_qsad(ulong16 v1, uint16 v2, ulong16 v3)
+{
+ ulong16 ret;
+ ret.s0 = __hsail_qsad(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_qsad(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_qsad(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_qsad(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_qsad(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_qsad(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_qsad(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_qsad(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_qsad(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_qsad(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_qsad(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_qsad(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_qsad(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_qsad(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_qsad(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_qsad(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong amd_qsad(ulong v1, uint v2, ulong v3)
+{
+ return __hsail_qsad(v1,v2,v3);
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/media/sad.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/sad.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/sad.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/sad.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
+__attribute__((overloadable, always_inline)) uint
+amd_sad(uint a, uint b, uint c)
+{
+ return __hsail_sad_u32_u8x4(a, b, c);
+}
+
+__attribute__((overloadable, always_inline)) uint2
+amd_sad(uint2 a, uint2 b, uint2 c)
+{
+ uint2 ret;
+ ret.x = __hsail_sad_u32_u8x4(a.x, b.x, c.x);
+ ret.y = __hsail_sad_u32_u8x4(a.y, b.y, c.y);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint3
+amd_sad(uint3 a, uint3 b, uint3 c)
+{
+
+ uint3 ret;
+ ret.x = __hsail_sad_u32_u8x4(a.x, b.x, c.x);
+ ret.y = __hsail_sad_u32_u8x4(a.y, b.y, c.y);
+ ret.z = __hsail_sad_u32_u8x4(a.z, b.z, c.z);
+ return ret;
+
+}
+
+__attribute__((overloadable, always_inline)) uint4
+amd_sad(uint4 a, uint4 b, uint4 c)
+{
+ uint4 ret;
+ ret.x = __hsail_sad_u32_u8x4(a.x, b.x, c.x);
+ ret.y = __hsail_sad_u32_u8x4(a.y, b.y, c.y);
+ ret.z = __hsail_sad_u32_u8x4(a.z, b.z, c.z);
+ ret.w = __hsail_sad_u32_u8x4(a.w, b.w, c.w);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint8
+amd_sad(uint8 a, uint8 b, uint8 c)
+{
+ uint8 ret;
+ ret.s0 = __hsail_sad_u32_u8x4(a.s0, b.s0, c.s0);
+ ret.s1 = __hsail_sad_u32_u8x4(a.s1, b.s1, c.s1);
+ ret.s2 = __hsail_sad_u32_u8x4(a.s2, b.s2, c.s2);
+ ret.s3 = __hsail_sad_u32_u8x4(a.s3, b.s3, c.s3);
+ ret.s4 = __hsail_sad_u32_u8x4(a.s4, b.s4, c.s4);
+ ret.s5 = __hsail_sad_u32_u8x4(a.s5, b.s5, c.s5);
+ ret.s6 = __hsail_sad_u32_u8x4(a.s6, b.s6, c.s6);
+ ret.s7 = __hsail_sad_u32_u8x4(a.s7, b.s7, c.s7);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint16
+amd_sad(uint16 a, uint16 b, uint16 c)
+{
+ uint16 ret;
+ ret.s0 = __hsail_sad_u32_u8x4(a.s0, b.s0, c.s0);
+ ret.s1 = __hsail_sad_u32_u8x4(a.s1, b.s1, c.s1);
+ ret.s2 = __hsail_sad_u32_u8x4(a.s2, b.s2, c.s2);
+ ret.s3 = __hsail_sad_u32_u8x4(a.s3, b.s3, c.s3);
+ ret.s4 = __hsail_sad_u32_u8x4(a.s4, b.s4, c.s4);
+ ret.s5 = __hsail_sad_u32_u8x4(a.s5, b.s5, c.s5);
+ ret.s6 = __hsail_sad_u32_u8x4(a.s6, b.s6, c.s6);
+ ret.s7 = __hsail_sad_u32_u8x4(a.s7, b.s7, c.s7);
+ ret.s8 = __hsail_sad_u32_u8x4(a.s8, b.s8, c.s8);
+ ret.s9 = __hsail_sad_u32_u8x4(a.s9, b.s9, c.s9);
+ ret.sa = __hsail_sad_u32_u8x4(a.sa, b.sa, c.sa);
+ ret.sb = __hsail_sad_u32_u8x4(a.sb, b.sb, c.sb);
+ ret.sc = __hsail_sad_u32_u8x4(a.sc, b.sc, c.sc);
+ ret.sd = __hsail_sad_u32_u8x4(a.sd, b.sd, c.sd);
+ ret.se = __hsail_sad_u32_u8x4(a.se, b.se, c.se);
+ ret.sf = __hsail_sad_u32_u8x4(a.sf, b.sf, c.sf);
+ return ret;
+}
+
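For reference (not part of the committed diff): per the cl_amd_media_ops description, amd_sad(a, b, c) accumulates the byte-wise sum of absolute differences of a and b onto c, which is what __hsail_sad_u32_u8x4 computes per component above. A reference-only scalar sketch in plain OpenCL C:

    /* Reference-only: treat each uint as four packed bytes and accumulate
     * |a_i - b_i| onto the running total c. */
    uint sad_ref(uint a, uint b, uint c)
    {
        uchar4 x = as_uchar4(a);
        uchar4 y = as_uchar4(b);
        return c + abs_diff(x.s0, y.s0) + abs_diff(x.s1, y.s1)
                 + abs_diff(x.s2, y.s2) + abs_diff(x.s3, y.s3);
    }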
Added: libclc/branches/amd-builtins/amd-builtins/media/sad4.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/sad4.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/sad4.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/sad4.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+#ifdef __clang__
+__attribute__((overloadable, always_inline))
+#else
+__attribute__((always_inline))
+#endif
+uint amd_sad4(uint4 x, uint4 y, uint z)
+{
+ uint a = __hsail_sad_u32_u8x4(x.s0,y.s0,z);
+ a = __hsail_sad_u32_u8x4(x.s1,y.s1,a);
+ a = __hsail_sad_u32_u8x4(x.s2,y.s2,a);
+
+ return __hsail_sad_u32_u8x4(x.s3,y.s3,a);
+}
+
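For reference (not part of the committed diff): amd_sad4 chains four packed-byte SADs, feeding each partial result into the next call as the accumulator. Restated with the amd_sad wrapper from sad.cl above, the reduction is:

    /* Reference-only restatement of amd_sad4 as a fold over amd_sad. */
    uint sad4_ref(uint4 x, uint4 y, uint z)
    {
        uint acc = amd_sad(x.s0, y.s0, z);
        acc = amd_sad(x.s1, y.s1, acc);
        acc = amd_sad(x.s2, y.s2, acc);
        return amd_sad(x.s3, y.s3, acc);
    }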
Added: libclc/branches/amd-builtins/amd-builtins/media/sadd.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/sadd.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/sadd.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/sadd.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_sadd(uint2 v1, uint2 v2, uint2 v3)
+{
+ uint2 ret;
+ ret.x = __hsail_sadd(v1.x,v2.x, v3.x);
+ ret.y = __hsail_sadd(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_sadd(uint3 v1, uint3 v2, uint3 v3)
+{
+ uint3 ret;
+ ret.x = __hsail_sadd(v1.x,v2.x, v3.x);
+ ret.y = __hsail_sadd(v1.y,v2.y,v3.y);
+ ret.z = __hsail_sadd(v1.z,v2.z, v3.z);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_sadd(uint4 v1, uint4 v2, uint4 v3)
+{
+ uint4 ret;
+ ret.x = __hsail_sadd(v1.x,v2.x, v3.x);
+ ret.y = __hsail_sadd(v1.y,v2.y,v3.y);
+ ret.z = __hsail_sadd(v1.z,v2.z, v3.z);
+ ret.w = __hsail_sadd(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_sadd(uint8 v1, uint8 v2, uint8 v3)
+{
+ uint8 ret;
+ ret.s0 = __hsail_sadd(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_sadd(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_sadd(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_sadd(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_sadd(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_sadd(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_sadd(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_sadd(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_sadd(uint16 v1, uint16 v2, uint16 v3)
+{
+ uint16 ret;
+ ret.s0 = __hsail_sadd(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_sadd(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_sadd(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_sadd(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_sadd(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_sadd(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_sadd(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_sadd(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_sadd(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_sadd(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_sadd(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_sadd(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_sadd(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_sadd(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_sadd(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_sadd(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_sadd(uint v1, uint v2, uint v3)
+{
+ return __hsail_sadd(v1,v2,v3);
+}
Added: libclc/branches/amd-builtins/amd-builtins/media/sadhi.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/sadhi.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/sadhi.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/sadhi.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
+__attribute__((overloadable, always_inline)) uint
+amd_sadhi(uint a, uint b, uint c)
+{
+ return __hsail_sadhi_u16x2_u8x4(a, b, c);
+}
+
+__attribute__((overloadable, always_inline)) uint2
+amd_sadhi(uint2 a, uint2 b, uint2 c)
+{
+ uint2 ret;
+ ret.x = __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x);
+ ret.y = __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint3
+amd_sadhi(uint3 a, uint3 b, uint3 c)
+{
+
+ uint3 ret;
+ ret.x = __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x);
+ ret.y = __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y);
+ ret.z = __hsail_sadhi_u16x2_u8x4(a.z, b.z, c.z);
+ return ret;
+
+}
+
+__attribute__((overloadable, always_inline)) uint4
+amd_sadhi(uint4 a, uint4 b, uint4 c)
+{
+ uint4 ret;
+ ret.x = __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x);
+ ret.y = __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y);
+ ret.z = __hsail_sadhi_u16x2_u8x4(a.z, b.z, c.z);
+ ret.w = __hsail_sadhi_u16x2_u8x4(a.w, b.w, c.w);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint8
+amd_sadhi(uint8 a, uint8 b, uint8 c)
+{
+ uint8 ret;
+ ret.s0 = __hsail_sadhi_u16x2_u8x4(a.s0, b.s0, c.s0);
+ ret.s1 = __hsail_sadhi_u16x2_u8x4(a.s1, b.s1, c.s1);
+ ret.s2 = __hsail_sadhi_u16x2_u8x4(a.s2, b.s2, c.s2);
+ ret.s3 = __hsail_sadhi_u16x2_u8x4(a.s3, b.s3, c.s3);
+ ret.s4 = __hsail_sadhi_u16x2_u8x4(a.s4, b.s4, c.s4);
+ ret.s5 = __hsail_sadhi_u16x2_u8x4(a.s5, b.s5, c.s5);
+ ret.s6 = __hsail_sadhi_u16x2_u8x4(a.s6, b.s6, c.s6);
+ ret.s7 = __hsail_sadhi_u16x2_u8x4(a.s7, b.s7, c.s7);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint16
+amd_sadhi(uint16 a, uint16 b, uint16 c)
+{
+ uint16 ret;
+ ret.s0 = __hsail_sadhi_u16x2_u8x4(a.s0, b.s0, c.s0);
+ ret.s1 = __hsail_sadhi_u16x2_u8x4(a.s1, b.s1, c.s1);
+ ret.s2 = __hsail_sadhi_u16x2_u8x4(a.s2, b.s2, c.s2);
+ ret.s3 = __hsail_sadhi_u16x2_u8x4(a.s3, b.s3, c.s3);
+ ret.s4 = __hsail_sadhi_u16x2_u8x4(a.s4, b.s4, c.s4);
+ ret.s5 = __hsail_sadhi_u16x2_u8x4(a.s5, b.s5, c.s5);
+ ret.s6 = __hsail_sadhi_u16x2_u8x4(a.s6, b.s6, c.s6);
+ ret.s7 = __hsail_sadhi_u16x2_u8x4(a.s7, b.s7, c.s7);
+ ret.s8 = __hsail_sadhi_u16x2_u8x4(a.s8, b.s8, c.s8);
+ ret.s9 = __hsail_sadhi_u16x2_u8x4(a.s9, b.s9, c.s9);
+ ret.sa = __hsail_sadhi_u16x2_u8x4(a.sa, b.sa, c.sa);
+ ret.sb = __hsail_sadhi_u16x2_u8x4(a.sb, b.sb, c.sb);
+ ret.sc = __hsail_sadhi_u16x2_u8x4(a.sc, b.sc, c.sc);
+ ret.sd = __hsail_sadhi_u16x2_u8x4(a.sd, b.sd, c.sd);
+ ret.se = __hsail_sadhi_u16x2_u8x4(a.se, b.se, c.se);
+ ret.sf = __hsail_sadhi_u16x2_u8x4(a.sf, b.sf, c.sf);
+ return ret;
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/media/sadw.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/sadw.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/sadw.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/sadw.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_sadw(uint2 v1, uint2 v2, uint2 v3)
+{
+ uint2 ret;
+ ret.x = __hsail_sadw(v1.x,v2.x, v3.x);
+ ret.y = __hsail_sadw(v1.y,v2.y,v3.y);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_sadw(uint3 v1, uint3 v2, uint3 v3)
+{
+ uint3 ret;
+ ret.x = __hsail_sadw(v1.x,v2.x, v3.x);
+ ret.y = __hsail_sadw(v1.y,v2.y,v3.y);
+ ret.z = __hsail_sadw(v1.z,v2.z, v3.z);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_sadw(uint4 v1, uint4 v2, uint4 v3)
+{
+ uint4 ret;
+ ret.x = __hsail_sadw(v1.x,v2.x, v3.x);
+ ret.y = __hsail_sadw(v1.y,v2.y,v3.y);
+ ret.z = __hsail_sadw(v1.z,v2.z, v3.z);
+ ret.w = __hsail_sadw(v1.w,v2.w,v3.w);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_sadw(uint8 v1, uint8 v2, uint8 v3)
+{
+ uint8 ret;
+ ret.s0 = __hsail_sadw(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_sadw(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_sadw(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_sadw(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_sadw(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_sadw(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_sadw(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_sadw(v1.s7,v2.s7,v3.s7);
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_sadw(uint16 v1, uint16 v2, uint16 v3)
+{
+ uint16 ret;
+ ret.s0 = __hsail_sadw(v1.s0,v2.s0,v3.s0);
+ ret.s1 = __hsail_sadw(v1.s1,v2.s1,v3.s1);
+ ret.s2 = __hsail_sadw(v1.s2,v2.s2,v3.s2);
+ ret.s3 = __hsail_sadw(v1.s3,v2.s3,v3.s3);
+ ret.s4 = __hsail_sadw(v1.s4,v2.s4,v3.s4);
+ ret.s5 = __hsail_sadw(v1.s5,v2.s5,v3.s5);
+ ret.s6 = __hsail_sadw(v1.s6,v2.s6,v3.s6);
+ ret.s7 = __hsail_sadw(v1.s7,v2.s7,v3.s7);
+ ret.s8 = __hsail_sadw(v1.s8,v2.s8,v3.s8);
+ ret.s9 = __hsail_sadw(v1.s9,v2.s9,v3.s9);
+ ret.sa = __hsail_sadw(v1.sa,v2.sa,v3.sa);
+ ret.sb = __hsail_sadw(v1.sb,v2.sb,v3.sb);
+ ret.sc = __hsail_sadw(v1.sc,v2.sc,v3.sc);
+ ret.sd = __hsail_sadw(v1.sd,v2.sd,v3.sd);
+ ret.se = __hsail_sadw(v1.se,v2.se,v3.se);
+ ret.sf = __hsail_sadw(v1.sf,v2.sf,v3.sf);
+
+ return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_sadw(uint v1, uint v2, uint v3)
+{
+ return __hsail_sadw(v1,v2,v3);
+}
Added: libclc/branches/amd-builtins/amd-builtins/media/unpack.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/unpack.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/unpack.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/unpack.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack0(uint a)
+{
+ return __hsail_unpackcvt_f32_u8x4(a,0);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack0(uint2 a)
+{
+ float2 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,0);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,0);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack0(uint3 a)
+{
+
+ float3 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,0);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,0);
+ ret.z = __hsail_unpackcvt_f32_u8x4(a.z,0);
+ return ret;
+
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack0(uint4 a)
+{
+ float4 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,0);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,0);
+ ret.z = __hsail_unpackcvt_f32_u8x4(a.z,0);
+ ret.w = __hsail_unpackcvt_f32_u8x4(a.w,0);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack0(uint8 a)
+{
+ float8 ret;
+ ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,0);
+ ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,0);
+ ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,0);
+ ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,0);
+ ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,0);
+ ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,0);
+ ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,0);
+ ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,0);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack0(uint16 a)
+{
+ float16 ret;
+ ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,0);
+ ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,0);
+ ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,0);
+ ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,0);
+ ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,0);
+ ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,0);
+ ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,0);
+ ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,0);
+ ret.s8 = __hsail_unpackcvt_f32_u8x4(a.s8,0);
+ ret.s9 = __hsail_unpackcvt_f32_u8x4(a.s9,0);
+ ret.sa = __hsail_unpackcvt_f32_u8x4(a.sa,0);
+ ret.sb = __hsail_unpackcvt_f32_u8x4(a.sb,0);
+ ret.sc = __hsail_unpackcvt_f32_u8x4(a.sc,0);
+ ret.sd = __hsail_unpackcvt_f32_u8x4(a.sd,0);
+ ret.se = __hsail_unpackcvt_f32_u8x4(a.se,0);
+ ret.sf = __hsail_unpackcvt_f32_u8x4(a.sf,0);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack1(uint a)
+{
+ return __hsail_unpackcvt_f32_u8x4(a,1);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack1(uint2 a)
+{
+ float2 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,1);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,1);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack1(uint3 a)
+{
+
+ float3 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,1);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,1);
+ ret.z = __hsail_unpackcvt_f32_u8x4(a.z,1);
+ return ret;
+
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack1(uint4 a)
+{
+ float4 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,1);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,1);
+ ret.z = __hsail_unpackcvt_f32_u8x4(a.z,1);
+ ret.w = __hsail_unpackcvt_f32_u8x4(a.w,1);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack1(uint8 a)
+{
+ float8 ret;
+ ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,1);
+ ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,1);
+ ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,1);
+ ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,1);
+ ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,1);
+ ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,1);
+ ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,1);
+ ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,1);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack1(uint16 a)
+{
+ float16 ret;
+ ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,1);
+ ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,1);
+ ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,1);
+ ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,1);
+ ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,1);
+ ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,1);
+ ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,1);
+ ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,1);
+ ret.s8 = __hsail_unpackcvt_f32_u8x4(a.s8,1);
+ ret.s9 = __hsail_unpackcvt_f32_u8x4(a.s9,1);
+ ret.sa = __hsail_unpackcvt_f32_u8x4(a.sa,1);
+ ret.sb = __hsail_unpackcvt_f32_u8x4(a.sb,1);
+ ret.sc = __hsail_unpackcvt_f32_u8x4(a.sc,1);
+ ret.sd = __hsail_unpackcvt_f32_u8x4(a.sd,1);
+ ret.se = __hsail_unpackcvt_f32_u8x4(a.se,1);
+ ret.sf = __hsail_unpackcvt_f32_u8x4(a.sf,1);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack2(uint a)
+{
+ return __hsail_unpackcvt_f32_u8x4(a,2);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack2(uint2 a)
+{
+ float2 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,2);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,2);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack2(uint3 a)
+{
+ float3 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,2);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,2);
+ ret.z = __hsail_unpackcvt_f32_u8x4(a.z,2);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack2(uint4 a)
+{
+ float4 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,2);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,2);
+ ret.z = __hsail_unpackcvt_f32_u8x4(a.z,2);
+ ret.w = __hsail_unpackcvt_f32_u8x4(a.w,2);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack2(uint8 a)
+{
+ float8 ret;
+ ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,2);
+ ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,2);
+ ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,2);
+ ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,2);
+ ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,2);
+ ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,2);
+ ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,2);
+ ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,2);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack2(uint16 a)
+{
+ float16 ret;
+ ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,2);
+ ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,2);
+ ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,2);
+ ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,2);
+ ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,2);
+ ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,2);
+ ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,2);
+ ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,2);
+ ret.s8 = __hsail_unpackcvt_f32_u8x4(a.s8,2);
+ ret.s9 = __hsail_unpackcvt_f32_u8x4(a.s9,2);
+ ret.sa = __hsail_unpackcvt_f32_u8x4(a.sa,2);
+ ret.sb = __hsail_unpackcvt_f32_u8x4(a.sb,2);
+ ret.sc = __hsail_unpackcvt_f32_u8x4(a.sc,2);
+ ret.sd = __hsail_unpackcvt_f32_u8x4(a.sd,2);
+ ret.se = __hsail_unpackcvt_f32_u8x4(a.se,2);
+ ret.sf = __hsail_unpackcvt_f32_u8x4(a.sf,2);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack3(uint a)
+{
+ return __hsail_unpackcvt_f32_u8x4(a,3);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack3(uint2 a)
+{
+ float2 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,3);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,3);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack3(uint3 a)
+{
+ float3 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,3);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,3);
+ ret.z = __hsail_unpackcvt_f32_u8x4(a.z,3);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack3(uint4 a)
+{
+ float4 ret;
+ ret.x = __hsail_unpackcvt_f32_u8x4(a.x,3);
+ ret.y = __hsail_unpackcvt_f32_u8x4(a.y,3);
+ ret.z = __hsail_unpackcvt_f32_u8x4(a.z,3);
+ ret.w = __hsail_unpackcvt_f32_u8x4(a.w,3);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack3(uint8 a)
+{
+ float8 ret;
+ ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,3);
+ ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,3);
+ ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,3);
+ ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,3);
+ ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,3);
+ ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,3);
+ ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,3);
+ ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,3);
+ return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack3(uint16 a)
+{
+ float16 ret;
+ ret.s0 = __hsail_unpackcvt_f32_u8x4(a.s0,3);
+ ret.s1 = __hsail_unpackcvt_f32_u8x4(a.s1,3);
+ ret.s2 = __hsail_unpackcvt_f32_u8x4(a.s2,3);
+ ret.s3 = __hsail_unpackcvt_f32_u8x4(a.s3,3);
+ ret.s4 = __hsail_unpackcvt_f32_u8x4(a.s4,3);
+ ret.s5 = __hsail_unpackcvt_f32_u8x4(a.s5,3);
+ ret.s6 = __hsail_unpackcvt_f32_u8x4(a.s6,3);
+ ret.s7 = __hsail_unpackcvt_f32_u8x4(a.s7,3);
+ ret.s8 = __hsail_unpackcvt_f32_u8x4(a.s8,3);
+ ret.s9 = __hsail_unpackcvt_f32_u8x4(a.s9,3);
+ ret.sa = __hsail_unpackcvt_f32_u8x4(a.sa,3);
+ ret.sb = __hsail_unpackcvt_f32_u8x4(a.sb,3);
+ ret.sc = __hsail_unpackcvt_f32_u8x4(a.sc,3);
+ ret.sd = __hsail_unpackcvt_f32_u8x4(a.sd,3);
+ ret.se = __hsail_unpackcvt_f32_u8x4(a.se,3);
+ ret.sf = __hsail_unpackcvt_f32_u8x4(a.sf,3);
+ return ret;
+}
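The amd_unpack0 .. amd_unpack3 overloads above each select one byte lane of a packed u8x4 value and convert it to float via __hsail_unpackcvt_f32_u8x4. A minimal usage sketch follows; the kernel name and buffer layout are illustrative only and are not part of this commit:

    // Hypothetical example: expand packed RGBA8 pixels into float4, one pixel per work-item.
    #pragma OPENCL EXTENSION cl_amd_media_ops : enable

    __kernel void unpack_rgba8(__global const uint *packed, __global float4 *out)
    {
        size_t gid = get_global_id(0);
        uint p = packed[gid];
        // amd_unpackN returns byte N of p converted to float (0.0f .. 255.0f).
        out[gid] = (float4)(amd_unpack0(p), amd_unpack1(p),
                            amd_unpack2(p), amd_unpack3(p));
    }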
Added: libclc/branches/amd-builtins/amd-builtins/misc/amdil-to-hsail.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/amdil-to-hsail.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/amdil-to-hsail.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/amdil-to-hsail.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+// __amdil_ to __hsail_ translation
+
+// HSAIL intrinsic functions used by math32 functions
+extern __attribute__((pure)) float __hsail_fma_f32(float, float, float);
+extern __attribute__((pure)) float __hsail_nfma_f32(float, float, float);
+extern __attribute__((pure)) float __hsail_min_f32(float, float);
+extern __attribute__((pure)) float __hsail_max_f32(float, float);
+extern __attribute__((pure)) float __hsail_ftz_f32(float);
+extern __attribute__((pure)) float __hsail_round_f32(float);
+extern __attribute__((pure)) float __hsail_floor_f32(float);
+extern __attribute__((pure)) float __hsail_ceil_f32(float);
+extern __attribute__((pure)) float __hsail_trunc_f32(float);
+extern __attribute__((pure)) float __hsail_abs_f32(float);
+
+extern __attribute__((pure)) int __hsail_min_s32(int, int);
+extern __attribute__((pure)) int __hsail_max_s32(int, int);
+extern __attribute__((pure)) uint __hsail_min_u32(uint, uint);
+extern __attribute__((pure)) uint __hsail_max_u32(uint, uint);
+extern __attribute__((pure)) int __hsail_mulhi_s32(int, int);
+extern __attribute__((pure)) uint __hsail_mulhi_u32(uint, uint);
+extern __attribute__((pure)) long __hsail_mulhi_s64(long, long);
+extern __attribute__((pure)) ulong __hsail_mulhi_u64(ulong, ulong);
+
+// HSAIL intrinsic functions used by math64 functions
+extern __attribute__((pure)) double __hsail_fma_f64(double, double, double);
+extern __attribute__((pure)) double __hsail_nfma_f64(double, double, double);
+extern __attribute__((pure)) double __hsail_max_f64(double, double);
+extern __attribute__((pure)) double __hsail_min_f64(double, double);
+extern __attribute__((pure)) double __hsail_round_f64(double);
+extern __attribute__((pure)) double __hsail_floor_f64(double);
+extern __attribute__((pure)) double __hsail_ceil_f64(double);
+extern __attribute__((pure)) double __hsail_trunc_f64(double);
+extern __attribute__((pure)) double __hsail_abs_f64(double);
+extern __attribute__((pure)) double __hsail_nrsqrt_f64(double);
+extern __attribute__((pure)) double __hsail_nsqrt_f64(double);
+
+extern __attribute__((pure)) uint __hsail_mad_u32(uint, uint, uint);
+
+// HSAIL conversion intrinsics
+extern __attribute__((pure)) float __cvt_f32_f16(uint op1);
+
+extern __attribute__((pure)) float __cvt_f16_rtz_f32(float op1);
+extern __attribute__((pure)) float __cvt_f16_rte_f32(float op1);
+extern __attribute__((pure)) float __cvt_f16_rtn_f32(float op1);
+extern __attribute__((pure)) float __cvt_f16_rtp_f32(float op1);
+
+extern __attribute__((pure)) float __cvt_f16_rtz_f64(double op1);
+extern __attribute__((pure)) float __cvt_f16_rte_f64(double op1);
+extern __attribute__((pure)) float __cvt_f16_rtn_f64(double op1);
+extern __attribute__((pure)) float __cvt_f16_rtp_f64(double op1);
+
+// Misc HSAIL intrinsic functions
+extern __attribute__((const)) uint __hsail_bitselect_u32(uint, uint, uint);
+extern __attribute__((pure)) int __hsail_class_f32(float, int);
+extern __attribute__((pure)) int __hsail_class_f64(double, int);
+extern __attribute__((pure)) int __hsail_mad24_s32(int, int, int);
+extern __attribute__((pure)) uint __hsail_mad24_u32(uint, uint, uint);
+extern __attribute__((pure)) int __hsail_mul24_s32(int, int);
+extern __attribute__((pure)) uint __hsail_mul24_u32(uint, uint);
+
+extern __attribute__((pure)) int __hsail_popcount_u32_b32(int);
+
+extern __attribute__((pure)) int __hsail_firstbit_u32(uint);
+
+extern __attribute__((pure)) float __hsail_fraction_f32(float);
+extern __attribute__((pure)) double __hsail_fraction_f64(double);
+
+// __amdil_ math32 function defs
+
+__attribute__((weak,always_inline)) float
+__amdil_div_f32(float x, float y) {
+ return native_divide(x, y);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_fma_f32(float x, float y, float z) {
+ return __hsail_fma_f32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_mad_f32(float x, float y, float z) {
+ return __hsail_nfma_f32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_min_f32(float x, float y) {
+ return __hsail_min_f32(x, y);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_max_f32(float x, float y) {
+ return __hsail_max_f32(x, y);
+}
+
+__attribute__((weak,always_inline)) float
+__ftz_f32(float x) {
+ return __hsail_ftz_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_nearest_f32(float x) {
+ return __hsail_round_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_neginf_f32(float x) {
+ return __hsail_floor_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_posinf_f32(float x) {
+ return __hsail_ceil_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_zero_f32(float x) {
+ return __hsail_trunc_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_fabs_f32(float x) {
+ return __hsail_abs_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_improved_div_f32(float x, float y) {
+ return native_divide(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imin_i32(int x, int y) {
+ return __hsail_min_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imax_i32(int x, int y) {
+ return __hsail_max_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umin_u32(uint x, uint y) {
+ return __hsail_min_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umax_u32(uint x, uint y) {
+ return __hsail_max_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imul_high_i32(int x, int y) {
+ return __hsail_mulhi_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umul_high_u32(uint x, uint y) {
+ return __hsail_mulhi_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umad_u32(uint x, uint y, uint z) {
+ return __hsail_mad_u32(x, y, z);
+}
+
+// __amdil_ math64 function defs
+
+__attribute__((weak,always_inline)) double
+__amdil_fma_f64(double x, double y, double z) {
+ return __hsail_fma_f64(x, y, z);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_mad_f64(double x, double y, double z) {
+ return __hsail_nfma_f64(x, y, z);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_max_f64(double x, double y) {
+ return __hsail_max_f64(x, y);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_nearest_f64(double x) {
+ return __hsail_round_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_neginf_f64(double x) {
+ return __hsail_floor_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_posinf_f64(double x) {
+ return __hsail_ceil_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_zero_f64(double x) {
+ return __hsail_trunc_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_min_f64(double x, double y) {
+ return __hsail_min_f64(x, y);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_fabs_f64(double x) {
+ return __hsail_abs_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_sqrt_f64(double x) {
+ return __hsail_nsqrt_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_rsq_f64(double x) {
+ return __hsail_nrsqrt_f64(x);
+}
+
+// __amdil conversion functions
+
+__attribute__((weak,always_inline)) float
+__amdil_half_to_float_f32(uint x) {
+ return __cvt_f32_f16(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_float_to_half_f32(float x) {
+ return __cvt_f16_rtz_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_float_to_half_near_f32(float x) {
+ return __cvt_f16_rte_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_float_to_half_neg_inf_f32(float x) {
+ return __cvt_f16_rtn_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_float_to_half_plus_inf_f32(float x) {
+ return __cvt_f16_rtp_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_double_to_half_f64(double x) {
+ return __cvt_f16_rtz_f64(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_double_to_half_near_f64(double x) {
+ return __cvt_f16_rte_f64(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_double_to_half_neg_inf_f64(double x) {
+ return __cvt_f16_rtn_f64(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_double_to_half_plus_inf_f64(double x) {
+ return __cvt_f16_rtp_f64(x);
+}
+
+// Misc __amdil_ function defs
+
+__attribute__((weak,always_inline)) uint
+__amdil_bfi_u32(uint x, uint y, uint z) {
+ return __hsail_bitselect_u32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_class_f32(float x, int y) {
+ int cval = __hsail_class_f32(x, y);
+ int ret = (cval & 0x1) ? (0xffffffffU) : 0;
+ return ret;
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_class_f64(double x, int y) {
+ int cval = __hsail_class_f64(x, y);
+ int ret = (cval & 0x1) ? (0xffffffffU) : 0;
+ return ret;
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imad24_i32(int x, int y, int z) {
+ return __hsail_mad24_s32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umad24_u32(uint x, uint y, uint z) {
+ return __hsail_mad24_u32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imul24_i32(int x, int y) {
+ return __hsail_mul24_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umul24_u32(uint x, uint y) {
+ return __hsail_mul24_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_count_bits_i32(int x) {
+ return __hsail_popcount_u32_b32(x);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_ffb_hi_u32(uint x) {
+ return __hsail_firstbit_u32(x);
+}
+
+//#ifdef HSAIL_SPEC_CURRENT
+__attribute__((weak,always_inline)) float
+__amdil_fraction_f32(float x) {
+ return __hsail_fraction_f32(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_fraction_f64(double x) {
+ return __hsail_fraction_f64(x);
+}
+//#endif
+
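The shims above exist so that code emitted against the legacy __amdil_* entry points resolves onto __hsail_* intrinsics; user kernels would normally call the standard built-ins (mad, fma, min, max, ...) and let the compiler lower through these definitions. Purely as an illustration of the forwarding pattern, a kernel could also call a shim directly, since they are ordinary overridable functions:

    // Hypothetical example: direct call into one of the weak __amdil_ shims defined above.
    __kernel void mad_example(__global float *a,
                              __global const float *b,
                              __global const float *c)
    {
        size_t gid = get_global_id(0);
        // Forwards to __hsail_nfma_f32, as defined in this file.
        a[gid] = __amdil_mad_f32(a[gid], b[gid], c[gid]);
    }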
Added: libclc/branches/amd-builtins/amd-builtins/misc/atomicWorkItemFence.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/atomicWorkItemFence.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/atomicWorkItemFence.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/atomicWorkItemFence.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __OPENCL_C_VERSION__ >= 200
+extern void __atomic_memfence(uint flags, uint mo, uint msc);
+enum BrigMemoryFenceSegments {
+ BRIG_MEMORY_FENCE_NONE = 0,
+ BRIG_MEMORY_FENCE_GROUP = 1,
+ BRIG_MEMORY_FENCE_GLOBAL = 2,
+ BRIG_MEMORY_FENCE_BOTH = 3,
+ BRIG_MEMORY_FENCE_IMAGE = 4
+};
+
+enum BrigMemoryOrder {
+ BRIG_MEMORY_ORDER_NONE = 0,
+ BRIG_MEMORY_ORDER_RELAXED = 1,
+ BRIG_MEMORY_ORDER_ACQUIRE = 2,
+ BRIG_MEMORY_ORDER_RELEASE = 3,
+ BRIG_MEMORY_ORDER_ACQUIRE_RELEASE = 4
+};
+
+enum BrigMemoryScope {
+ BRIG_MEMORY_SCOPE_NONE = 0,
+ BRIG_MEMORY_SCOPE_WAVEFRONT = 1,
+ BRIG_MEMORY_SCOPE_WORKGROUP = 2,
+ BRIG_MEMORY_SCOPE_COMPONENT = 3,
+ BRIG_MEMORY_SCOPE_SYSTEM = 4,
+ BRIG_MEMORY_SCOPE_WORKITEM = 5
+};
+
+static inline uint getBrigMemoryOrder(memory_order mo) {
+ switch(mo) {
+ default : return BRIG_MEMORY_ORDER_NONE;
+ case memory_order_relaxed : return BRIG_MEMORY_ORDER_RELAXED;
+ case memory_order_release : return BRIG_MEMORY_ORDER_RELEASE;
+ case memory_order_acquire : return BRIG_MEMORY_ORDER_ACQUIRE;
+ case memory_order_acq_rel :
+ case memory_order_seq_cst : return BRIG_MEMORY_ORDER_ACQUIRE_RELEASE;
+ }
+}
+
+static inline uint getBrigMemoryScope(memory_scope msc) {
+ switch(msc) {
+ default : return BRIG_MEMORY_SCOPE_NONE;
+ case memory_scope_work_group : return BRIG_MEMORY_SCOPE_WORKGROUP;
+ case memory_scope_device : return BRIG_MEMORY_SCOPE_COMPONENT;
+ case memory_scope_all_svm_devices : return BRIG_MEMORY_SCOPE_SYSTEM;
+ case memory_scope_sub_group : return BRIG_MEMORY_SCOPE_WAVEFRONT;
+ case memory_scope_work_item : return BRIG_MEMORY_SCOPE_WORKITEM;
+ }
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) void
+atomic_work_item_fence(/*cl_mem_fence_flags*/ unsigned flag, memory_order mo, memory_scope msc) {
+ uint brigSegment = 0;
+ uint brigMemoryOrder = getBrigMemoryOrder(mo);
+ uint brigMemoryScope = BRIG_MEMORY_SCOPE_WORKGROUP;
+ // relaxed fence has no effect
+ if (mo == memory_order_relaxed) return;
+ if ((flag & CLK_GLOBAL_MEM_FENCE) && (flag & CLK_LOCAL_MEM_FENCE)) {
+ brigSegment = BRIG_MEMORY_FENCE_BOTH;
+ brigMemoryScope = getBrigMemoryScope(msc);
+ }
+ else if (flag & CLK_GLOBAL_MEM_FENCE) {
+ brigSegment = BRIG_MEMORY_FENCE_GLOBAL;
+ brigMemoryScope = getBrigMemoryScope(msc);
+ }
+ else if (flag & CLK_LOCAL_MEM_FENCE) {
+ brigSegment = BRIG_MEMORY_FENCE_GROUP;
+ }
+ if (brigSegment != 0) {
+ __atomic_memfence(brigSegment, brigMemoryOrder, brigMemoryScope);
+ }
+ if (flag & CLK_IMAGE_MEM_FENCE) {
+ brigMemoryScope = getBrigMemoryScope(msc);
+ __atomic_memfence(BRIG_MEMORY_FENCE_IMAGE, BRIG_MEMORY_ORDER_ACQUIRE_RELEASE, brigMemoryScope);
+ }
+}
+#endif // __OPENCL_C_VERSION__ >= 200
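atomic_work_item_fence above maps the OpenCL 2.0 (flags, order, scope) triple onto the BRIG fence encoding and emits __atomic_memfence, skipping the call entirely for memory_order_relaxed. A minimal OpenCL 2.0 usage sketch (kernel and buffer names are illustrative only):

    // Hypothetical example: publish data to global memory, then release a flag.
    __kernel void publish(__global int *data, volatile __global atomic_int *flag)
    {
        if (get_global_id(0) == 0) {
            data[0] = 42;
            // Order the store to data[] before the flag becomes visible device-wide.
            atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE,
                                   memory_order_release, memory_scope_device);
            atomic_store_explicit(flag, 1, memory_order_relaxed, memory_scope_device);
        }
    }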
Added: libclc/branches/amd-builtins/amd-builtins/misc/awgcpy.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/awgcpy.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/awgcpy.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/awgcpy.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,2696 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern __attribute__((pure)) int __hsail_workitemid_flat(void);
+
+__attribute__((always_inline)) static event_t
+__AWGClgI1(__local uchar * dst, const __global uchar * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI1"))) event_t async_work_group_copy(__local uchar *, const __global uchar *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI1"))) event_t async_work_group_copy(__local char *, const __global char *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI1(__local uchar *dst, const __global uchar *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI1"))) event_t async_work_group_strided_copy(__local uchar *, const __global uchar *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI1"))) event_t async_work_group_strided_copy(__local char *, const __global char *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI1(__global uchar * dst, const __local uchar * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI1"))) event_t async_work_group_copy(__global uchar *, const __local uchar *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI1"))) event_t async_work_group_copy(__global char *, const __local char *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI1(__global uchar *dst, const __local uchar *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI1"))) event_t async_work_group_strided_copy(__global uchar *, const __local uchar *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI1"))) event_t async_work_group_strided_copy(__global char *, const __local char *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char *p, size_t n)
+{
+ // nothing to do
+}
+
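The rest of this file repeats the same pattern for every element type and vector width: a static worker that strides the flat work-item id across the transfer, weak aliases providing the signed and unsigned overloads, and no-op prefetch definitions. A minimal usage sketch of the public entry points (kernel and sizes are illustrative, not part of this commit):

    // Hypothetical example: stage a tile into local memory, then copy it back out.
    __kernel void stage_and_copy(__global const uchar *in, __global uchar *out,
                                 __local uchar *tile, uint n)
    {
        event_t ev = async_work_group_copy(tile, in, n, 0);
        wait_group_events(1, &ev);      // all work-items wait for the staged data

        // ... operate on tile[] here ...

        event_t done = async_work_group_copy(out, tile, n, 0);
        wait_group_events(1, &done);
    }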
+__attribute__((always_inline)) static event_t
+__AWGClgI2(__local ushort * dst, const __global ushort * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI2"))) event_t async_work_group_copy(__local ushort *, const __global ushort *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI2"))) event_t async_work_group_copy(__local short *, const __global short *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI2(__local ushort *dst, const __global ushort *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI2"))) event_t async_work_group_strided_copy(__local ushort *, const __global ushort *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI2"))) event_t async_work_group_strided_copy(__local short *, const __global short *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI2(__global ushort * dst, const __local ushort * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI2"))) event_t async_work_group_copy(__global ushort *, const __local ushort *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI2"))) event_t async_work_group_copy(__global short *, const __local short *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI2(__global ushort *dst, const __local ushort *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI2"))) event_t async_work_group_strided_copy(__global ushort *, const __local ushort *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI2"))) event_t async_work_group_strided_copy(__global short *, const __local short *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClgI4(__local uint * dst, const __global uint * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI4"))) event_t async_work_group_copy(__local uint *, const __global uint *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI4"))) event_t async_work_group_copy(__local int *, const __global int *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI4(__local uint *dst, const __global uint *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI4"))) event_t async_work_group_strided_copy(__local uint *, const __global uint *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI4"))) event_t async_work_group_strided_copy(__local int *, const __global int *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI4(__global uint * dst, const __local uint * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI4"))) event_t async_work_group_copy(__global uint *, const __local uint *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI4"))) event_t async_work_group_copy(__global int *, const __local int *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI4(__global uint *dst, const __local uint *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI4"))) event_t async_work_group_strided_copy(__global uint *, const __local uint *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI4"))) event_t async_work_group_strided_copy(__global int *, const __local int *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClgI8(__local ulong * dst, const __global ulong * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI8"))) event_t async_work_group_copy(__local ulong *, const __global ulong *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI8"))) event_t async_work_group_copy(__local long *, const __global long *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI8(__local ulong *dst, const __global ulong *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI8"))) event_t async_work_group_strided_copy(__local ulong *, const __global ulong *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI8"))) event_t async_work_group_strided_copy(__local long *, const __global long *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI8(__global ulong * dst, const __local ulong * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI8"))) event_t async_work_group_copy(__global ulong *, const __local ulong *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI8"))) event_t async_work_group_copy(__global long *, const __local long *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI8(__global ulong *dst, const __local ulong *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI8"))) event_t async_work_group_strided_copy(__global ulong *, const __local ulong *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI8"))) event_t async_work_group_strided_copy(__global long *, const __local long *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float * dst, const __global float * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float *dst, const __global float *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float * dst, const __local float * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float *dst, const __local float *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double * dst, const __global double * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double *dst, const __global double *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double * dst, const __local double * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double *dst, const __local double *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I1(__local uchar2 * dst, const __global uchar2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I1"))) event_t async_work_group_copy(__local uchar2 *, const __global uchar2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I1"))) event_t async_work_group_copy(__local char2 *, const __global char2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I1(__local uchar2 *dst, const __global uchar2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I1"))) event_t async_work_group_strided_copy(__local uchar2 *, const __global uchar2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I1"))) event_t async_work_group_strided_copy(__local char2 *, const __global char2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I1(__global uchar2 * dst, const __local uchar2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I1"))) event_t async_work_group_copy(__global uchar2 *, const __local uchar2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I1"))) event_t async_work_group_copy(__global char2 *, const __local char2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I1(__global uchar2 *dst, const __local uchar2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I1"))) event_t async_work_group_strided_copy(__global uchar2 *, const __local uchar2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I1"))) event_t async_work_group_strided_copy(__global char2 *, const __local char2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar2 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char2 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I2(__local ushort2 * dst, const __global ushort2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I2"))) event_t async_work_group_copy(__local ushort2 *, const __global ushort2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I2"))) event_t async_work_group_copy(__local short2 *, const __global short2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I2(__local ushort2 *dst, const __global ushort2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I2"))) event_t async_work_group_strided_copy(__local ushort2 *, const __global ushort2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I2"))) event_t async_work_group_strided_copy(__local short2 *, const __global short2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I2(__global ushort2 * dst, const __local ushort2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I2"))) event_t async_work_group_copy(__global ushort2 *, const __local ushort2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I2"))) event_t async_work_group_copy(__global short2 *, const __local short2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I2(__global ushort2 *dst, const __local ushort2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I2"))) event_t async_work_group_strided_copy(__global ushort2 *, const __local ushort2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I2"))) event_t async_work_group_strided_copy(__global short2 *, const __local short2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort2 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short2 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I4(__local uint2 * dst, const __global uint2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I4"))) event_t async_work_group_copy(__local uint2 *, const __global uint2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I4"))) event_t async_work_group_copy(__local int2 *, const __global int2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I4(__local uint2 *dst, const __global uint2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I4"))) event_t async_work_group_strided_copy(__local uint2 *, const __global uint2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I4"))) event_t async_work_group_strided_copy(__local int2 *, const __global int2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I4(__global uint2 * dst, const __local uint2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I4"))) event_t async_work_group_copy(__global uint2 *, const __local uint2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I4"))) event_t async_work_group_copy(__global int2 *, const __local int2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I4(__global uint2 *dst, const __local uint2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I4"))) event_t async_work_group_strided_copy(__global uint2 *, const __local uint2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I4"))) event_t async_work_group_strided_copy(__global int2 *, const __local int2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint2 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int2 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I8(__local ulong2 * dst, const __global ulong2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I8"))) event_t async_work_group_copy(__local ulong2 *, const __global ulong2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I8"))) event_t async_work_group_copy(__local long2 *, const __global long2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I8(__local ulong2 *dst, const __global ulong2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I8"))) event_t async_work_group_strided_copy(__local ulong2 *, const __global ulong2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I8"))) event_t async_work_group_strided_copy(__local long2 *, const __global long2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I8(__global ulong2 * dst, const __local ulong2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I8"))) event_t async_work_group_copy(__global ulong2 *, const __local ulong2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I8"))) event_t async_work_group_copy(__global long2 *, const __local long2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I8(__global ulong2 *dst, const __local ulong2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I8"))) event_t async_work_group_strided_copy(__global ulong2 *, const __local ulong2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I8"))) event_t async_work_group_strided_copy(__global long2 *, const __local long2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong2 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long2 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float2 * dst, const __global float2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float2 *dst, const __global float2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float2 * dst, const __local float2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float2 *dst, const __local float2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float2 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double2 * dst, const __global double2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double2 *dst, const __global double2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double2 * dst, const __local double2 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double2 *dst, const __local double2 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double2 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I1(__local uchar3 * dst, const __global uchar3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I1"))) event_t async_work_group_copy(__local uchar3 *, const __global uchar3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I1"))) event_t async_work_group_copy(__local char3 *, const __global char3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I1(__local uchar3 *dst, const __global uchar3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I1"))) event_t async_work_group_strided_copy(__local uchar3 *, const __global uchar3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I1"))) event_t async_work_group_strided_copy(__local char3 *, const __global char3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I1(__global uchar3 * dst, const __local uchar3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I1"))) event_t async_work_group_copy(__global uchar3 *, const __local uchar3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I1"))) event_t async_work_group_copy(__global char3 *, const __local char3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I1(__global uchar3 *dst, const __local uchar3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I1"))) event_t async_work_group_strided_copy(__global uchar3 *, const __local uchar3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I1"))) event_t async_work_group_strided_copy(__global char3 *, const __local char3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar3 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char3 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I2(__local ushort3 * dst, const __global ushort3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I2"))) event_t async_work_group_copy(__local ushort3 *, const __global ushort3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I2"))) event_t async_work_group_copy(__local short3 *, const __global short3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I2(__local ushort3 *dst, const __global ushort3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I2"))) event_t async_work_group_strided_copy(__local ushort3 *, const __global ushort3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I2"))) event_t async_work_group_strided_copy(__local short3 *, const __global short3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I2(__global ushort3 * dst, const __local ushort3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I2"))) event_t async_work_group_copy(__global ushort3 *, const __local ushort3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I2"))) event_t async_work_group_copy(__global short3 *, const __local short3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I2(__global ushort3 *dst, const __local ushort3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I2"))) event_t async_work_group_strided_copy(__global ushort3 *, const __local ushort3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I2"))) event_t async_work_group_strided_copy(__global short3 *, const __local short3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort3 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short3 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I4(__local uint3 * dst, const __global uint3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I4"))) event_t async_work_group_copy(__local uint3 *, const __global uint3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I4"))) event_t async_work_group_copy(__local int3 *, const __global int3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I4(__local uint3 *dst, const __global uint3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I4"))) event_t async_work_group_strided_copy(__local uint3 *, const __global uint3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I4"))) event_t async_work_group_strided_copy(__local int3 *, const __global int3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I4(__global uint3 * dst, const __local uint3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I4"))) event_t async_work_group_copy(__global uint3 *, const __local uint3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I4"))) event_t async_work_group_copy(__global int3 *, const __local int3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I4(__global uint3 *dst, const __local uint3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I4"))) event_t async_work_group_strided_copy(__global uint3 *, const __local uint3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I4"))) event_t async_work_group_strided_copy(__global int3 *, const __local int3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint3 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int3 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I8(__local ulong3 * dst, const __global ulong3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I8"))) event_t async_work_group_copy(__local ulong3 *, const __global ulong3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I8"))) event_t async_work_group_copy(__local long3 *, const __global long3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I8(__local ulong3 *dst, const __global ulong3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I8"))) event_t async_work_group_strided_copy(__local ulong3 *, const __global ulong3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I8"))) event_t async_work_group_strided_copy(__local long3 *, const __global long3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I8(__global ulong3 * dst, const __local ulong3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I8"))) event_t async_work_group_copy(__global ulong3 *, const __local ulong3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I8"))) event_t async_work_group_copy(__global long3 *, const __local long3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I8(__global ulong3 *dst, const __local ulong3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I8"))) event_t async_work_group_strided_copy(__global ulong3 *, const __local ulong3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I8"))) event_t async_work_group_strided_copy(__global long3 *, const __local long3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong3 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long3 *p, size_t n)
+{
+ // nothing to do
+}
+
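+// The floating-point vector types have no signed/unsigned counterpart to
+// share a definition with, so they get direct weak overloadable definitions
+// instead of the helper-plus-alias scheme used for the integer types.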
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float3 * dst, const __global float3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float3 *dst, const __global float3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float3 * dst, const __local float3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float3 *dst, const __local float3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float3 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double3 * dst, const __global double3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double3 *dst, const __global double3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double3 * dst, const __local double3 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double3 *dst, const __local double3 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double3 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I1(__local uchar4 * dst, const __global uchar4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I1"))) event_t async_work_group_copy(__local uchar4 *, const __global uchar4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I1"))) event_t async_work_group_copy(__local char4 *, const __global char4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I1(__local uchar4 *dst, const __global uchar4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I1"))) event_t async_work_group_strided_copy(__local uchar4 *, const __global uchar4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I1"))) event_t async_work_group_strided_copy(__local char4 *, const __global char4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I1(__global uchar4 * dst, const __local uchar4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I1"))) event_t async_work_group_copy(__global uchar4 *, const __local uchar4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I1"))) event_t async_work_group_copy(__global char4 *, const __local char4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I1(__global uchar4 *dst, const __local uchar4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I1"))) event_t async_work_group_strided_copy(__global uchar4 *, const __local uchar4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I1"))) event_t async_work_group_strided_copy(__global char4 *, const __local char4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar4 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char4 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I2(__local ushort4 * dst, const __global ushort4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I2"))) event_t async_work_group_copy(__local ushort4 *, const __global ushort4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I2"))) event_t async_work_group_copy(__local short4 *, const __global short4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I2(__local ushort4 *dst, const __global ushort4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I2"))) event_t async_work_group_strided_copy(__local ushort4 *, const __global ushort4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I2"))) event_t async_work_group_strided_copy(__local short4 *, const __global short4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I2(__global ushort4 * dst, const __local ushort4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I2"))) event_t async_work_group_copy(__global ushort4 *, const __local ushort4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I2"))) event_t async_work_group_copy(__global short4 *, const __local short4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I2(__global ushort4 *dst, const __local ushort4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I2"))) event_t async_work_group_strided_copy(__global ushort4 *, const __local ushort4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I2"))) event_t async_work_group_strided_copy(__global short4 *, const __local short4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort4 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short4 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I4(__local uint4 * dst, const __global uint4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I4"))) event_t async_work_group_copy(__local uint4 *, const __global uint4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I4"))) event_t async_work_group_copy(__local int4 *, const __global int4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I4(__local uint4 *dst, const __global uint4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I4"))) event_t async_work_group_strided_copy(__local uint4 *, const __global uint4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I4"))) event_t async_work_group_strided_copy(__local int4 *, const __global int4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I4(__global uint4 * dst, const __local uint4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I4"))) event_t async_work_group_copy(__global uint4 *, const __local uint4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I4"))) event_t async_work_group_copy(__global int4 *, const __local int4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I4(__global uint4 *dst, const __local uint4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I4"))) event_t async_work_group_strided_copy(__global uint4 *, const __local uint4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I4"))) event_t async_work_group_strided_copy(__global int4 *, const __local int4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint4 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int4 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I8(__local ulong4 * dst, const __global ulong4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I8"))) event_t async_work_group_copy(__local ulong4 *, const __global ulong4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I8"))) event_t async_work_group_copy(__local long4 *, const __global long4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I8(__local ulong4 *dst, const __global ulong4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I8"))) event_t async_work_group_strided_copy(__local ulong4 *, const __global ulong4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I8"))) event_t async_work_group_strided_copy(__local long4 *, const __global long4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I8(__global ulong4 * dst, const __local ulong4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I8"))) event_t async_work_group_copy(__global ulong4 *, const __local ulong4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I8"))) event_t async_work_group_copy(__global long4 *, const __local long4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I8(__global ulong4 *dst, const __local ulong4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I8"))) event_t async_work_group_strided_copy(__global ulong4 *, const __local ulong4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I8"))) event_t async_work_group_strided_copy(__global long4 *, const __local long4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong4 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long4 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float4 * dst, const __global float4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float4 *dst, const __global float4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float4 * dst, const __local float4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float4 *dst, const __local float4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float4 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double4 * dst, const __global double4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double4 *dst, const __global double4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double4 * dst, const __local double4 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double4 *dst, const __local double4 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double4 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I1(__local uchar8 * dst, const __global uchar8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I1"))) event_t async_work_group_copy(__local uchar8 *, const __global uchar8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I1"))) event_t async_work_group_copy(__local char8 *, const __global char8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I1(__local uchar8 *dst, const __global uchar8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I1"))) event_t async_work_group_strided_copy(__local uchar8 *, const __global uchar8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I1"))) event_t async_work_group_strided_copy(__local char8 *, const __global char8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I1(__global uchar8 * dst, const __local uchar8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I1"))) event_t async_work_group_copy(__global uchar8 *, const __local uchar8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I1"))) event_t async_work_group_copy(__global char8 *, const __local char8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I1(__global uchar8 *dst, const __local uchar8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I1"))) event_t async_work_group_strided_copy(__global uchar8 *, const __local uchar8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I1"))) event_t async_work_group_strided_copy(__global char8 *, const __local char8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar8 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char8 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I2(__local ushort8 * dst, const __global ushort8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I2"))) event_t async_work_group_copy(__local ushort8 *, const __global ushort8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I2"))) event_t async_work_group_copy(__local short8 *, const __global short8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I2(__local ushort8 *dst, const __global ushort8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I2"))) event_t async_work_group_strided_copy(__local ushort8 *, const __global ushort8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I2"))) event_t async_work_group_strided_copy(__local short8 *, const __global short8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I2(__global ushort8 * dst, const __local ushort8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I2"))) event_t async_work_group_copy(__global ushort8 *, const __local ushort8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I2"))) event_t async_work_group_copy(__global short8 *, const __local short8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I2(__global ushort8 *dst, const __local ushort8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I2"))) event_t async_work_group_strided_copy(__global ushort8 *, const __local ushort8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I2"))) event_t async_work_group_strided_copy(__global short8 *, const __local short8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort8 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short8 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I4(__local uint8 * dst, const __global uint8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I4"))) event_t async_work_group_copy(__local uint8 *, const __global uint8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I4"))) event_t async_work_group_copy(__local int8 *, const __global int8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I4(__local uint8 *dst, const __global uint8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I4"))) event_t async_work_group_strided_copy(__local uint8 *, const __global uint8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I4"))) event_t async_work_group_strided_copy(__local int8 *, const __global int8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I4(__global uint8 * dst, const __local uint8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I4"))) event_t async_work_group_copy(__global uint8 *, const __local uint8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I4"))) event_t async_work_group_copy(__global int8 *, const __local int8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I4(__global uint8 *dst, const __local uint8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I4"))) event_t async_work_group_strided_copy(__global uint8 *, const __local uint8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I4"))) event_t async_work_group_strided_copy(__global int8 *, const __local int8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint8 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int8 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I8(__local ulong8 * dst, const __global ulong8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I8"))) event_t async_work_group_copy(__local ulong8 *, const __global ulong8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I8"))) event_t async_work_group_copy(__local long8 *, const __global long8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I8(__local ulong8 *dst, const __global ulong8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I8"))) event_t async_work_group_strided_copy(__local ulong8 *, const __global ulong8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I8"))) event_t async_work_group_strided_copy(__local long8 *, const __global long8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I8(__global ulong8 * dst, const __local ulong8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I8"))) event_t async_work_group_copy(__global ulong8 *, const __local ulong8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I8"))) event_t async_work_group_copy(__global long8 *, const __local long8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I8(__global ulong8 *dst, const __local ulong8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I8"))) event_t async_work_group_strided_copy(__global ulong8 *, const __local ulong8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I8"))) event_t async_work_group_strided_copy(__global long8 *, const __local long8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong8 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long8 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float8 * dst, const __global float8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float8 *dst, const __global float8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float8 * dst, const __local float8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float8 *dst, const __local float8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float8 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double8 * dst, const __global double8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double8 *dst, const __global double8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double8 * dst, const __local double8 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double8 *dst, const __local double8 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double8 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I1(__local uchar16 * dst, const __global uchar16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I1"))) event_t async_work_group_copy(__local uchar16 *, const __global uchar16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I1"))) event_t async_work_group_copy(__local char16 *, const __global char16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I1(__local uchar16 *dst, const __global uchar16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I1"))) event_t async_work_group_strided_copy(__local uchar16 *, const __global uchar16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I1"))) event_t async_work_group_strided_copy(__local char16 *, const __global char16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I1(__global uchar16 * dst, const __local uchar16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I1"))) event_t async_work_group_copy(__global uchar16 *, const __local uchar16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I1"))) event_t async_work_group_copy(__global char16 *, const __local char16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I1(__global uchar16 *dst, const __local uchar16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I1"))) event_t async_work_group_strided_copy(__global uchar16 *, const __local uchar16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I1"))) event_t async_work_group_strided_copy(__global char16 *, const __local char16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar16 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char16 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I2(__local ushort16 * dst, const __global ushort16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I2"))) event_t async_work_group_copy(__local ushort16 *, const __global ushort16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I2"))) event_t async_work_group_copy(__local short16 *, const __global short16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I2(__local ushort16 *dst, const __global ushort16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I2"))) event_t async_work_group_strided_copy(__local ushort16 *, const __global ushort16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I2"))) event_t async_work_group_strided_copy(__local short16 *, const __global short16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I2(__global ushort16 * dst, const __local ushort16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I2"))) event_t async_work_group_copy(__global ushort16 *, const __local ushort16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I2"))) event_t async_work_group_copy(__global short16 *, const __local short16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I2(__global ushort16 *dst, const __local ushort16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I2"))) event_t async_work_group_strided_copy(__global ushort16 *, const __local ushort16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I2"))) event_t async_work_group_strided_copy(__global short16 *, const __local short16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort16 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short16 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I4(__local uint16 * dst, const __global uint16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I4"))) event_t async_work_group_copy(__local uint16 *, const __global uint16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I4"))) event_t async_work_group_copy(__local int16 *, const __global int16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I4(__local uint16 *dst, const __global uint16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I4"))) event_t async_work_group_strided_copy(__local uint16 *, const __global uint16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I4"))) event_t async_work_group_strided_copy(__local int16 *, const __global int16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I4(__global uint16 * dst, const __local uint16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I4"))) event_t async_work_group_copy(__global uint16 *, const __local uint16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I4"))) event_t async_work_group_copy(__global int16 *, const __local int16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I4(__global uint16 *dst, const __local uint16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I4"))) event_t async_work_group_strided_copy(__global uint16 *, const __local uint16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I4"))) event_t async_work_group_strided_copy(__global int16 *, const __local int16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint16 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int16 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I8(__local ulong16 * dst, const __global ulong16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I8"))) event_t async_work_group_copy(__local ulong16 *, const __global ulong16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I8"))) event_t async_work_group_copy(__local long16 *, const __global long16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I8(__local ulong16 *dst, const __global ulong16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I8"))) event_t async_work_group_strided_copy(__local ulong16 *, const __global ulong16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I8"))) event_t async_work_group_strided_copy(__local long16 *, const __global long16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I8(__global ulong16 * dst, const __local ulong16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I8"))) event_t async_work_group_copy(__global ulong16 *, const __local ulong16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I8"))) event_t async_work_group_copy(__global long16 *, const __local long16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I8(__global ulong16 *dst, const __local ulong16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I8"))) event_t async_work_group_strided_copy(__global ulong16 *, const __local ulong16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I8"))) event_t async_work_group_strided_copy(__global long16 *, const __local long16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong16 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long16 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float16 * dst, const __global float16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float16 *dst, const __global float16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float16 * dst, const __local float16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float16 *dst, const __local float16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float16 *p, size_t n)
+{
+ // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double16 * dst, const __global double16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double16 *dst, const __global double16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i*j];
+ i += d;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double16 * dst, const __local double16 * src, size_t n, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double16 *dst, const __local double16 *src, size_t n, size_t j, event_t e)
+{
+ int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+ size_t i = __hsail_workitemid_flat();
+ size_t d = ls.x * ls.y * ls.z;
+ while (i < n) {
+ dst[i*j] = src[i];
+ i += d;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double16 *p, size_t n)
+{
+ // nothing to do
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) void
+wait_group_events(int num_events, event_t *event_list)
+{
+ // Nothing to do
+}
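
A minimal usage sketch of the copy builtins above, using a hypothetical kernel that stages float16 tiles through local memory (the kernel name and tile size are illustrative only; the copy implementations already end with a barrier, and wait_group_events is a no-op here):

    __kernel void scale_tiles(__global float16 *out, const __global float16 *in)
    {
        __local float16 tile[8];

        // Stage one tile into local memory; every work-item participates.
        event_t e = async_work_group_copy(tile, in + get_group_id(0) * 8, 8, 0);
        wait_group_events(1, &e);

        size_t lid = get_local_id(0);
        if (lid < 8)
            tile[lid] *= 2.0f;
        barrier(CLK_LOCAL_MEM_FENCE);

        // Copy the scaled tile back to global memory.
        e = async_work_group_copy(out + get_group_id(0) * 8, tile, 8, 0);
        wait_group_events(1, &e);
    }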
Added: libclc/branches/amd-builtins/amd-builtins/misc/bitsel.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/bitsel.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/bitsel.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/bitsel.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern __attribute__((pure)) uint __amdil_bfi_u32(uint, uint, uint);
+
+// [u]int
+
+__attribute__((always_inline)) static uint
+__BSELI4(uint a, uint b, uint c)
+{
+ return __amdil_bfi_u32(c, b, a);
+}
+
+extern __attribute__((overloadable, alias("__BSELI4"))) uint bitselect(uint, uint, uint);
+extern __attribute__((overloadable, alias("__BSELI4"))) int bitselect(int, int, int);
+
+// float
+
+__attribute__((overloadable, always_inline)) float
+bitselect(float a, float b, float c)
+{
+ return as_float(__amdil_bfi_u32(as_uint(c), as_uint(b), as_uint(a)));
+}
+
+// [u]long
+
+// No __amdil equivalent, so use __hsail intrinsic here
+extern __attribute__((const)) ulong __hsail_bitselect_u64(ulong, ulong, ulong);
+
+__attribute__((always_inline)) static ulong
+__BSELI8(ulong a, ulong b, ulong c)
+{
+ return __hsail_bitselect_u64(c, b, a);
+}
+
+extern __attribute__((overloadable, alias("__BSELI8"))) ulong bitselect(ulong, ulong, ulong);
+extern __attribute__((overloadable, alias("__BSELI8"))) long bitselect(long, long, long);
+
+// double
+
+__attribute__((overloadable, always_inline)) double
+bitselect(double a, double b, double c)
+{
+ return as_double(__hsail_bitselect_u64(as_ulong(c), as_ulong(b), as_ulong(a)));
+}
+
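bitselect(a, b, c) takes each result bit from b where the corresponding bit of c is set and from a otherwise, which is presumably why the operands are passed to __amdil_bfi_u32 in (c, b, a) order. A small worked example:

    uint a = 0xF0F0F0F0u;
    uint b = 0x12345678u;
    uint c = 0x0000FFFFu;          // low 16 bits select from b, high 16 from a
    uint r = bitselect(a, b, c);   // r == 0xF0F05678u, i.e. (a & ~c) | (b & c)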
Added: libclc/branches/amd-builtins/amd-builtins/misc/class.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/class.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/class.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/class.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#define SNAN 0x001
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+extern __attribute__((pure)) int __amdil_class_f32(float, int);
+extern __attribute__((pure)) int __amdil_class_f64(double, int);
+
+#define FC(F,M) \
+__attribute__((overloadable, always_inline)) int \
+F(float x) \
+{ \
+ return __amdil_class_f32(x, M) & 1; \
+} \
+__attribute__((overloadable, always_inline)) int2 \
+F(float2 x) \
+{ \
+ int2 ret; \
+ ret.s0 = __amdil_class_f32(x.s0, M); \
+ ret.s1 = __amdil_class_f32(x.s1, M); \
+ return ret; \
+} \
+__attribute__((overloadable, always_inline)) int3 \
+F(float3 x) \
+{ \
+ int3 ret; \
+ ret.s0 = __amdil_class_f32(x.s0, M); \
+ ret.s1 = __amdil_class_f32(x.s1, M); \
+ ret.s2 = __amdil_class_f32(x.s2, M); \
+ return ret; \
+} \
+__attribute__((overloadable, always_inline)) int4 \
+F(float4 x) \
+{ \
+ int4 ret; \
+ ret.s0 = __amdil_class_f32(x.s0, M); \
+ ret.s1 = __amdil_class_f32(x.s1, M); \
+ ret.s2 = __amdil_class_f32(x.s2, M); \
+ ret.s3 = __amdil_class_f32(x.s3, M); \
+ return ret; \
+} \
+__attribute__((overloadable, always_inline)) int8 \
+F(float8 x) \
+{ \
+ int8 ret; \
+ ret.s0 = __amdil_class_f32(x.s0, M); \
+ ret.s1 = __amdil_class_f32(x.s1, M); \
+ ret.s2 = __amdil_class_f32(x.s2, M); \
+ ret.s3 = __amdil_class_f32(x.s3, M); \
+ ret.s4 = __amdil_class_f32(x.s4, M); \
+ ret.s5 = __amdil_class_f32(x.s5, M); \
+ ret.s6 = __amdil_class_f32(x.s6, M); \
+ ret.s7 = __amdil_class_f32(x.s7, M); \
+ return ret; \
+} \
+__attribute__((overloadable, always_inline)) int16 \
+F(float16 x) \
+{ \
+ int16 ret; \
+ ret.s0 = __amdil_class_f32(x.s0, M); \
+ ret.s1 = __amdil_class_f32(x.s1, M); \
+ ret.s2 = __amdil_class_f32(x.s2, M); \
+ ret.s3 = __amdil_class_f32(x.s3, M); \
+ ret.s4 = __amdil_class_f32(x.s4, M); \
+ ret.s5 = __amdil_class_f32(x.s5, M); \
+ ret.s6 = __amdil_class_f32(x.s6, M); \
+ ret.s7 = __amdil_class_f32(x.s7, M); \
+ ret.s8 = __amdil_class_f32(x.s8, M); \
+ ret.s9 = __amdil_class_f32(x.s9, M); \
+ ret.sa = __amdil_class_f32(x.sa, M); \
+ ret.sb = __amdil_class_f32(x.sb, M); \
+ ret.sc = __amdil_class_f32(x.sc, M); \
+ ret.sd = __amdil_class_f32(x.sd, M); \
+ ret.se = __amdil_class_f32(x.se, M); \
+ ret.sf = __amdil_class_f32(x.sf, M); \
+ return ret; \
+}
+
+
+#define DC(F,M) \
+__attribute__((overloadable, always_inline)) int \
+F(double x) \
+{ \
+ return __amdil_class_f64(x, M) & 1; \
+} \
+__attribute__((overloadable, always_inline)) long2 \
+F(double2 x) \
+{ \
+ long2 ret; \
+ ret.s0 = __amdil_class_f64(x.s0, M); \
+ ret.s1 = __amdil_class_f64(x.s1, M); \
+ return ret; \
+} \
+__attribute__((overloadable, always_inline)) long3 \
+F(double3 x) \
+{ \
+ long3 ret; \
+ ret.s0 = __amdil_class_f64(x.s0, M); \
+ ret.s1 = __amdil_class_f64(x.s1, M); \
+ ret.s2 = __amdil_class_f64(x.s2, M); \
+ return ret; \
+} \
+__attribute__((overloadable, always_inline)) long4 \
+F(double4 x) \
+{ \
+ long4 ret; \
+ ret.s0 = __amdil_class_f64(x.s0, M); \
+ ret.s1 = __amdil_class_f64(x.s1, M); \
+ ret.s2 = __amdil_class_f64(x.s2, M); \
+ ret.s3 = __amdil_class_f64(x.s3, M); \
+ return ret; \
+} \
+__attribute__((overloadable, always_inline)) long8 \
+F(double8 x) \
+{ \
+ long8 ret; \
+ ret.s0 = __amdil_class_f64(x.s0, M); \
+ ret.s1 = __amdil_class_f64(x.s1, M); \
+ ret.s2 = __amdil_class_f64(x.s2, M); \
+ ret.s3 = __amdil_class_f64(x.s3, M); \
+ ret.s4 = __amdil_class_f64(x.s4, M); \
+ ret.s5 = __amdil_class_f64(x.s5, M); \
+ ret.s6 = __amdil_class_f64(x.s6, M); \
+ ret.s7 = __amdil_class_f64(x.s7, M); \
+ return ret; \
+} \
+__attribute__((overloadable, always_inline)) long16 \
+F(double16 x) \
+{ \
+ long16 ret; \
+ ret.s0 = __amdil_class_f64(x.s0, M); \
+ ret.s1 = __amdil_class_f64(x.s1, M); \
+ ret.s2 = __amdil_class_f64(x.s2, M); \
+ ret.s3 = __amdil_class_f64(x.s3, M); \
+ ret.s4 = __amdil_class_f64(x.s4, M); \
+ ret.s5 = __amdil_class_f64(x.s5, M); \
+ ret.s6 = __amdil_class_f64(x.s6, M); \
+ ret.s7 = __amdil_class_f64(x.s7, M); \
+ ret.s8 = __amdil_class_f64(x.s8, M); \
+ ret.s9 = __amdil_class_f64(x.s9, M); \
+ ret.sa = __amdil_class_f64(x.sa, M); \
+ ret.sb = __amdil_class_f64(x.sb, M); \
+ ret.sc = __amdil_class_f64(x.sc, M); \
+ ret.sd = __amdil_class_f64(x.sd, M); \
+ ret.se = __amdil_class_f64(x.se, M); \
+ ret.sf = __amdil_class_f64(x.sf, M); \
+ return ret; \
+}
+
+FC(isfinite, (NNOR|NSUB|NZER|PZER|PSUB|PNOR))
+FC(isinf, (NINF|PINF))
+FC(isnan, (SNAN|QNAN))
+FC(isnormal, (NNOR|PNOR))
+
+DC(isfinite, (NNOR|NSUB|NZER|PZER|PSUB|PNOR))
+DC(isinf, (NINF|PINF))
+DC(isnan, (SNAN|QNAN))
+DC(isnormal, (NNOR|PNOR))
+
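For reference, the scalar arm of FC(isnan, (SNAN|QNAN)) expands to roughly the following; the & 1 folds whatever nonzero value the classify intrinsic produces down to the 0/1 result a scalar relational must return, while the vector arms store the intrinsic's per-lane result directly:

    __attribute__((overloadable, always_inline)) int
    isnan(float x)
    {
        return __amdil_class_f32(x, 0x003) & 1;   // 0x003 == SNAN | QNAN
    }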
Added: libclc/branches/amd-builtins/amd-builtins/misc/counter.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/counter.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/counter.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/counter.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifdef USE_COUNTER
+
+#pragma OPENCL EXTENSION cl_amd_atomic_counters32 : enable
+
+extern uint __amdil_append_alloc_i32(counter32_t);
+extern uint __amdil_append_consume_i32(counter32_t);
+
+__attribute__((overloadable, always_inline)) uint
+atomic_inc(counter32_t p)
+{
+ return __amdil_append_alloc_i32(p);
+}
+
+__attribute__((overloadable, always_inline)) uint
+atomic_dec(counter32_t p)
+{
+ // The instruction returns the already-decremented value, so add 1 to
+ // yield the pre-decrement value that atomic_dec is expected to return.
+ return __amdil_append_consume_i32(p) + 1U;
+}
+
+#endif
+
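A hedged usage sketch of the counter path, assuming the usual cl_amd_atomic_counters32 convention of passing the counter as a kernel argument (kernel name and filtering condition are illustrative):

    #pragma OPENCL EXTENSION cl_amd_atomic_counters32 : enable

    __kernel void compact_positive(__global const float *in,
                                   __global float *out,
                                   counter32_t n)
    {
        float v = in[get_global_id(0)];
        if (v > 0.0f) {
            uint slot = atomic_inc(n);   // append-allocates one output slot
            out[slot] = v;
        }
    }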
Added: libclc/branches/amd-builtins/amd-builtins/misc/floattointconversion.h
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/floattointconversion.h?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/floattointconversion.h (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/floattointconversion.h Tue Oct 7 12:10:46 2014
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+static inline double float_uint_to_double(uint x)
+{
+ double d;
+ float f = as_float(x);
+
+ // Fix up subnormal, if necessary
+ uint fmant = x & 0x007fffff;
+ float temp = as_float(fmant | 0x3f800000);
+ temp -= 1.0;
+ d = (float)temp;
+ ulong ld = as_ulong(d);
+ ld -= 0x07e0000000000000;
+ d = as_double(ld);
+ d = fmant ? d : 0.0;
+ d = x & 0x80000000 ? -d : d;
+ d = (f != 0.0) ? (double)f : d;
+
+ return d;
+
+}
+
+static inline uint double_to_float_uint(double d)
+{
+ uint dlow, dhigh, dsign;
+ float f = (float)d;
+ uint uf;
+
+ double dabs = (d < 0.) ? -d : d;
+
+ // Fix up subnormal
+ ulong ld;
+ ld = as_ulong(d);
+ dlow = ld;
+ dhigh = ld >> 32;
+ dsign = dhigh & 0x80000000;
+
+ int dexp = (dhigh >> 20) & 0x7ff;
+ int shiftcount = 0x381 - dexp;
+ dhigh &= 0x000fffff;
+ dhigh |= 0x00100000;
+ dhigh = (dhigh << 3) | (dlow >> 29);
+ dlow <<= 3;
+ uint extrabits = dlow << (32 - shiftcount);
+ dlow = (dlow >> shiftcount) | (dhigh << (32 - shiftcount));
+ dhigh >>= shiftcount;
+ dhigh = ((dlow > 0x80000000u) ||
+ ((dlow == 0x80000000u) && ((dhigh & 1) | extrabits))) ?
+ dhigh + 1 : dhigh;
+ uf = dhigh | dsign;
+ uf = dabs >= 7.0064923216240869000000e-046 ? uf : 0;
+
+
+ uf = f != 0. ? as_uint(f) : uf;
+ return uf;
+}
\ No newline at end of file
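These helpers carry a float bit pattern across a double and back, with dedicated fix-up paths for subnormal inputs; for ordinary values they reduce to plain conversions. A round-trip sketch with an illustrative value:

    uint bits = as_uint(1.5f);              // bit pattern of an ordinary float
    double d  = float_uint_to_double(bits); // == 1.5, via the (double)f fast path
    uint back = double_to_float_uint(d);    // == as_uint(1.5f) again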
Added: libclc/branches/amd-builtins/amd-builtins/misc/minmax.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/minmax.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/minmax.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/minmax.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//#define G(F,T,N) \
+//__attribute__((overloadable, always_inline)) T##N \
+//F(T##N x, T##N y) \
+//{ \
+// T##N ret; \
+// ret.lo = F(x.lo, y.lo); \
+// ret.hi = F(x.hi, y.hi); \
+// return ret; \
+//}
+//
+//G(min,float,16)
+//G(min,float,8)
+
+//__attribute__((overloadable, always_inline)) float4
+//min(float4 x, float4 y)
+//{
+// return __amdil_min_v4f32(x, y);
+//}
+//
+//__attribute__((overloadable, always_inline)) float3
+//min(float3 x, float3 y)
+//{
+//#if defined VEC3_BACKEND
+// return __amdil_min_v3f32(x, y);
+//#else
+// float3 ret;
+// ret.xy = min(x.xy, y.xy);
+// ret.z = min(x.z, y.z);
+// return ret;
+//#endif
+//}
+//
+//__attribute__((overloadable, always_inline)) float2
+//min(float2 x, float2 y)
+//{
+// return __amdil_min_v2f32(x, y);
+//}
+
+extern __attribute__((pure)) float __hsail_min_f32(float,float);
+
+__attribute__((weak, overloadable, always_inline)) float
+min(float x, float y)
+{
+ return __hsail_min_f32(x, y);
+}
+
+//G(min,double,16)
+//G(min,double,8)
+//G(min,double,4)
+//G(min,double,3)
+//G(min,double,2)
+
+extern __attribute__((pure)) double __hsail_min_f64(double,double);
+
+__attribute__((weak, overloadable, always_inline)) double
+min(double x, double y)
+{
+ return __hsail_min_f64(x, y);
+}
+
+//G(max,float,16)
+//G(max,float,8)
+//
+//__attribute__((overloadable, always_inline)) float4
+//max(float4 x, float4 y)
+//{
+// return __amdil_max_v4f32(x, y);
+//}
+//
+//__attribute__((overloadable, always_inline)) float3
+//max(float3 x, float3 y)
+//{
+//#if defined VEC3_BACKEND
+// return __amdil_max_v3f32(x, y);
+//#else
+// float3 ret;
+// ret.xy = max(x.xy, y.xy);
+// ret.z = max(x.z, y.z);
+// return ret;
+//#endif
+//}
+//
+//__attribute__((overloadable, always_inline)) float2
+//max(float2 x, float2 y)
+//{
+// return __amdil_max_v2f32(x, y);
+//}
+
+extern __attribute__((pure)) float __hsail_max_f32(float,float);
+
+__attribute__((weak, overloadable, always_inline)) float
+max(float x, float y)
+{
+ return __hsail_max_f32(x, y);
+}
+
+//G(max,double,16)
+//G(max,double,8)
+//G(max,double,4)
+//G(max,double,3)
+//G(max,double,2)
+
+extern __attribute__((pure)) double __hsail_max_f64(double,double);
+
+__attribute__((weak, overloadable, always_inline)) double
+max(double x, double y)
+{
+ return __hsail_max_f64(x, y);
+}
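Only the scalar float and double overloads are defined here; the commented-out G macro shows how the vector widths would be expanded pairwise (lo/hi halves) if they were not handled elsewhere.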
Added: libclc/branches/amd-builtins/amd-builtins/misc/printf_alloc.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/printf_alloc.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/printf_alloc.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/printf_alloc.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __OPENCL_C_VERSION__ >= 200
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+extern __attribute__((const)) uint __hsail_ld_kernarg_u32(uint);
+extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint);
+
+#define OFFSET 8
+
+__global char* __printf_alloc(unsigned int bytes)
+{
+ // Functionality:
+ // __get_printf_ptr is a builtin that the backend replaces with the
+ // address of the printf buffer. The first 8 bytes of the buffer returned
+ // by the call are a header and are skipped by callers:
+ //   buffer[0..3] holds the current write offset into the buffer; it is
+ //                advanced atomically by the number of bytes requested in
+ //                the function argument.
+ //   buffer[4..7] holds the total size of the buffer.
+ // If the requested allocation would run past the end of the buffer
+ // (offset + bytes > size), the buffer has overflowed and NULL is returned.
+ // The buffer size is hard-limited by sizeof(uint).
+ //
+ __global char* ptr;
+ if (sizeof(size_t) == 4)
+ ptr = (__global char*) __hsail_ld_kernarg_u32(12);
+ else
+ ptr = (__global char*) __hsail_ld_kernarg_u64(24);
+ uint size = ((global uint *)ptr)[1];
+ uint offset = atomic_load_explicit((__global atomic_uint *)ptr,
+ memory_order_acquire, memory_scope_device);
+ for (;;) {
+ if (OFFSET + offset + bytes > size)
+ return NULL;
+ if (atomic_compare_exchange_strong_explicit((__global atomic_uint *)ptr,
+ &offset, offset+bytes, memory_order_acq_rel, memory_order_acquire,
+ memory_scope_device))
+ break;
+ }
+ return ptr + OFFSET + offset;
+}
+#endif
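A hedged sketch of how a printf expansion might use __printf_alloc; the record layout written behind the returned pointer (a format-string id followed by the arguments) is an assumption for illustration, not something this file defines:

    static void emit_record(uint fmt_id, uint arg0, uint arg1)
    {
        __global char *rec = __printf_alloc(3 * sizeof(uint));
        if (rec == NULL)
            return;                         // buffer full: this printf is dropped
        __global uint *w = (__global uint *)rec;
        w[0] = fmt_id;                      // hypothetical format-string id
        w[1] = arg0;
        w[2] = arg1;
    }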
Added: libclc/branches/amd-builtins/amd-builtins/misc/relationals.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/relationals.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/relationals.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/relationals.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Vector expansions for HSAIL relationals
+
+#define UnaryRelationalVector(oty, ity, fun, mgl) \
+__attribute__((weak,always_inline)) \
+oty##16 __##fun##_16##mgl(ity##16 a) \
+{ \
+ oty##16 c; \
+ c.lo = fun(a.lo); \
+ c.hi = fun(a.hi); \
+ return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##8 __##fun##_8##mgl(ity##8 a) \
+{ \
+ oty##8 c; \
+ c.lo = fun(a.lo); \
+ c.hi = fun(a.hi); \
+ return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##4 __##fun##_4##mgl(ity##4 a) \
+{ \
+ oty##4 c; \
+ c.lo = fun(a.lo); \
+ c.hi = fun(a.hi); \
+ return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##3 __##fun##_3##mgl(ity##3 a) \
+{ \
+ oty##3 c; \
+ c.xy = fun(a.xy); \
+ c.z = fun(a.z); \
+ return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##2 __##fun##_2##mgl(ity##2 a) \
+{ \
+ oty##2 c; \
+ c.lo = fun(a.lo); \
+ c.hi = fun(a.hi); \
+ return c; \
+}
+
+UnaryRelationalVector(int, float, isfinite, f32)
+UnaryRelationalVector(long, double, isfinite, f64)
+
+UnaryRelationalVector(int, float, isinf, f32)
+UnaryRelationalVector(long, double, isinf, f64)
+
+UnaryRelationalVector(int, float, isnan, f32)
+UnaryRelationalVector(long, double, isnan, f64)
+
+UnaryRelationalVector(int, float, isnormal, f32)
+UnaryRelationalVector(long, double, isnormal, f64)
+
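For example, UnaryRelationalVector(int, float, isnan, f32) expands its 3-wide arm to roughly:

    __attribute__((weak, always_inline))
    int3 __isnan_3f32(float3 a)
    {
        int3 c;
        c.xy = isnan(a.xy);   // defer to the 2-wide overload
        c.z  = isnan(a.z);    // and the scalar overload for the tail lane
        return c;
    }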
Added: libclc/branches/amd-builtins/amd-builtins/misc/synchronization.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/synchronization.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/synchronization.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/synchronization.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern void __hsail_memfence();
+extern void __hsail_memfence_global();
+extern void __hsail_memfence_group();
+extern void __hsail_barrier();
+
+void mem_fence_impl(uint val) {
+ if (val == CLK_GLOBAL_MEM_FENCE) {
+ __hsail_memfence_global();
+ } else if (val == CLK_LOCAL_MEM_FENCE) {
+ __hsail_memfence_group();
+ } else {
+ __hsail_memfence();
+ }
+}
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+void mem_fence(uint val) {
+ mem_fence_impl(val);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+void read_mem_fence(uint val) {
+ mem_fence_impl(val);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+void write_mem_fence(uint val) {
+ mem_fence_impl(val);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline))
+void barrier(uint flags) {
+ __hsail_barrier();
+}
Added: libclc/branches/amd-builtins/amd-builtins/misc/workitem.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/workitem.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/workitem.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/workitem.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern __attribute__((const)) uint __hsail_get_global_size(uint);
+extern __attribute__((const)) uint __hsail_get_global_id(uint);
+extern __attribute__((const)) uint __hsail_workgroup_size(uint);
+extern __attribute__((const)) uint __hsail_currentworkgroup_size(uint);
+extern __attribute__((const)) uint __hsail_get_local_id(uint);
+extern __attribute__((const)) uint __hsail_get_num_groups(uint);
+extern __attribute__((const)) uint __hsail_get_group_id(uint);
+extern __attribute__((const)) uint __hsail_get_work_dim(void);
+extern __attribute__((const)) uint __hsail_ld_kernarg_u32(uint);
+extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint);
+extern __attribute__((pure)) uint __hsail_workitemid_flat(void);
+
+// FIXME - this will change to ulong soon
+extern __attribute__((pure)) uint __hsail_workitemid_flatabs(void);
+
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_offset(uint d) {
+ if (sizeof(size_t) == 4) { // 32 bit
+ switch(d) {
+ default:
+ return 0;
+ case 0:
+ return __hsail_ld_kernarg_u32(0);
+ case 1:
+ return __hsail_ld_kernarg_u32(4);
+ case 2:
+ return __hsail_ld_kernarg_u32(8);
+ }
+ } else { // 64 bit
+ switch(d) {
+ default:
+ return 0;
+ case 0:
+ return __hsail_ld_kernarg_u64(0);
+ case 1:
+ return __hsail_ld_kernarg_u64(8);
+ case 2:
+ return __hsail_ld_kernarg_u64(16);
+ }
+ }
+}
+
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_id(uint d) {
+ size_t id;
+ size_t o = get_global_offset(d);
+ switch(d) {
+ default:
+ id = 0;
+ break;
+ case 0:
+ id = __hsail_get_global_id(0);
+ break;
+ case 1:
+ id = __hsail_get_global_id(1);
+ break;
+ case 2:
+ id = __hsail_get_global_id(2);
+ break;
+ }
+
+ return o + id;
+}
+
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_local_id(uint d) {
+ switch(d) {
+ default:
+ return 0;
+ case 0:
+ return __hsail_get_local_id(0);
+ case 1:
+ return __hsail_get_local_id(1);
+ case 2:
+ return __hsail_get_local_id(2);
+ }
+}
+
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_group_id(uint d) {
+ switch(d) {
+ default:
+ return 0;
+ case 0:
+ return __hsail_get_group_id(0);
+ case 1:
+ return __hsail_get_group_id(1);
+ case 2:
+ return __hsail_get_group_id(2);
+ }
+}
+
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_size(uint d) {
+ switch(d) {
+ default:
+ return 1;
+ case 0:
+ return __hsail_get_global_size(0);
+ case 1:
+ return __hsail_get_global_size(1);
+ case 2:
+ return __hsail_get_global_size(2);
+ }
+}
+
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_local_size(uint d) {
+ switch(d) {
+ default:
+ return 1;
+ case 0:
+ return __hsail_currentworkgroup_size(0);
+ case 1:
+ return __hsail_currentworkgroup_size(1);
+ case 2:
+ return __hsail_currentworkgroup_size(2);
+ }
+}
+
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_num_groups(uint d) {
+ switch(d) {
+ default:
+ return 1;
+ case 0:
+ return __hsail_get_num_groups(0);
+ case 1:
+ return __hsail_get_num_groups(1);
+ case 2:
+ return __hsail_get_num_groups(2);
+ }
+}
+
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+uint get_work_dim() {
+ return __hsail_get_work_dim();
+}
+
+#if __OPENCL_C_VERSION__ >= 200
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_enqueued_local_size(uint d) {
+ switch(d) {
+ default:
+ return 1;
+ case 0:
+ return __hsail_workgroup_size(0);
+ case 1:
+ return __hsail_workgroup_size(1);
+ case 2:
+ return __hsail_workgroup_size(2);
+ }
+}
+
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_linear_id(void) {
+#if defined NO_WORKITEM_FLATABS
+ return (__hsail_get_global_id(2) * __hsail_get_global_size(1) +
+ __hsail_get_global_id(1)) * __hsail_get_global_size(0) +
+ __hsail_get_global_id(0);
+#else
+ return __hsail_workitemid_flatabs();
+#endif
+}
+
+#ifdef __clang__
+ __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_local_linear_id(void) {
+ return __hsail_workitemid_flat();
+}
+
+#endif
+
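As a concrete check of the NO_WORKITEM_FLATABS fallback in get_global_linear_id: with global sizes (4, 2, 2) and a work-item at global id (1, 1, 0), the expression evaluates to (0 * 2 + 1) * 4 + 1 = 5, a row-major flattening with dimension 0 varying fastest, which is the value __hsail_workitemid_flatabs is expected to return directly.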
Added: libclc/branches/amd-builtins/amd-builtins/pipes/commitp.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/commitp.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/commitp.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/commitp.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
+#define __COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__COMMIT_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+#define __COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__COMMIT_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+// Work group functions
+
+#define __WORK_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__work_group_commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__work_group_commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+#define __WORK_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__work_group_commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__work_group_commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+// sub group functions
+
+#define __SUB_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__sub_group_commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__sub_group_commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+#define __SUB_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__sub_group_commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__sub_group_commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
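The commit entry points above are deliberately empty; presumably the index reservation done in pipes.h is all the bookkeeping required. A hedged sketch of the user-level pattern they back, assuming the compiler lowers the OpenCL 2.0 pipe builtins onto these *_internal_* functions:

    __kernel void producer(__write_only pipe int out)
    {
        reserve_id_t r = reserve_write_pipe(out, 1);
        if (is_valid_reserve_id(r)) {
            int v = (int)get_global_id(0);
            write_pipe(out, r, 0, &v);
            commit_write_pipe(out, r);   // ends up in one of the no-op stubs above
        }
    }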
Added: libclc/branches/amd-builtins/amd-builtins/pipes/getp.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/getp.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/getp.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/getp.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
+#define __GET_PIPE_NUM_PACKETS_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) uint \
+__get_pipe_num_packets_internal_##SIZE(__global struct pipeimp* p) \
+{ \
+ size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+ size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+ return (uint)(wi - ri); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__GET_PIPE_NUM_PACKETS_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) uint
+__get_pipe_num_packets_internal_user(__global struct pipeimp* p, size_t size)
+{
+ size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+ size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+ return (uint)(wi - ri);
+}
+
+#define __GET_PIPE_MAX_PACKETS_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) uint \
+__get_pipe_max_packets_internal_##SIZE(__global struct pipeimp* p) \
+{ \
+ return (uint)p->end_idx; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__GET_PIPE_MAX_PACKETS_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) uint
+__get_pipe_max_packets_internal_user(__global struct pipeimp* p, size_t size)
+{
+ return (uint)p->end_idx;
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/pipes/memcpyia.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/memcpyia.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/memcpyia.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/memcpyia.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((always_inline, weak)) void
+__memcpy_internal_aligned(void *d, const void *s, size_t size, size_t align)
+{
+ if (align == 2) {
+ short *d2 = (short *)d;
+ short *s2 = (short *)s;
+ short *e2 = s2 + size/2;
+
+ while (s2 < e2)
+ *d2++ = *s2++;
+ } else if (align == 4) {
+ int *d4 = (int *)d;
+ int *s4 = (int *)s;
+ int *e4 = s4 + size/4;
+
+ while (s4 < e4)
+ *d4++ = *s4++;
+ } else if (align == 8) {
+ long *d8 = (long *)d;
+ long *s8 = (long *)s;
+ long *e8 = s8 + size/8;
+
+ while (s8 < e8)
+ *d8++ = *s8++;
+ } else if (align == 16) {
+ long2 *d16 = (long2 *)d;
+ long2 *s16 = (long2 *)s;
+ long2 *e16 = s16 + size/16;
+
+ while (s16 < e16)
+ *d16++ = *s16++;
+ } else if (align == 32 || align == 64 || align == 128) {
+ long4 *d32 = (long4 *)d;
+ long4 *s32 = (long4 *)s;
+ long4 *e32 = s32 + size/32;
+
+ while (s32 < e32)
+ *d32++ = *s32++;
+ } else {
+ char *d1 = (char *)d;
+ char *s1 = (char *)s;
+ char *e1 = s1 + size;
+
+ while (s1 < e1)
+ *d1++ = *s1++;
+ }
+}
+
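As a concrete example, __memcpy_internal_aligned(dst, src, 64, 8) takes the align == 8 branch and moves eight longs; since the end pointer is computed as s8 + size/8, any tail bytes beyond a multiple of the chunk width are not copied, so callers are expected to pass sizes that are multiples of the alignment.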
Added: libclc/branches/amd-builtins/amd-builtins/pipes/pipes.h
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/pipes.h?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/pipes.h (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/pipes.h Tue Oct 7 12:10:46 2014
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef _PIPES_H
+#define _PIPES_H 1
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+#define DO_PIPE_INTERNAL_SIZE(F) \
+F(1,uchar) \
+F(2,ushort) \
+F(4,uint) \
+F(8,ulong) \
+F(16,ulong2) \
+F(32,ulong4) \
+F(64,ulong8) \
+F(128,ulong16)
+
+struct pipeimp {
+ atomic_size_t read_idx;
+ atomic_size_t write_idx;
+ size_t end_idx;
+ uchar pad[128 - 3*sizeof(size_t)];
+ uchar packets[1];
+};
+
+extern void __memcpy_internal_aligned(void *, const void *, size_t, size_t);
+
+static inline size_t
+reserve(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
+{
+ size_t idx = atomic_load_explicit(pidx, memory_order_acquire, memory_scope_device);
+
+ for (;;) {
+ if (idx + n > lim)
+ return ~(size_t)0;
+
+ if (atomic_compare_exchange_strong_explicit(pidx, &idx, idx + n, memory_order_acq_rel, memory_order_acquire, memory_scope_device))
+ break;
+ }
+
+ return idx;
+}
+
+#endif // _PIPES_H
+
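A worked example of the reserve helper: with read_idx == 3 and end_idx == 10, a writer calls reserve(&p->write_idx, 3 + 10, n). If write_idx is currently 12, reserving n == 2 packets fails (12 + 2 > 13) and ~(size_t)0 comes back, while n == 1 succeeds, returns 12, and advances write_idx to 13 via the compare-exchange loop.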
Added: libclc/branches/amd-builtins/amd-builtins/pipes/readp.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/readp.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/readp.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/readp.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
+#define __READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__read_pipe_internal_##SIZE(__global struct pipeimp* p, STYPE* ptr) \
+{ \
+ size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+ size_t ri = reserve(&p->read_idx, wi, 1); \
+ if (ri == ~(size_t)0) \
+ return -1; \
+ \
+ *ptr = ((__global STYPE *)p->packets)[ri % p->end_idx]; \
+ \
+ if (ri == wi-1) { \
+ atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+ atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+ }\
+\
+ return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__read_pipe_internal_user( __global struct pipeimp* p, void* ptr, size_t size, size_t align)
+{
+ size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+ size_t ri = reserve(&p->read_idx, wi, 1);
+ if (ri == ~(size_t)0)
+ return -1;
+
+ __memcpy_internal_aligned(ptr, p->packets + (ri % p->end_idx)*size, size, align);
+
+ if (ri == wi-1) {
+ atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+ atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+ }
+
+ return 0;
+}
+
+#define __READ_PIPE_INDEXED_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__read_pipe_reserved_internal_##SIZE(__global struct pipeimp* p, size_t rid, uint i, STYPE* ptr) \
+{ \
+ rid += i; \
+ *ptr = ((__global STYPE *)p->packets)[rid % p->end_idx]; \
+ \
+ return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__READ_PIPE_INDEXED_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__read_pipe_reserved_internal_user(__global struct pipeimp* p, size_t rid, uint i, void *ptr, size_t size, size_t align)
+{
+ rid += i;
+
+ __memcpy_internal_aligned(ptr, p->packets + (rid % p->end_idx)*size, size, align);
+
+ return 0;
+}
+
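A hedged sketch of the simple (non-reserved) read path these functions implement, assuming the compiler lowers read_pipe on an int pipe onto __read_pipe_internal_4:

    __kernel void consumer(__read_only pipe int in, __global int *out)
    {
        int v;
        if (read_pipe(in, &v) == 0)   // 0 means a packet was dequeued
            out[get_global_id(0)] = v;
    }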
Added: libclc/branches/amd-builtins/amd-builtins/pipes/reservep.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/reservep.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/reservep.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/reservep.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#include "pipes.h"
+#include "../workgroup/wg.h"
+
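+// Read reservation for a single work-item: claim num_packets slots from
+// read_idx, bounded by the current write_idx. If the reservation drains the
+// pipe, both indices are reset to zero. A reservation id of ~(size_t)0 means
+// the reservation failed (see __is_valid_reserve_id in validp.cl).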
+#define __RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+ size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+ size_t rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+ if (rid + num_packets == wi) { \
+ atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+ atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+ } \
+ \
+ return rid; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__RESERVE_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+ size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+ size_t rid = reserve(&p->read_idx, wi, num_packets);
+
+ if (rid + num_packets == wi) {
+ atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+ atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+ }
+
+ return rid;
+}
+
+#define __RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+ size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+ size_t ei = p->end_idx; \
+ return reserve(&p->write_idx, ri + ei, num_packets); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__RESERVE_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+ size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+ size_t ei = p->end_idx;
+ return reserve(&p->write_idx, ri + ei, num_packets);
+}
+
+// Work group functions
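+//
+// Only the first work-item (local linear id 0) performs the reservation; the
+// resulting id is published through the __wg_scratch local buffer and read by
+// the rest of the work-group after a local-memory barrier.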
+
+#define __WORK_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__work_group_reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+ __local size_t *t = (__local size_t *)__wg_scratch; \
+ \
+ if ((int)get_local_linear_id() == 0) { \
+ size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+ size_t rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+ if (rid + num_packets == wi) { \
+ atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+ atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+ } \
+ \
+ *t = rid; \
+ } \
+ \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+ return *t; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__work_group_reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+ __local size_t *t = (__local size_t *)__wg_scratch;
+
+ if ((int)get_local_linear_id() == 0) {
+ size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+ size_t rid = reserve(&p->read_idx, wi, num_packets);
+
+ if (rid + num_packets == wi) {
+ atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+ atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+ }
+
+ *t = rid;
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ return *t;
+}
+
+#define __WORK_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__work_group_reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+ __local size_t *t = (__local size_t *)__wg_scratch; \
+ \
+ if ((int)get_local_linear_id() == 0) { \
+ size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+ size_t ei = p->end_idx; \
+ *t = reserve(&p->write_idx, ri + ei, num_packets); \
+ } \
+ \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+ return *t; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__work_group_reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+ __local size_t *t = (__local size_t *)__wg_scratch;
+
+ if ((int)get_local_linear_id() == 0) {
+ size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+ size_t ei = p->end_idx;
+ *t = reserve(&p->write_idx, ri + ei, num_packets);
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ return *t;
+}
+
+// sub group functions
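+//
+// Only the first lane of the sub-group performs the reservation; the id is
+// then distributed to the remaining lanes with sub_group_broadcast.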
+
+#define __SUB_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__sub_group_reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+ size_t rid = ~(size_t)0; \
+ \
+ if (get_sub_group_local_id() == 0) { \
+ size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+ rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+ if (rid + num_packets == wi) { \
+ atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+ atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+ } \
+ } \
+ \
+ return sub_group_broadcast(rid, 0); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__sub_group_reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+ size_t rid = ~(size_t)0;
+
+ if (get_sub_group_local_id() == 0) {
+ size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+ rid = reserve(&p->read_idx, wi, num_packets);
+
+ if (rid + num_packets == wi) {
+ atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+ atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+ }
+ }
+
+ return sub_group_broadcast(rid, 0);
+}
+
+#define __SUB_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__sub_group_reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+ size_t rid = ~(size_t)0; \
+ \
+ if (get_sub_group_local_id() == 0) { \
+ size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+ size_t ei = p->end_idx; \
+ rid = reserve(&p->write_idx, ri + ei, num_packets); \
+ } \
+ \
+ return sub_group_broadcast(rid, 0); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__sub_group_reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+ size_t rid = ~(size_t)0;
+
+ if (get_sub_group_local_id() == 0) {
+ size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+ size_t ei = p->end_idx;
+ rid = reserve(&p->write_idx, ri + ei, num_packets);
+ }
+
+ return sub_group_broadcast(rid, 0);
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/pipes/validp.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/validp.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/validp.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/validp.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+
+__attribute__((always_inline, weak)) bool
+__is_valid_reserve_id(size_t rid)
+{
+ return rid != ~(size_t)0;
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/pipes/writep.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/writep.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/writep.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/writep.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
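+// Single-packet write: reserve one slot from write_idx, bounded by
+// read_idx + end_idx (the pipe's capacity), then store the packet at
+// write_idx modulo end_idx. Returns -1 when the pipe is full.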
+#define __WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__write_pipe_internal_##SIZE(__global struct pipeimp* p, const STYPE* ptr) \
+{ \
+ size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+ size_t ei = p->end_idx; \
+ size_t wi = reserve(&p->write_idx, ri+ei, 1); \
+ if (wi == ~(size_t)0) \
+ return -1; \
+ \
+ ((__global STYPE *)p->packets)[wi % ei] = *ptr; \
+ return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__write_pipe_internal_user(__global struct pipeimp* p, const void* ptr, size_t size, size_t align)
+{
+ size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+ size_t ei = p->end_idx;
+ size_t wi = reserve(&p->write_idx, ri+ei, 1);
+ if (wi == ~(size_t)0)
+ return -1;
+
+ __memcpy_internal_aligned(p->packets + (wi % ei)*size, ptr, size, align);
+
+ return 0;
+}
+
+#define __WRITE_PIPE_INDEXED_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__write_pipe_reserved_internal_##SIZE(__global struct pipeimp* p, size_t rid, uint i, const STYPE* ptr) \
+{ \
+ rid += i; \
+ ((__global STYPE *)p->packets)[rid % p->end_idx] = *ptr; \
+ return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WRITE_PIPE_INDEXED_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__write_pipe_reserved_internal_user(__global struct pipeimp* p, size_t rid, uint i, const void *ptr, size_t size, size_t align)
+{
+ rid += i;
+
+ __memcpy_internal_aligned(p->packets + (rid % p->end_idx)*size, ptr, size, align);
+
+ return 0;
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subany.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subany.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subany.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subany.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern __attribute__((pure)) uint __hsail_activelanecount_wavewidth_u32_b1(bool);
+
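+// sub_group_all: true iff every active lane has a non-zero predicate, i.e.
+// the count of active lanes with the predicate set equals the count of all
+// active lanes. sub_group_any: true iff at least one active lane does.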
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) int
+sub_group_all(int predicate)
+{
+ return __hsail_activelanecount_wavewidth_u32_b1(predicate != 0) == __hsail_activelanecount_wavewidth_u32_b1(true);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) int
+sub_group_any(int predicate)
+{
+ return __hsail_activelanecount_wavewidth_u32_b1(predicate != 0) != 0;
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subbar.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subbar.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subbar.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subbar.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern void __hsail_wavebarrier(void);
+
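+// A sub-group barrier is a release fence, a wavefront barrier, then an
+// acquire fence; the single-argument form defaults the scope to
+// memory_scope_sub_group.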
+__attribute__((overloadable,weak,always_inline)) void
+sub_group_barrier(cl_mem_fence_flags flags)
+{
+ sub_group_barrier(flags, memory_scope_sub_group);
+}
+
+__attribute__((overloadable,weak,always_inline)) void
+sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope)
+{
+ // What about CLK_IMAGE_MEM_FENCE?
+ atomic_work_item_fence(flags, memory_order_release, scope);
+ __hsail_wavebarrier();
+ atomic_work_item_fence(flags, memory_order_acquire, scope);
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subbcast.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subbcast.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subbcast.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subbcast.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival);
+extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival);
+extern void __hsail_wavebarrier(void);
+
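+// Broadcast is implemented once per element size and exposed for the other
+// types through aliases: bcast32 backs the uint/int/float overloads and
+// bcast64 backs the ulong/long/double overloads.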
+__attribute__((always_inline)) static uint
+bcast32(uint a, uint lid)
+{
+ a = __hsail_activelaneshuffle_wavewidth_b32(a, lid, 0U, false);
+ __hsail_wavebarrier();
+ return a;
+}
+
+extern __attribute__((overloadable, alias("bcast32"))) uint sub_group_broadcast(uint, uint);
+extern __attribute__((overloadable, alias("bcast32"))) int sub_group_broadcast(int, uint);
+extern __attribute__((overloadable, alias("bcast32"))) float sub_group_broadcast(float, uint);
+
+
+__attribute__((always_inline)) static ulong
+bcast64(ulong a, uint lid)
+{
+ a = __hsail_activelaneshuffle_wavewidth_b64(a, lid, 0UL, false);
+ __hsail_wavebarrier();
+ return a;
+}
+
+extern __attribute__((overloadable, alias("bcast64"))) ulong sub_group_broadcast(ulong, uint);
+extern __attribute__((overloadable, alias("bcast64"))) long sub_group_broadcast(long, uint);
+extern __attribute__((overloadable, alias("bcast64"))) double sub_group_broadcast(double, uint);
+
Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subget.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subget.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subget.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subget.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern __attribute__((pure)) uint __hsail_workitemid_flat(void);
+
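+// These queries assume a fixed sub-group (wavefront) width of 64 work-items:
+// sizes are clamped to 64, sub-group counts are rounded up with
+// (wgs + 63) >> 6, and the lane id is the flat work-item id masked with 0x3f.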
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_sub_group_size(void)
+{
+ uint wgs = (uint)get_local_size(0) * (uint)get_local_size(1) * (uint)get_local_size(2);
+ uint lid = (uint)get_local_linear_id();
+ return min(64U, wgs - (lid & ~63U));
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_max_sub_group_size(void)
+{
+ uint wgs = (uint)get_enqueued_local_size(0) * get_enqueued_local_size(1) * get_enqueued_local_size(2);
+ return min(64U, wgs);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_num_sub_groups(void)
+{
+ uint wgs = (uint)get_local_size(0) * (uint)get_local_size(1) * (uint)get_local_size(2);
+ return (wgs + 63U) >> 6U;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_enqueued_num_sub_groups(void)
+{
+ uint wgs = (uint)get_enqueued_local_size(0) * get_enqueued_local_size(1) * get_enqueued_local_size(2);
+ return (wgs + 63U) >> 6U;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_sub_group_id(void)
+{
+ return __hsail_workitemid_flat() >> 6U;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_sub_group_local_id(void)
+{
+ return __hsail_workitemid_flat() & 0x3fU;
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subreduce.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subreduce.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subreduce.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subreduce.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern uint __hsail_get_lane_id(void);
+extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival);
+extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival);
+extern void __hsail_wavebarrier(void);
+
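+// Reductions use a butterfly exchange across the 64-lane wavefront: at each
+// step a lane combines its value with the value of the lane whose id differs
+// in one bit (lid^1, lid^2, ..., lid^32), with a wave barrier between steps,
+// so every lane ends up holding the full reduction.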
+#define GENA(TY,SZ,AO,AI,Z) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_reduce_add(TY a) \
+{ \
+ uint lid = __hsail_get_lane_id(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^1, AI(Z), false)); \
+ __hsail_wavebarrier(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^2, AI(Z), false)); \
+ __hsail_wavebarrier(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^4, AI(Z), false)); \
+ __hsail_wavebarrier(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^8, AI(Z), false)); \
+ __hsail_wavebarrier(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^16, AI(Z), false)); \
+ __hsail_wavebarrier(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^32, AI(Z), false)); \
+ __hsail_wavebarrier(); \
+ return a; \
+}
+
+GENA(int,32,as_int,as_uint,0)
+GENA(uint,32,,,0U)
+GENA(long,64,as_long,as_ulong,0L)
+GENA(ulong,64,,,0UL)
+GENA(float,32,as_float,as_uint,0.0f)
+GENA(double,64,as_double,as_ulong,0.0)
+
+#define GENO(TY,SZ,OP,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_reduce_##OP(TY a) \
+{ \
+ uint lid = __hsail_get_lane_id(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^1, AI(ID), false))); \
+ __hsail_wavebarrier(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^2, AI(ID), false))); \
+ __hsail_wavebarrier(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^4, AI(ID), false))); \
+ __hsail_wavebarrier(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^8, AI(ID), false))); \
+ __hsail_wavebarrier(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^16, AI(ID), false))); \
+ __hsail_wavebarrier(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^32, AI(ID), false))); \
+ __hsail_wavebarrier(); \
+ return a; \
+}
+
+GENO(int,32,min,as_int,as_uint,INT_MAX)
+GENO(uint,32,min,,,UINT_MAX)
+GENO(long,64,min,as_long,as_ulong,LONG_MAX)
+GENO(ulong,64,min,,,ULONG_MAX)
+GENO(float,32,min,as_float,as_uint,INFINITY)
+GENO(double,64,min,as_double,as_ulong,(double)INFINITY)
+
+GENO(int,32,max,as_int,as_uint,INT_MIN)
+GENO(uint,32,max,,,0U)
+GENO(long,64,max,as_long,as_ulong,LONG_MIN)
+GENO(ulong,64,max,,,0UL)
+GENO(float,32,max,as_float,as_uint,-INFINITY)
+GENO(double,64,max,as_double,as_ulong,-(double)INFINITY)
+
+#endif
+
Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subscan.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subscan.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subscan.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subscan.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern uint __hsail_get_lane_id(void);
+extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival);
+extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival);
+extern void __hsail_wavebarrier(void);
+
+// Define exclusive in terms of inclusive
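+//
+// The exclusive scan computes the inclusive scan and then shifts the result
+// down by one lane; lane 0 receives the operation's identity value instead.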
+
+#define EGEN(TY,OP,SZ,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_scan_exclusive_##OP(TY a) \
+{ \
+ a = sub_group_scan_inclusive_##OP(a); \
+ uint lid = __hsail_get_lane_id(); \
+ a = AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1)); \
+ return a; \
+}
+
+EGEN(int,add,32,as_int,as_uint,0)
+EGEN(int,min,32,as_int,as_uint,INT_MAX)
+EGEN(int,max,32,as_int,as_uint,INT_MIN)
+
+EGEN(uint,add,32,,,0)
+EGEN(uint,min,32,,,UINT_MAX)
+EGEN(uint,max,32,,,0U)
+
+EGEN(long,add,64,as_long,as_ulong,0L)
+EGEN(long,min,64,as_long,as_ulong,LONG_MAX)
+EGEN(long,max,64,as_long,as_ulong,LONG_MIN)
+
+EGEN(ulong,add,64,,,0UL)
+EGEN(ulong,min,64,,,ULONG_MAX)
+EGEN(ulong,max,64,,,0UL)
+
+EGEN(float,add,32,as_float,as_uint,0.0f)
+EGEN(float,min,32,as_float,as_uint,INFINITY)
+EGEN(float,max,32,as_float,as_uint,-INFINITY)
+
+EGEN(double,add,64,as_double,as_ulong,0.0)
+EGEN(double,min,64,as_double,as_ulong,(double)INFINITY)
+EGEN(double,max,64,as_double,as_ulong,-(double)INFINITY)
+
+// Now inclusive scan
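+//
+// The inclusive scan is a Hillis-Steele style scan over the 64-lane
+// wavefront: at offsets 1, 2, 4, 8, 16 and 32 each lane accumulates the value
+// of the lane that many positions below it, substituting the identity when no
+// such lane exists.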
+
+#define IGENA(TY,SZ,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_scan_inclusive_add(TY a) \
+{ \
+ uint lid = __hsail_get_lane_id(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1)); \
+ __hsail_wavebarrier(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-2)&0x3f, AI(ID), lid < 2)); \
+ __hsail_wavebarrier(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-4)&0x3f, AI(ID), lid < 4)); \
+ __hsail_wavebarrier(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-8)&0x3f, AI(ID), lid < 8)); \
+ __hsail_wavebarrier(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-16)&0x3f, AI(ID), lid < 16)); \
+ __hsail_wavebarrier(); \
+ a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-32)&0x3f, AI(ID), lid < 32)); \
+ __hsail_wavebarrier(); \
+ return a; \
+}
+
+#define IGENO(TY,SZ,OP,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_scan_inclusive_##OP(TY a) \
+{ \
+ uint lid = __hsail_get_lane_id(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1))); \
+ __hsail_wavebarrier(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-2)&0x3f, AI(ID), lid < 2))); \
+ __hsail_wavebarrier(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-4)&0x3f, AI(ID), lid < 4))); \
+ __hsail_wavebarrier(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-8)&0x3f, AI(ID), lid < 8))); \
+ __hsail_wavebarrier(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-16)&0x3f, AI(ID), lid < 16))); \
+ __hsail_wavebarrier(); \
+ a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-32)&0x3f, AI(ID), lid < 32))); \
+ __hsail_wavebarrier(); \
+ return a; \
+}
+
+IGENA(int,32,as_int,as_uint,0)
+IGENO(int,32,min,as_int,as_uint,INT_MAX)
+IGENO(int,32,max,as_int,as_uint,INT_MIN)
+
+IGENA(uint,32,,,0U)
+IGENO(uint,32,min,,,UINT_MAX)
+IGENO(uint,32,max,,,0U)
+
+IGENA(long,64,as_long,as_ulong,0L)
+IGENO(long,64,min,as_long,as_ulong,LONG_MAX)
+IGENO(long,64,max,as_long,as_ulong,LONG_MIN)
+
+IGENA(ulong,64,,,0UL)
+IGENO(ulong,64,min,,,ULONG_MAX)
+IGENO(ulong,64,max,,,0UL)
+
+IGENA(float,32,as_float,as_uint,0.0f)
+IGENO(float,32,min,as_float,as_uint,INFINITY)
+IGENO(float,32,max,as_float,as_uint,-INFINITY)
+
+IGENA(double,64,as_double,as_ulong,0.0)
+IGENO(double,64,min,as_double,as_ulong,(double)INFINITY)
+IGENO(double,64,max,as_double,as_ulong,-(double)INFINITY)
+
Added: libclc/branches/amd-builtins/amd-builtins/vldst/f16_f32.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/vldst/f16_f32.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/vldst/f16_f32.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/vldst/f16_f32.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern float __amdil_half_to_float_f32(uint op1);
+
+extern float __amdil_float_to_half_f32(float op1);
+extern float __amdil_float_to_half_near_f32(float op1);
+extern float __amdil_float_to_half_neg_inf_f32(float op1);
+extern float __amdil_float_to_half_plus_inf_f32(float op1);
+
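+// Rounding-mode suffixes: rte = round to nearest even, rtp = toward +inf,
+// rtn = toward -inf, rtz = toward zero, cur = the current rounding mode
+// (assumed below to be RTE). Vector variants are built up recursively from
+// the scalar conversion.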
+// half -> float
+__attribute__((always_inline)) float
+__cvt_f16_to_f32(ushort a)
+{
+ return __amdil_half_to_float_f32((uint)a);
+}
+
+__attribute__((always_inline)) float2
+__cvt_2f16_to_2f32(ushort2 ush)
+{
+ float2 ret;
+ ret.s0 = __cvt_f16_to_f32(ush.s0);
+ ret.s1 = __cvt_f16_to_f32(ush.s1);
+ return ret;
+}
+
+__attribute__((always_inline)) float3
+__cvt_3f16_to_3f32(ushort3 ush)
+{
+ float3 ret;
+ ret.lo = __cvt_2f16_to_2f32(ush.lo);
+ ret.s2 = __cvt_f16_to_f32(ush.s2);
+ return ret;
+}
+
+__attribute__((always_inline)) float4
+__cvt_4f16_to_4f32(ushort4 ush)
+{
+ float4 ret;
+ ret.lo = __cvt_2f16_to_2f32(ush.lo);
+ ret.hi = __cvt_2f16_to_2f32(ush.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) float8
+__cvt_8f16_to_8f32(ushort8 ush)
+{
+ float8 ret;
+ ret.lo = __cvt_4f16_to_4f32(ush.lo);
+ ret.hi = __cvt_4f16_to_4f32(ush.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) float16
+__cvt_16f16_to_16f32(ushort16 ush)
+{
+ float16 ret;
+ ret.lo = __cvt_8f16_to_8f32(ush.lo);
+ ret.hi = __cvt_8f16_to_8f32(ush.hi);
+ return ret;
+}
+
+// float -> half rte
+__attribute__((always_inline)) ushort
+__cvt_f32_to_f16_rte(float a)
+{
+ return (ushort)as_uint(__amdil_float_to_half_near_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rte(float2 f)
+{
+ ushort2 ret;
+ ret.s0 = __cvt_f32_to_f16_rte(f.s0);
+ ret.s1 = __cvt_f32_to_f16_rte(f.s1);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rte(float3 f)
+{
+ ushort3 ret;
+ ret.lo = __cvt_2f32_to_2f16_rte(f.lo);
+ ret.s2 = __cvt_f32_to_f16_rte(f.s2);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rte(float4 f)
+{
+ ushort4 ret;
+ ret.lo = __cvt_2f32_to_2f16_rte(f.lo);
+ ret.hi = __cvt_2f32_to_2f16_rte(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rte(float8 f)
+{
+ ushort8 ret;
+ ret.lo = __cvt_4f32_to_4f16_rte(f.lo);
+ ret.hi = __cvt_4f32_to_4f16_rte(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rte(float16 f)
+{
+ ushort16 ret;
+ ret.lo = __cvt_8f32_to_8f16_rte(f.lo);
+ ret.hi = __cvt_8f32_to_8f16_rte(f.hi);
+ return ret;
+}
+
+// float -> half cur
+// XXX assumes RTE
+__attribute__((always_inline)) ushort
+__cvt_f32_to_f16_cur(float f)
+{
+ return __cvt_f32_to_f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_cur(float2 f)
+{
+ return __cvt_2f32_to_2f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_cur(float3 f)
+{
+ return __cvt_3f32_to_3f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_cur(float4 f)
+{
+ return __cvt_4f32_to_4f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_cur(float8 f)
+{
+ return __cvt_8f32_to_8f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_cur(float16 f)
+{
+ return __cvt_16f32_to_16f16_rte(f);
+}
+
+//float -> half rtp
+
+ushort
+__cvt_f32_to_f16_rtp(float a)
+{
+ return (ushort)as_uint(__amdil_float_to_half_plus_inf_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rtp(float2 f)
+{
+ ushort2 ret;
+ ret.s0 = __cvt_f32_to_f16_rtp(f.s0);
+ ret.s1 = __cvt_f32_to_f16_rtp(f.s1);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rtp(float3 f)
+{
+ ushort3 ret;
+ ret.lo = __cvt_2f32_to_2f16_rtp(f.lo);
+ ret.s2 = __cvt_f32_to_f16_rtp(f.s2);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rtp(float4 f)
+{
+ ushort4 ret;
+ ret.lo = __cvt_2f32_to_2f16_rtp(f.lo);
+ ret.hi = __cvt_2f32_to_2f16_rtp(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rtp(float8 f)
+{
+ ushort8 ret;
+ ret.lo = __cvt_4f32_to_4f16_rtp(f.lo);
+ ret.hi = __cvt_4f32_to_4f16_rtp(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rtp(float16 f)
+{
+ ushort16 ret;
+ ret.lo = __cvt_8f32_to_8f16_rtp(f.lo);
+ ret.hi = __cvt_8f32_to_8f16_rtp(f.hi);
+ return ret;
+}
+
+// float -> half rtn
+
+ushort
+__cvt_f32_to_f16_rtn(float a)
+{
+ return (ushort)as_uint(__amdil_float_to_half_neg_inf_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rtn(float2 f)
+{
+ ushort2 ret;
+ ret.s0 = __cvt_f32_to_f16_rtn(f.s0);
+ ret.s1 = __cvt_f32_to_f16_rtn(f.s1);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rtn(float3 f)
+{
+ ushort3 ret;
+ ret.lo = __cvt_2f32_to_2f16_rtn(f.lo);
+ ret.s2 = __cvt_f32_to_f16_rtn(f.s2);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rtn(float4 f)
+{
+ ushort4 ret;
+ ret.lo = __cvt_2f32_to_2f16_rtn(f.lo);
+ ret.hi = __cvt_2f32_to_2f16_rtn(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rtn(float8 f)
+{
+ ushort8 ret;
+ ret.lo = __cvt_4f32_to_4f16_rtn(f.lo);
+ ret.hi = __cvt_4f32_to_4f16_rtn(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rtn(float16 f)
+{
+ ushort16 ret;
+ ret.lo = __cvt_8f32_to_8f16_rtn(f.lo);
+ ret.hi = __cvt_8f32_to_8f16_rtn(f.hi);
+ return ret;
+}
+
+// float -> half rtz
+
+ushort
+__cvt_f32_to_f16_rtz(float a)
+{
+ return (ushort)as_uint(__amdil_float_to_half_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rtz(float2 f)
+{
+ ushort2 ret;
+ ret.s0 = __cvt_f32_to_f16_rtz(f.s0);
+ ret.s1 = __cvt_f32_to_f16_rtz(f.s1);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rtz(float3 f)
+{
+ ushort3 ret;
+ ret.lo = __cvt_2f32_to_2f16_rtz(f.lo);
+ ret.s2 = __cvt_f32_to_f16_rtz(f.s2);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rtz(float4 f)
+{
+ ushort4 ret;
+ ret.lo = __cvt_2f32_to_2f16_rtz(f.lo);
+ ret.hi = __cvt_2f32_to_2f16_rtz(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rtz(float8 f)
+{
+ ushort8 ret;
+ ret.lo = __cvt_4f32_to_4f16_rtz(f.lo);
+ ret.hi = __cvt_4f32_to_4f16_rtz(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rtz(float16 f)
+{
+ ushort16 ret;
+ ret.lo = __cvt_8f32_to_8f16_rtz(f.lo);
+ ret.hi = __cvt_8f32_to_8f16_rtz(f.hi);
+ return ret;
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/vldst/f64_f16.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/vldst/f64_f16.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/vldst/f64_f16.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/vldst/f64_f16.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern float __amdil_double_to_half_f64(double op1);
+extern float __amdil_double_to_half_near_f64(double op1);
+extern float __amdil_double_to_half_neg_inf_f64(double op1);
+extern float __amdil_double_to_half_plus_inf_f64(double op1);
+
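+// Same rounding-mode suffixes as f16_f32.cl: rte, rtp, rtn, rtz, and cur
+// (which assumes RTE). Vector variants are built up from the scalar
+// conversion.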
+// double -> half rte
+__attribute__((always_inline)) ushort
+__cvt_f64_to_f16_rte(double a)
+{
+ return (ushort)as_uint(__amdil_double_to_half_near_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rte(double2 f)
+{
+ ushort2 ret;
+ ret.s0 = __cvt_f64_to_f16_rte(f.s0);
+ ret.s1 = __cvt_f64_to_f16_rte(f.s1);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rte(double3 f)
+{
+ ushort3 ret;
+ ret.lo = __cvt_2f64_to_2f16_rte(f.lo);
+ ret.s2 = __cvt_f64_to_f16_rte(f.s2);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rte(double4 f)
+{
+ ushort4 ret;
+ ret.lo = __cvt_2f64_to_2f16_rte(f.lo);
+ ret.hi = __cvt_2f64_to_2f16_rte(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rte(double8 f)
+{
+ ushort8 ret;
+ ret.lo = __cvt_4f64_to_4f16_rte(f.lo);
+ ret.hi = __cvt_4f64_to_4f16_rte(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rte(double16 f)
+{
+ ushort16 ret;
+ ret.lo = __cvt_8f64_to_8f16_rte(f.lo);
+ ret.hi = __cvt_8f64_to_8f16_rte(f.hi);
+ return ret;
+}
+
+// double -> half cur
+// XXX assumes RTE
+__attribute__((always_inline)) ushort
+__cvt_f64_to_f16_cur(double f)
+{
+ return __cvt_f64_to_f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_cur(double2 f)
+{
+ return __cvt_2f64_to_2f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_cur(double3 f)
+{
+ return __cvt_3f64_to_3f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_cur(double4 f)
+{
+ return __cvt_4f64_to_4f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_cur(double8 f)
+{
+ return __cvt_8f64_to_8f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_cur(double16 f)
+{
+ return __cvt_16f64_to_16f16_rte(f);
+}
+
+//double -> half rtp
+
+ushort
+__cvt_f64_to_f16_rtp(double a)
+{
+ return (ushort)as_uint(__amdil_double_to_half_plus_inf_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rtp(double2 f)
+{
+ ushort2 ret;
+ ret.s0 = __cvt_f64_to_f16_rtp(f.s0);
+ ret.s1 = __cvt_f64_to_f16_rtp(f.s1);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rtp(double3 f)
+{
+ ushort3 ret;
+ ret.lo = __cvt_2f64_to_2f16_rtp(f.lo);
+ ret.s2 = __cvt_f64_to_f16_rtp(f.s2);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rtp(double4 f)
+{
+ ushort4 ret;
+ ret.lo = __cvt_2f64_to_2f16_rtp(f.lo);
+ ret.hi = __cvt_2f64_to_2f16_rtp(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rtp(double8 f)
+{
+ ushort8 ret;
+ ret.lo = __cvt_4f64_to_4f16_rtp(f.lo);
+ ret.hi = __cvt_4f64_to_4f16_rtp(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rtp(double16 f)
+{
+ ushort16 ret;
+ ret.lo = __cvt_8f64_to_8f16_rtp(f.lo);
+ ret.hi = __cvt_8f64_to_8f16_rtp(f.hi);
+ return ret;
+}
+
+// double -> half rtn
+
+ushort
+__cvt_f64_to_f16_rtn(double a)
+{
+ return (ushort)as_uint(__amdil_double_to_half_neg_inf_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rtn(double2 f)
+{
+ ushort2 ret;
+ ret.s0 = __cvt_f64_to_f16_rtn(f.s0);
+ ret.s1 = __cvt_f64_to_f16_rtn(f.s1);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rtn(double3 f)
+{
+ ushort3 ret;
+ ret.lo = __cvt_2f64_to_2f16_rtn(f.lo);
+ ret.s2 = __cvt_f64_to_f16_rtn(f.s2);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rtn(double4 f)
+{
+ ushort4 ret;
+ ret.lo = __cvt_2f64_to_2f16_rtn(f.lo);
+ ret.hi = __cvt_2f64_to_2f16_rtn(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rtn(double8 f)
+{
+ ushort8 ret;
+ ret.lo = __cvt_4f64_to_4f16_rtn(f.lo);
+ ret.hi = __cvt_4f64_to_4f16_rtn(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rtn(double16 f)
+{
+ ushort16 ret;
+ ret.lo = __cvt_8f64_to_8f16_rtn(f.lo);
+ ret.hi = __cvt_8f64_to_8f16_rtn(f.hi);
+ return ret;
+}
+
+// double -> half rtz
+
+ushort
+__cvt_f64_to_f16_rtz(double a)
+{
+ return (ushort)as_uint(__amdil_double_to_half_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rtz(double2 f)
+{
+ ushort2 ret;
+ ret.s0 = __cvt_f64_to_f16_rtz(f.s0);
+ ret.s1 = __cvt_f64_to_f16_rtz(f.s1);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rtz(double3 f)
+{
+ ushort3 ret;
+ ret.lo = __cvt_2f64_to_2f16_rtz(f.lo);
+ ret.s2 = __cvt_f64_to_f16_rtz(f.s2);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rtz(double4 f)
+{
+ ushort4 ret;
+ ret.lo = __cvt_2f64_to_2f16_rtz(f.lo);
+ ret.hi = __cvt_2f64_to_2f16_rtz(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rtz(double8 f)
+{
+ ushort8 ret;
+ ret.lo = __cvt_4f64_to_4f16_rtz(f.lo);
+ ret.hi = __cvt_4f64_to_4f16_rtz(f.hi);
+ return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rtz(double16 f)
+{
+ ushort16 ret;
+ ret.lo = __cvt_8f64_to_8f16_rtz(f.lo);
+ ret.hi = __cvt_8f64_to_8f16_rtz(f.hi);
+ return ret;
+}
+
Added: libclc/branches/amd-builtins/amd-builtins/vldst/vldst_gen.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/vldst/vldst_gen.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/vldst/vldst_gen.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/vldst/vldst_gen.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,3206 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
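+// Generated vector load/store overloads: the floating-point vloadN/vstoreN
+// forms reinterpret the pointer and the value through the same-size integer
+// overloads (float <-> int, double <-> long). The __global and __local
+// overloads are only emitted below OpenCL C 2.0, where they are not covered
+// by the generic address space overload.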
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const float *p)
+{
+ return as_float2(vload2(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const __constant float *p)
+{
+ return as_float2(vload2(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const __global float *p)
+{
+ return as_float2(vload2(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const __local float *p)
+{
+ return as_float2(vload2(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const double *p)
+{
+ return as_double2(vload2(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const __constant double *p)
+{
+ return as_double2(vload2(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const __global double *p)
+{
+ return as_double2(vload2(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const __local double *p)
+{
+ return as_double2(vload2(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const float *p)
+{
+ return as_float3(vload3(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const __constant float *p)
+{
+ return as_float3(vload3(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const __global float *p)
+{
+ return as_float3(vload3(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const __local float *p)
+{
+ return as_float3(vload3(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const double *p)
+{
+ return as_double3(vload3(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const __constant double *p)
+{
+ return as_double3(vload3(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const __global double *p)
+{
+ return as_double3(vload3(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const __local double *p)
+{
+ return as_double3(vload3(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const float *p)
+{
+ return as_float4(vload4(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const __constant float *p)
+{
+ return as_float4(vload4(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const __global float *p)
+{
+ return as_float4(vload4(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const __local float *p)
+{
+ return as_float4(vload4(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const double *p)
+{
+ return as_double4(vload4(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const __constant double *p)
+{
+ return as_double4(vload4(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const __global double *p)
+{
+ return as_double4(vload4(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const __local double *p)
+{
+ return as_double4(vload4(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const float *p)
+{
+ return as_float8(vload8(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const __constant float *p)
+{
+ return as_float8(vload8(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const __global float *p)
+{
+ return as_float8(vload8(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const __local float *p)
+{
+ return as_float8(vload8(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const double *p)
+{
+ return as_double8(vload8(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const __constant double *p)
+{
+ return as_double8(vload8(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const __global double *p)
+{
+ return as_double8(vload8(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const __local double *p)
+{
+ return as_double8(vload8(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const float *p)
+{
+ return as_float16(vload16(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const __constant float *p)
+{
+ return as_float16(vload16(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const __global float *p)
+{
+ return as_float16(vload16(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const __local float *p)
+{
+ return as_float16(vload16(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const double *p)
+{
+ return as_double16(vload16(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const __constant double *p)
+{
+ return as_double16(vload16(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const __global double *p)
+{
+ return as_double16(vload16(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const __local double *p)
+{
+ return as_double16(vload16(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(float2 v, size_t i, float *p)
+{
+ vstore2(as_int2(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(float2 v, size_t i, __global float *p)
+{
+ vstore2(as_int2(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(float2 v, size_t i, __local float *p)
+{
+ vstore2(as_int2(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(double2 v, size_t i, double *p)
+{
+ vstore2(as_long2(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(double2 v, size_t i, __global double *p)
+{
+ vstore2(as_long2(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(double2 v, size_t i, __local double *p)
+{
+ vstore2(as_long2(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(float3 v, size_t i, float *p)
+{
+ vstore3(as_int3(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(float3 v, size_t i, __global float *p)
+{
+ vstore3(as_int3(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(float3 v, size_t i, __local float *p)
+{
+ vstore3(as_int3(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(double3 v, size_t i, double *p)
+{
+ vstore3(as_long3(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(double3 v, size_t i, __global double *p)
+{
+ vstore3(as_long3(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(double3 v, size_t i, __local double *p)
+{
+ vstore3(as_long3(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(float4 v, size_t i, float *p)
+{
+ vstore4(as_int4(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(float4 v, size_t i, __global float *p)
+{
+ vstore4(as_int4(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(float4 v, size_t i, __local float *p)
+{
+ vstore4(as_int4(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(double4 v, size_t i, double *p)
+{
+ vstore4(as_long4(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(double4 v, size_t i, __global double *p)
+{
+ vstore4(as_long4(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(double4 v, size_t i, __local double *p)
+{
+ vstore4(as_long4(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(float8 v, size_t i, float *p)
+{
+ vstore8(as_int8(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(float8 v, size_t i, __global float *p)
+{
+ vstore8(as_int8(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(float8 v, size_t i, __local float *p)
+{
+ vstore8(as_int8(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(double8 v, size_t i, double *p)
+{
+ vstore8(as_long8(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(double8 v, size_t i, __global double *p)
+{
+ vstore8(as_long8(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(double8 v, size_t i, __local double *p)
+{
+ vstore8(as_long8(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(float16 v, size_t i, float *p)
+{
+ vstore16(as_int16(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(float16 v, size_t i, __global float *p)
+{
+ vstore16(as_int16(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(float16 v, size_t i, __local float *p)
+{
+ vstore16(as_int16(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(double16 v, size_t i, double *p)
+{
+ vstore16(as_long16(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(double16 v, size_t i, __global double *p)
+{
+ vstore16(as_long16(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(double16 v, size_t i, __local double *p)
+{
+ vstore16(as_long16(v), i, (__local long *)p);
+}
+#endif
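+
+/*
+ * Illustrative sketch only, compiled out with #if 0: one way a kernel might
+ * exercise the floatN overloads above. The kernel name "scale4" and its
+ * arguments are hypothetical and are not part of this patch.
+ */
+#if 0
+__kernel void scale4(__global float *buf, float k)
+{
+ size_t gid = get_global_id(0);
+ float4 v = vload4(gid, buf); /* resolves to a float vload4 overload defined above */
+ vstore4(v * k, gid, buf); /* stores through the matching vstore4 overload */
+}
+#endif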
+
+
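+/*
+ * For the integer element types below, each vloadN overload is built element
+ * by element in a static helper whose name appears to encode the address
+ * space (p, c, g, l), the element size in bytes and the vector width
+ * (e.g. vldp12 loads a char2 from a default-address-space pointer). The
+ * signed and unsigned public overloads are then exported as weak aliases of
+ * the same helper, since both share one in-memory layout.
+ */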
+__attribute__((always_inline)) static char2
+vldp12(size_t i, const char *p)
+{
+ char2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp12"))) char2 vload2(size_t, const char *);
+extern __attribute__((overloadable, weak, alias("vldp12"))) uchar2 vload2(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char2
+vldc12(size_t i, const __constant char *p)
+{
+ char2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc12"))) char2 vload2(size_t, const __constant char *);
+extern __attribute__((overloadable, weak, alias("vldc12"))) uchar2 vload2(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char2
+vldg12(size_t i, const __global char *p)
+{
+ char2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg12"))) char2 vload2(size_t, const __global char *);
+extern __attribute__((overloadable, weak, alias("vldg12"))) uchar2 vload2(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char2
+vldl12(size_t i, const __local char *p)
+{
+ char2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl12"))) char2 vload2(size_t, const __local char *);
+extern __attribute__((overloadable, weak, alias("vldl12"))) uchar2 vload2(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short2
+vldp22(size_t i, const short *p)
+{
+ short2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp22"))) short2 vload2(size_t, const short *);
+extern __attribute__((overloadable, weak, alias("vldp22"))) ushort2 vload2(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short2
+vldc22(size_t i, const __constant short *p)
+{
+ short2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc22"))) short2 vload2(size_t, const __constant short *);
+extern __attribute__((overloadable, weak, alias("vldc22"))) ushort2 vload2(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short2
+vldg22(size_t i, const __global short *p)
+{
+ short2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg22"))) short2 vload2(size_t, const __global short *);
+extern __attribute__((overloadable, weak, alias("vldg22"))) ushort2 vload2(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short2
+vldl22(size_t i, const __local short *p)
+{
+ short2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl22"))) short2 vload2(size_t, const __local short *);
+extern __attribute__((overloadable, weak, alias("vldl22"))) ushort2 vload2(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int2
+vldp42(size_t i, const int *p)
+{
+ int2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp42"))) int2 vload2(size_t, const int *);
+extern __attribute__((overloadable, weak, alias("vldp42"))) uint2 vload2(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int2
+vldc42(size_t i, const __constant int *p)
+{
+ int2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc42"))) int2 vload2(size_t, const __constant int *);
+extern __attribute__((overloadable, weak, alias("vldc42"))) uint2 vload2(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int2
+vldg42(size_t i, const __global int *p)
+{
+ int2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg42"))) int2 vload2(size_t, const __global int *);
+extern __attribute__((overloadable, weak, alias("vldg42"))) uint2 vload2(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int2
+vldl42(size_t i, const __local int *p)
+{
+ int2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl42"))) int2 vload2(size_t, const __local int *);
+extern __attribute__((overloadable, weak, alias("vldl42"))) uint2 vload2(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long2
+vldp82(size_t i, const long *p)
+{
+ long2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp82"))) long2 vload2(size_t, const long *);
+extern __attribute__((overloadable, weak, alias("vldp82"))) ulong2 vload2(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long2
+vldc82(size_t i, const __constant long *p)
+{
+ long2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc82"))) long2 vload2(size_t, const __constant long *);
+extern __attribute__((overloadable, weak, alias("vldc82"))) ulong2 vload2(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long2
+vldg82(size_t i, const __global long *p)
+{
+ long2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg82"))) long2 vload2(size_t, const __global long *);
+extern __attribute__((overloadable, weak, alias("vldg82"))) ulong2 vload2(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long2
+vldl82(size_t i, const __local long *p)
+{
+ long2 ret;
+ p += i * 2;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl82"))) long2 vload2(size_t, const __local long *);
+extern __attribute__((overloadable, weak, alias("vldl82"))) ulong2 vload2(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char3
+vldp13(size_t i, const char *p)
+{
+ char3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp13"))) char3 vload3(size_t, const char *);
+extern __attribute__((overloadable, weak, alias("vldp13"))) uchar3 vload3(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char3
+vldc13(size_t i, const __constant char *p)
+{
+ char3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc13"))) char3 vload3(size_t, const __constant char *);
+extern __attribute__((overloadable, weak, alias("vldc13"))) uchar3 vload3(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char3
+vldg13(size_t i, const __global char *p)
+{
+ char3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg13"))) char3 vload3(size_t, const __global char *);
+extern __attribute__((overloadable, weak, alias("vldg13"))) uchar3 vload3(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char3
+vldl13(size_t i, const __local char *p)
+{
+ char3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl13"))) char3 vload3(size_t, const __local char *);
+extern __attribute__((overloadable, weak, alias("vldl13"))) uchar3 vload3(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short3
+vldp23(size_t i, const short *p)
+{
+ short3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp23"))) short3 vload3(size_t, const short *);
+extern __attribute__((overloadable, weak, alias("vldp23"))) ushort3 vload3(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short3
+vldc23(size_t i, const __constant short *p)
+{
+ short3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc23"))) short3 vload3(size_t, const __constant short *);
+extern __attribute__((overloadable, weak, alias("vldc23"))) ushort3 vload3(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short3
+vldg23(size_t i, const __global short *p)
+{
+ short3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg23"))) short3 vload3(size_t, const __global short *);
+extern __attribute__((overloadable, weak, alias("vldg23"))) ushort3 vload3(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short3
+vldl23(size_t i, const __local short *p)
+{
+ short3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl23"))) short3 vload3(size_t, const __local short *);
+extern __attribute__((overloadable, weak, alias("vldl23"))) ushort3 vload3(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int3
+vldp43(size_t i, const int *p)
+{
+ int3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp43"))) int3 vload3(size_t, const int *);
+extern __attribute__((overloadable, weak, alias("vldp43"))) uint3 vload3(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int3
+vldc43(size_t i, const __constant int *p)
+{
+ int3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc43"))) int3 vload3(size_t, const __constant int *);
+extern __attribute__((overloadable, weak, alias("vldc43"))) uint3 vload3(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int3
+vldg43(size_t i, const __global int *p)
+{
+ int3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg43"))) int3 vload3(size_t, const __global int *);
+extern __attribute__((overloadable, weak, alias("vldg43"))) uint3 vload3(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int3
+vldl43(size_t i, const __local int *p)
+{
+ int3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl43"))) int3 vload3(size_t, const __local int *);
+extern __attribute__((overloadable, weak, alias("vldl43"))) uint3 vload3(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long3
+vldp83(size_t i, const long *p)
+{
+ long3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp83"))) long3 vload3(size_t, const long *);
+extern __attribute__((overloadable, weak, alias("vldp83"))) ulong3 vload3(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long3
+vldc83(size_t i, const __constant long *p)
+{
+ long3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc83"))) long3 vload3(size_t, const __constant long *);
+extern __attribute__((overloadable, weak, alias("vldc83"))) ulong3 vload3(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long3
+vldg83(size_t i, const __global long *p)
+{
+ long3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg83"))) long3 vload3(size_t, const __global long *);
+extern __attribute__((overloadable, weak, alias("vldg83"))) ulong3 vload3(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long3
+vldl83(size_t i, const __local long *p)
+{
+ long3 ret;
+ p += i * 3;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl83"))) long3 vload3(size_t, const __local long *);
+extern __attribute__((overloadable, weak, alias("vldl83"))) ulong3 vload3(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char4
+vldp14(size_t i, const char *p)
+{
+ char4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp14"))) char4 vload4(size_t, const char *);
+extern __attribute__((overloadable, weak, alias("vldp14"))) uchar4 vload4(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char4
+vldc14(size_t i, const __constant char *p)
+{
+ char4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc14"))) char4 vload4(size_t, const __constant char *);
+extern __attribute__((overloadable, weak, alias("vldc14"))) uchar4 vload4(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char4
+vldg14(size_t i, const __global char *p)
+{
+ char4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg14"))) char4 vload4(size_t, const __global char *);
+extern __attribute__((overloadable, weak, alias("vldg14"))) uchar4 vload4(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char4
+vldl14(size_t i, const __local char *p)
+{
+ char4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl14"))) char4 vload4(size_t, const __local char *);
+extern __attribute__((overloadable, weak, alias("vldl14"))) uchar4 vload4(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short4
+vldp24(size_t i, const short *p)
+{
+ short4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp24"))) short4 vload4(size_t, const short *);
+extern __attribute__((overloadable, weak, alias("vldp24"))) ushort4 vload4(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short4
+vldc24(size_t i, const __constant short *p)
+{
+ short4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc24"))) short4 vload4(size_t, const __constant short *);
+extern __attribute__((overloadable, weak, alias("vldc24"))) ushort4 vload4(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short4
+vldg24(size_t i, const __global short *p)
+{
+ short4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg24"))) short4 vload4(size_t, const __global short *);
+extern __attribute__((overloadable, weak, alias("vldg24"))) ushort4 vload4(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short4
+vldl24(size_t i, const __local short *p)
+{
+ short4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl24"))) short4 vload4(size_t, const __local short *);
+extern __attribute__((overloadable, weak, alias("vldl24"))) ushort4 vload4(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int4
+vldp44(size_t i, const int *p)
+{
+ int4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp44"))) int4 vload4(size_t, const int *);
+extern __attribute__((overloadable, weak, alias("vldp44"))) uint4 vload4(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int4
+vldc44(size_t i, const __constant int *p)
+{
+ int4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc44"))) int4 vload4(size_t, const __constant int *);
+extern __attribute__((overloadable, weak, alias("vldc44"))) uint4 vload4(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int4
+vldg44(size_t i, const __global int *p)
+{
+ int4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg44"))) int4 vload4(size_t, const __global int *);
+extern __attribute__((overloadable, weak, alias("vldg44"))) uint4 vload4(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int4
+vldl44(size_t i, const __local int *p)
+{
+ int4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl44"))) int4 vload4(size_t, const __local int *);
+extern __attribute__((overloadable, weak, alias("vldl44"))) uint4 vload4(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long4
+vldp84(size_t i, const long *p)
+{
+ long4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp84"))) long4 vload4(size_t, const long *);
+extern __attribute__((overloadable, weak, alias("vldp84"))) ulong4 vload4(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long4
+vldc84(size_t i, const __constant long *p)
+{
+ long4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc84"))) long4 vload4(size_t, const __constant long *);
+extern __attribute__((overloadable, weak, alias("vldc84"))) ulong4 vload4(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long4
+vldg84(size_t i, const __global long *p)
+{
+ long4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg84"))) long4 vload4(size_t, const __global long *);
+extern __attribute__((overloadable, weak, alias("vldg84"))) ulong4 vload4(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long4
+vldl84(size_t i, const __local long *p)
+{
+ long4 ret;
+ p += i * 4;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl84"))) long4 vload4(size_t, const __local long *);
+extern __attribute__((overloadable, weak, alias("vldl84"))) ulong4 vload4(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char8
+vldp18(size_t i, const char *p)
+{
+ char8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp18"))) char8 vload8(size_t, const char *);
+extern __attribute__((overloadable, weak, alias("vldp18"))) uchar8 vload8(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char8
+vldc18(size_t i, const __constant char *p)
+{
+ char8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc18"))) char8 vload8(size_t, const __constant char *);
+extern __attribute__((overloadable, weak, alias("vldc18"))) uchar8 vload8(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char8
+vldg18(size_t i, const __global char *p)
+{
+ char8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg18"))) char8 vload8(size_t, const __global char *);
+extern __attribute__((overloadable, weak, alias("vldg18"))) uchar8 vload8(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char8
+vldl18(size_t i, const __local char *p)
+{
+ char8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl18"))) char8 vload8(size_t, const __local char *);
+extern __attribute__((overloadable, weak, alias("vldl18"))) uchar8 vload8(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short8
+vldp28(size_t i, const short *p)
+{
+ short8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp28"))) short8 vload8(size_t, const short *);
+extern __attribute__((overloadable, weak, alias("vldp28"))) ushort8 vload8(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short8
+vldc28(size_t i, const __constant short *p)
+{
+ short8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc28"))) short8 vload8(size_t, const __constant short *);
+extern __attribute__((overloadable, weak, alias("vldc28"))) ushort8 vload8(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short8
+vldg28(size_t i, const __global short *p)
+{
+ short8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg28"))) short8 vload8(size_t, const __global short *);
+extern __attribute__((overloadable, weak, alias("vldg28"))) ushort8 vload8(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short8
+vldl28(size_t i, const __local short *p)
+{
+ short8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl28"))) short8 vload8(size_t, const __local short *);
+extern __attribute__((overloadable, weak, alias("vldl28"))) ushort8 vload8(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int8
+vldp48(size_t i, const int *p)
+{
+ int8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp48"))) int8 vload8(size_t, const int *);
+extern __attribute__((overloadable, weak, alias("vldp48"))) uint8 vload8(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int8
+vldc48(size_t i, const __constant int *p)
+{
+ int8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc48"))) int8 vload8(size_t, const __constant int *);
+extern __attribute__((overloadable, weak, alias("vldc48"))) uint8 vload8(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int8
+vldg48(size_t i, const __global int *p)
+{
+ int8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg48"))) int8 vload8(size_t, const __global int *);
+extern __attribute__((overloadable, weak, alias("vldg48"))) uint8 vload8(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int8
+vldl48(size_t i, const __local int *p)
+{
+ int8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl48"))) int8 vload8(size_t, const __local int *);
+extern __attribute__((overloadable, weak, alias("vldl48"))) uint8 vload8(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long8
+vldp88(size_t i, const long *p)
+{
+ long8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp88"))) long8 vload8(size_t, const long *);
+extern __attribute__((overloadable, weak, alias("vldp88"))) ulong8 vload8(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long8
+vldc88(size_t i, const __constant long *p)
+{
+ long8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc88"))) long8 vload8(size_t, const __constant long *);
+extern __attribute__((overloadable, weak, alias("vldc88"))) ulong8 vload8(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long8
+vldg88(size_t i, const __global long *p)
+{
+ long8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg88"))) long8 vload8(size_t, const __global long *);
+extern __attribute__((overloadable, weak, alias("vldg88"))) ulong8 vload8(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long8
+vldl88(size_t i, const __local long *p)
+{
+ long8 ret;
+ p += i * 8;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl88"))) long8 vload8(size_t, const __local long *);
+extern __attribute__((overloadable, weak, alias("vldl88"))) ulong8 vload8(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char16
+vldp116(size_t i, const char *p)
+{
+ char16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp116"))) char16 vload16(size_t, const char *);
+extern __attribute__((overloadable, weak, alias("vldp116"))) uchar16 vload16(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char16
+vldc116(size_t i, const __constant char *p)
+{
+ char16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc116"))) char16 vload16(size_t, const __constant char *);
+extern __attribute__((overloadable, weak, alias("vldc116"))) uchar16 vload16(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char16
+vldg116(size_t i, const __global char *p)
+{
+ char16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg116"))) char16 vload16(size_t, const __global char *);
+extern __attribute__((overloadable, weak, alias("vldg116"))) uchar16 vload16(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char16
+vldl116(size_t i, const __local char *p)
+{
+ char16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl116"))) char16 vload16(size_t, const __local char *);
+extern __attribute__((overloadable, weak, alias("vldl116"))) uchar16 vload16(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short16
+vldp216(size_t i, const short *p)
+{
+ short16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp216"))) short16 vload16(size_t, const short *);
+extern __attribute__((overloadable, weak, alias("vldp216"))) ushort16 vload16(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short16
+vldc216(size_t i, const __constant short *p)
+{
+ short16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc216"))) short16 vload16(size_t, const __constant short *);
+extern __attribute__((overloadable, weak, alias("vldc216"))) ushort16 vload16(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short16
+vldg216(size_t i, const __global short *p)
+{
+ short16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg216"))) short16 vload16(size_t, const __global short *);
+extern __attribute__((overloadable, weak, alias("vldg216"))) ushort16 vload16(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short16
+vldl216(size_t i, const __local short *p)
+{
+ short16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl216"))) short16 vload16(size_t, const __local short *);
+extern __attribute__((overloadable, weak, alias("vldl216"))) ushort16 vload16(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int16
+vldp416(size_t i, const int *p)
+{
+ int16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp416"))) int16 vload16(size_t, const int *);
+extern __attribute__((overloadable, weak, alias("vldp416"))) uint16 vload16(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int16
+vldc416(size_t i, const __constant int *p)
+{
+ int16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc416"))) int16 vload16(size_t, const __constant int *);
+extern __attribute__((overloadable, weak, alias("vldc416"))) uint16 vload16(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int16
+vldg416(size_t i, const __global int *p)
+{
+ int16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg416"))) int16 vload16(size_t, const __global int *);
+extern __attribute__((overloadable, weak, alias("vldg416"))) uint16 vload16(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int16
+vldl416(size_t i, const __local int *p)
+{
+ int16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl416"))) int16 vload16(size_t, const __local int *);
+extern __attribute__((overloadable, weak, alias("vldl416"))) uint16 vload16(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long16
+vldp816(size_t i, const long *p)
+{
+ long16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp816"))) long16 vload16(size_t, const long *);
+extern __attribute__((overloadable, weak, alias("vldp816"))) ulong16 vload16(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long16
+vldc816(size_t i, const __constant long *p)
+{
+ long16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc816"))) long16 vload16(size_t, const __constant long *);
+extern __attribute__((overloadable, weak, alias("vldc816"))) ulong16 vload16(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long16
+vldg816(size_t i, const __global long *p)
+{
+ long16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg816"))) long16 vload16(size_t, const __global long *);
+extern __attribute__((overloadable, weak, alias("vldg816"))) ulong16 vload16(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long16
+vldl816(size_t i, const __local long *p)
+{
+ long16 ret;
+ p += i * 16;
+ ret.s0 = p[0];
+ ret.s1 = p[1];
+ ret.s2 = p[2];
+ ret.s3 = p[3];
+ ret.s4 = p[4];
+ ret.s5 = p[5];
+ ret.s6 = p[6];
+ ret.s7 = p[7];
+ ret.s8 = p[8];
+ ret.s9 = p[9];
+ ret.sa = p[10];
+ ret.sb = p[11];
+ ret.sc = p[12];
+ ret.sd = p[13];
+ ret.se = p[14];
+ ret.sf = p[15];
+
+ return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl816"))) long16 vload16(size_t, const __local long *);
+extern __attribute__((overloadable, weak, alias("vldl816"))) ulong16 vload16(size_t, const __local ulong *);
+#endif
+
+
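+/*
+ * The vstoreN helpers below mirror the vloadN helpers above: one element-wise
+ * static store per address space, element size and vector width, exported for
+ * both the signed and unsigned public overloads via weak aliases.
+ */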
+__attribute__((always_inline)) static void
+vstp12(char2 v, size_t i, char *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp12"))) void vstore2( char2, size_t, char *);
+extern __attribute__((overloadable, weak, alias("vstp12"))) void vstore2(uchar2, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg12(char2 v, size_t i, __global char *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg12"))) void vstore2( char2, size_t, __global char *);
+extern __attribute__((overloadable, weak, alias("vstg12"))) void vstore2(uchar2, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl12(char2 v, size_t i, __local char *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl12"))) void vstore2( char2, size_t, __local char *);
+extern __attribute__((overloadable, weak, alias("vstl12"))) void vstore2(uchar2, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp22(short2 v, size_t i, short *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp22"))) void vstore2( short2, size_t, short *);
+extern __attribute__((overloadable, weak, alias("vstp22"))) void vstore2(ushort2, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg22(short2 v, size_t i, __global short *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg22"))) void vstore2( short2, size_t, __global short *);
+extern __attribute__((overloadable, weak, alias("vstg22"))) void vstore2(ushort2, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl22(short2 v, size_t i, __local short *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl22"))) void vstore2( short2, size_t, __local short *);
+extern __attribute__((overloadable, weak, alias("vstl22"))) void vstore2(ushort2, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp42(int2 v, size_t i, int *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp42"))) void vstore2( int2, size_t, int *);
+extern __attribute__((overloadable, weak, alias("vstp42"))) void vstore2(uint2, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg42(int2 v, size_t i, __global int *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg42"))) void vstore2( int2, size_t, __global int *);
+extern __attribute__((overloadable, weak, alias("vstg42"))) void vstore2(uint2, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl42(int2 v, size_t i, __local int *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl42"))) void vstore2( int2, size_t, __local int *);
+extern __attribute__((overloadable, weak, alias("vstl42"))) void vstore2(uint2, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp82(long2 v, size_t i, long *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp82"))) void vstore2( long2, size_t, long *);
+extern __attribute__((overloadable, weak, alias("vstp82"))) void vstore2(ulong2, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg82(long2 v, size_t i, __global long *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg82"))) void vstore2( long2, size_t, __global long *);
+extern __attribute__((overloadable, weak, alias("vstg82"))) void vstore2(ulong2, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl82(long2 v, size_t i, __local long *p)
+{
+ p += i * 2;
+ p[0] = v.s0;
+ p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl82"))) void vstore2( long2, size_t, __local long *);
+extern __attribute__((overloadable, weak, alias("vstl82"))) void vstore2(ulong2, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp13(char3 v, size_t i, char *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp13"))) void vstore3( char3, size_t, char *);
+extern __attribute__((overloadable, weak, alias("vstp13"))) void vstore3(uchar3, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg13(char3 v, size_t i, __global char *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg13"))) void vstore3( char3, size_t, __global char *);
+extern __attribute__((overloadable, weak, alias("vstg13"))) void vstore3(uchar3, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl13(char3 v, size_t i, __local char *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl13"))) void vstore3( char3, size_t, __local char *);
+extern __attribute__((overloadable, weak, alias("vstl13"))) void vstore3(uchar3, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp23(short3 v, size_t i, short *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp23"))) void vstore3( short3, size_t, short *);
+extern __attribute__((overloadable, weak, alias("vstp23"))) void vstore3(ushort3, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg23(short3 v, size_t i, __global short *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg23"))) void vstore3( short3, size_t, __global short *);
+extern __attribute__((overloadable, weak, alias("vstg23"))) void vstore3(ushort3, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl23(short3 v, size_t i, __local short *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl23"))) void vstore3( short3, size_t, __local short *);
+extern __attribute__((overloadable, weak, alias("vstl23"))) void vstore3(ushort3, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp43(int3 v, size_t i, int *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp43"))) void vstore3( int3, size_t, int *);
+extern __attribute__((overloadable, weak, alias("vstp43"))) void vstore3(uint3, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg43(int3 v, size_t i, __global int *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg43"))) void vstore3( int3, size_t, __global int *);
+extern __attribute__((overloadable, weak, alias("vstg43"))) void vstore3(uint3, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl43(int3 v, size_t i, __local int *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl43"))) void vstore3( int3, size_t, __local int *);
+extern __attribute__((overloadable, weak, alias("vstl43"))) void vstore3(uint3, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp83(long3 v, size_t i, long *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp83"))) void vstore3( long3, size_t, long *);
+extern __attribute__((overloadable, weak, alias("vstp83"))) void vstore3(ulong3, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg83(long3 v, size_t i, __global long *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg83"))) void vstore3( long3, size_t, __global long *);
+extern __attribute__((overloadable, weak, alias("vstg83"))) void vstore3(ulong3, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl83(long3 v, size_t i, __local long *p)
+{
+ p += i * 3;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl83"))) void vstore3( long3, size_t, __local long *);
+extern __attribute__((overloadable, weak, alias("vstl83"))) void vstore3(ulong3, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp14(char4 v, size_t i, char *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp14"))) void vstore4( char4, size_t, char *);
+extern __attribute__((overloadable, weak, alias("vstp14"))) void vstore4(uchar4, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg14(char4 v, size_t i, __global char *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg14"))) void vstore4( char4, size_t, __global char *);
+extern __attribute__((overloadable, weak, alias("vstg14"))) void vstore4(uchar4, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl14(char4 v, size_t i, __local char *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl14"))) void vstore4( char4, size_t, __local char *);
+extern __attribute__((overloadable, weak, alias("vstl14"))) void vstore4(uchar4, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp24(short4 v, size_t i, short *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp24"))) void vstore4( short4, size_t, short *);
+extern __attribute__((overloadable, weak, alias("vstp24"))) void vstore4(ushort4, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg24(short4 v, size_t i, __global short *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg24"))) void vstore4( short4, size_t, __global short *);
+extern __attribute__((overloadable, weak, alias("vstg24"))) void vstore4(ushort4, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl24(short4 v, size_t i, __local short *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl24"))) void vstore4( short4, size_t, __local short *);
+extern __attribute__((overloadable, weak, alias("vstl24"))) void vstore4(ushort4, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp44(int4 v, size_t i, int *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp44"))) void vstore4( int4, size_t, int *);
+extern __attribute__((overloadable, weak, alias("vstp44"))) void vstore4(uint4, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg44(int4 v, size_t i, __global int *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg44"))) void vstore4( int4, size_t, __global int *);
+extern __attribute__((overloadable, weak, alias("vstg44"))) void vstore4(uint4, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl44(int4 v, size_t i, __local int *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl44"))) void vstore4( int4, size_t, __local int *);
+extern __attribute__((overloadable, weak, alias("vstl44"))) void vstore4(uint4, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp84(long4 v, size_t i, long *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp84"))) void vstore4( long4, size_t, long *);
+extern __attribute__((overloadable, weak, alias("vstp84"))) void vstore4(ulong4, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg84(long4 v, size_t i, __global long *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg84"))) void vstore4( long4, size_t, __global long *);
+extern __attribute__((overloadable, weak, alias("vstg84"))) void vstore4(ulong4, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl84(long4 v, size_t i, __local long *p)
+{
+ p += i * 4;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl84"))) void vstore4( long4, size_t, __local long *);
+extern __attribute__((overloadable, weak, alias("vstl84"))) void vstore4(ulong4, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp18(char8 v, size_t i, char *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp18"))) void vstore8( char8, size_t, char *);
+extern __attribute__((overloadable, weak, alias("vstp18"))) void vstore8(uchar8, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg18(char8 v, size_t i, __global char *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg18"))) void vstore8( char8, size_t, __global char *);
+extern __attribute__((overloadable, weak, alias("vstg18"))) void vstore8(uchar8, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl18(char8 v, size_t i, __local char *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl18"))) void vstore8( char8, size_t, __local char *);
+extern __attribute__((overloadable, weak, alias("vstl18"))) void vstore8(uchar8, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp28(short8 v, size_t i, short *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp28"))) void vstore8( short8, size_t, short *);
+extern __attribute__((overloadable, weak, alias("vstp28"))) void vstore8(ushort8, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg28(short8 v, size_t i, __global short *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg28"))) void vstore8( short8, size_t, __global short *);
+extern __attribute__((overloadable, weak, alias("vstg28"))) void vstore8(ushort8, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl28(short8 v, size_t i, __local short *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl28"))) void vstore8( short8, size_t, __local short *);
+extern __attribute__((overloadable, weak, alias("vstl28"))) void vstore8(ushort8, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp48(int8 v, size_t i, int *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp48"))) void vstore8( int8, size_t, int *);
+extern __attribute__((overloadable, weak, alias("vstp48"))) void vstore8(uint8, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg48(int8 v, size_t i, __global int *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg48"))) void vstore8( int8, size_t, __global int *);
+extern __attribute__((overloadable, weak, alias("vstg48"))) void vstore8(uint8, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl48(int8 v, size_t i, __local int *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl48"))) void vstore8( int8, size_t, __local int *);
+extern __attribute__((overloadable, weak, alias("vstl48"))) void vstore8(uint8, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp88(long8 v, size_t i, long *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp88"))) void vstore8( long8, size_t, long *);
+extern __attribute__((overloadable, weak, alias("vstp88"))) void vstore8(ulong8, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg88(long8 v, size_t i, __global long *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg88"))) void vstore8( long8, size_t, __global long *);
+extern __attribute__((overloadable, weak, alias("vstg88"))) void vstore8(ulong8, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl88(long8 v, size_t i, __local long *p)
+{
+ p += i * 8;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl88"))) void vstore8( long8, size_t, __local long *);
+extern __attribute__((overloadable, weak, alias("vstl88"))) void vstore8(ulong8, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp116(char16 v, size_t i, char *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp116"))) void vstore16( char16, size_t, char *);
+extern __attribute__((overloadable, weak, alias("vstp116"))) void vstore16(uchar16, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg116(char16 v, size_t i, __global char *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg116"))) void vstore16( char16, size_t, __global char *);
+extern __attribute__((overloadable, weak, alias("vstg116"))) void vstore16(uchar16, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl116(char16 v, size_t i, __local char *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl116"))) void vstore16( char16, size_t, __local char *);
+extern __attribute__((overloadable, weak, alias("vstl116"))) void vstore16(uchar16, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp216(short16 v, size_t i, short *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp216"))) void vstore16( short16, size_t, short *);
+extern __attribute__((overloadable, weak, alias("vstp216"))) void vstore16(ushort16, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg216(short16 v, size_t i, __global short *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg216"))) void vstore16( short16, size_t, __global short *);
+extern __attribute__((overloadable, weak, alias("vstg216"))) void vstore16(ushort16, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl216(short16 v, size_t i, __local short *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl216"))) void vstore16( short16, size_t, __local short *);
+extern __attribute__((overloadable, weak, alias("vstl216"))) void vstore16(ushort16, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp416(int16 v, size_t i, int *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp416"))) void vstore16( int16, size_t, int *);
+extern __attribute__((overloadable, weak, alias("vstp416"))) void vstore16(uint16, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg416(int16 v, size_t i, __global int *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg416"))) void vstore16( int16, size_t, __global int *);
+extern __attribute__((overloadable, weak, alias("vstg416"))) void vstore16(uint16, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl416(int16 v, size_t i, __local int *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl416"))) void vstore16( int16, size_t, __local int *);
+extern __attribute__((overloadable, weak, alias("vstl416"))) void vstore16(uint16, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp816(long16 v, size_t i, long *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp816"))) void vstore16( long16, size_t, long *);
+extern __attribute__((overloadable, weak, alias("vstp816"))) void vstore16(ulong16, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg816(long16 v, size_t i, __global long *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg816"))) void vstore16( long16, size_t, __global long *);
+extern __attribute__((overloadable, weak, alias("vstg816"))) void vstore16(ulong16, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl816(long16 v, size_t i, __local long *p)
+{
+ p += i * 16;
+ p[0] = v.s0;
+ p[1] = v.s1;
+ p[2] = v.s2;
+ p[3] = v.s3;
+ p[4] = v.s4;
+ p[5] = v.s5;
+ p[6] = v.s6;
+ p[7] = v.s7;
+ p[8] = v.s8;
+ p[9] = v.s9;
+ p[10] = v.sa;
+ p[11] = v.sb;
+ p[12] = v.sc;
+ p[13] = v.sd;
+ p[14] = v.se;
+ p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl816"))) void vstore16( long16, size_t, __local long *);
+extern __attribute__((overloadable, weak, alias("vstl816"))) void vstore16(ulong16, size_t, __local ulong *);
+#endif
+
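
The generated vstoreN definitions above all follow one pattern: a static always_inline body per element type, vector width, and address space, exposed under the standard names through weak aliases so the signed and unsigned variants share a single implementation, and the __OPENCL_C_VERSION__ < 200 guards drop the __global/__local overloads where the OpenCL 2.0 generic address space makes the unqualified-pointer overload sufficient. A minimal usage sketch, assuming a pre-2.0 build where these overloads are linked in (the kernel and buffer names are illustrative only, not part of this patch):

    __kernel void scale4(__global const int *in, __global int *out)
    {
        size_t gid = get_global_id(0);
        int4 v = vload4(gid, in);   /* loads in[gid*4 .. gid*4+3] */
        vstore4(v * 2, gid, out);   /* binds to the __global int4 overload (vstg44 above) on pre-2.0 builds */
    }
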
Added: libclc/branches/amd-builtins/amd-builtins/vldst/vldst_half.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/vldst/vldst_half.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/vldst/vldst_half.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/vldst/vldst_half.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,4237 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhp(size_t i, const half *p)
+{
+ ushort h = *(const short *)(p + i);
+ return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhp"))) float vload_half(size_t, const half *);
+extern __attribute__((overloadable, weak, alias("vldhp"))) float vloada_half(size_t, const half *);
+
+
+
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhc(size_t i, const __constant half *p)
+{
+ ushort h = *(const __constant short *)(p + i);
+ return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhc"))) float vload_half(size_t, const __constant half *);
+extern __attribute__((overloadable, weak, alias("vldhc"))) float vloada_half(size_t, const __constant half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhg(size_t i, const __global half *p)
+{
+ ushort h = *(const __global short *)(p + i);
+ return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhg"))) float vload_half(size_t, const __global half *);
+extern __attribute__((overloadable, weak, alias("vldhg"))) float vloada_half(size_t, const __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhl(size_t i, const __local half *p)
+{
+ ushort h = *(const __local short *)(p + i);
+ return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhl"))) float vload_half(size_t, const __local half *);
+extern __attribute__((overloadable, weak, alias("vldhl"))) float vloada_half(size_t, const __local half *);
+#endif
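
Each scalar vload_half/vloada_half overload above reinterprets the half at p + i as a 16-bit integer and widens it with the __cvt_f16_to_f32 builtin. A minimal sketch of how a kernel consumes these entry points (the kernel and buffer names are illustrative):

    /* Read half-precision input and accumulate in float. */
    __kernel void sum_halves(__global const half *in, __global float *out, uint n)
    {
        float acc = 0.0f;
        for (uint i = 0; i < n; ++i)
            acc += vload_half(i, in);   /* pre-2.0: binds to the __global overload (vldhg) */
        out[get_global_id(0)] = acc;
    }
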
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const half *p)
+{
+ return __cvt_2f16_to_2f32(vload2(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const __constant half *p)
+{
+ return __cvt_2f16_to_2f32(vload2(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const __global half *p)
+{
+ return __cvt_2f16_to_2f32(vload2(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const __local half *p)
+{
+ return __cvt_2f16_to_2f32(vload2(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const half *p)
+{
+ return __cvt_3f16_to_3f32(vload3(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const __constant half *p)
+{
+ return __cvt_3f16_to_3f32(vload3(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const __global half *p)
+{
+ return __cvt_3f16_to_3f32(vload3(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const __local half *p)
+{
+ return __cvt_3f16_to_3f32(vload3(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const half *p)
+{
+ return __cvt_4f16_to_4f32(vload4(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const __constant half *p)
+{
+ return __cvt_4f16_to_4f32(vload4(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const __global half *p)
+{
+ return __cvt_4f16_to_4f32(vload4(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const __local half *p)
+{
+ return __cvt_4f16_to_4f32(vload4(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const half *p)
+{
+ return __cvt_8f16_to_8f32(vload8(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const __constant half *p)
+{
+ return __cvt_8f16_to_8f32(vload8(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const __global half *p)
+{
+ return __cvt_8f16_to_8f32(vload8(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const __local half *p)
+{
+ return __cvt_8f16_to_8f32(vload8(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const half *p)
+{
+ return __cvt_16f16_to_16f32(vload16(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const __constant half *p)
+{
+ return __cvt_16f16_to_16f32(vload16(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const __global half *p)
+{
+ return __cvt_16f16_to_16f32(vload16(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const __local half *p)
+{
+ return __cvt_16f16_to_16f32(vload16(i, (const __local ushort *)p));
+}
+#endif
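
The vector loads follow directly from the scalar case: vload_halfN reads N ushorts with the ordinary vloadN and converts them in a single call to __cvt_Nf16_to_Nf32. A small usage sketch for the 4-element form (names are illustrative):

    __kernel void to_float4(__global const half *src, __global float4 *dst)
    {
        size_t gid = get_global_id(0);
        dst[gid] = vload_half4(gid, src);   /* reads src[gid*4 .. gid*4+3] and widens to float4 */
    }
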
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const half *p)
+{
+
+ return __cvt_2f16_to_2f32(*(const ushort2 *)(p + i * 2));
+
+}
+
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const __constant half *p)
+{
+
+ return __cvt_2f16_to_2f32(*(const __constant ushort2 *)(p + i * 2));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const __global half *p)
+{
+
+ return __cvt_2f16_to_2f32(*(const __global ushort2 *)(p + i * 2));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const __local half *p)
+{
+
+ return __cvt_2f16_to_2f32(*(const __local ushort2 *)(p + i * 2));
+
+}
+#endif
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const half *p)
+{
+
+ ushort4 h = *(const ushort4 *)(p + i * 4);
+ return __cvt_3f16_to_3f32(h.s012);
+
+}
+
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const __constant half *p)
+{
+
+ ushort4 h = *(const __constant ushort4 *)(p + i * 4);
+ return __cvt_3f16_to_3f32(h.s012);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const __global half *p)
+{
+
+ ushort4 h = *(const __global ushort4 *)(p + i * 4);
+ return __cvt_3f16_to_3f32(h.s012);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const __local half *p)
+{
+
+ ushort4 h = *(const __local ushort4 *)(p + i * 4);
+ return __cvt_3f16_to_3f32(h.s012);
+
+}
+#endif
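
Note the vloada_half3 overloads above: unlike vload_half3, which reads three consecutive halves, the aligned form reads a whole ushort4 at offset i * 4 and converts only components .s012, so the data is expected to sit at a four-element stride with the fourth half as padding. A minimal sketch under that layout assumption (names are illustrative):

    /* Buffer layout: x0 y0 z0 pad  x1 y1 z1 pad ... (half precision) */
    __kernel void read_points(__global const half *pts, __global float3 *out)
    {
        size_t gid = get_global_id(0);
        out[gid] = vloada_half3(gid, pts);  /* loads pts[gid*4 .. gid*4+2], ignores the pad slot */
    }
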
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const half *p)
+{
+
+ return __cvt_4f16_to_4f32(*(const ushort4 *)(p + i * 4));
+
+}
+
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const __constant half *p)
+{
+
+ return __cvt_4f16_to_4f32(*(const __constant ushort4 *)(p + i * 4));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const __global half *p)
+{
+
+ return __cvt_4f16_to_4f32(*(const __global ushort4 *)(p + i * 4));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const __local half *p)
+{
+
+ return __cvt_4f16_to_4f32(*(const __local ushort4 *)(p + i * 4));
+
+}
+#endif
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const half *p)
+{
+
+ return __cvt_8f16_to_8f32(*(const ushort8 *)(p + i * 8));
+
+}
+
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const __constant half *p)
+{
+
+ return __cvt_8f16_to_8f32(*(const __constant ushort8 *)(p + i * 8));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const __global half *p)
+{
+
+ return __cvt_8f16_to_8f32(*(const __global ushort8 *)(p + i * 8));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const __local half *p)
+{
+
+ return __cvt_8f16_to_8f32(*(const __local ushort8 *)(p + i * 8));
+
+}
+#endif
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const half *p)
+{
+
+ return __cvt_16f16_to_16f32(*(const ushort16 *)(p + i * 16));
+
+}
+
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const __constant half *p)
+{
+
+ return __cvt_16f16_to_16f32(*(const __constant ushort16 *)(p + i * 16));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const __global half *p)
+{
+
+ return __cvt_16f16_to_16f32(*(const __global ushort16 *)(p + i * 16));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const __local half *p)
+{
+
+ return __cvt_16f16_to_16f32(*(const __local ushort16 *)(p + i * 16));
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float);
+
+__attribute__((always_inline)) static void
+vsthpf32c(float v, size_t i, half *p)
+{
+ *(ushort *)(p + i) = __cvt_f32_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32c"))) void vstore_half(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32c"))) void vstorea_half(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float);
+
+__attribute__((always_inline)) static void
+vsthgf32c(float v, size_t i, __global half *p)
+{
+ *(__global ushort *)(p + i) = __cvt_f32_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32c"))) void vstore_half(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32c"))) void vstorea_half(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float);
+
+__attribute__((always_inline)) static void
+vsthlf32c(float v, size_t i, __local half *p)
+{
+ *(__local ushort *)(p + i) = __cvt_f32_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32c"))) void vstore_half(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32c"))) void vstorea_half(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float);
+
+__attribute__((always_inline)) static void
+vsthpf32e(float v, size_t i, half *p)
+{
+ *(ushort *)(p + i) = __cvt_f32_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32e"))) void vstore_half_rte(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32e"))) void vstorea_half_rte(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float);
+
+__attribute__((always_inline)) static void
+vsthgf32e(float v, size_t i, __global half *p)
+{
+ *(__global ushort *)(p + i) = __cvt_f32_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32e"))) void vstore_half_rte(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32e"))) void vstorea_half_rte(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float);
+
+__attribute__((always_inline)) static void
+vsthlf32e(float v, size_t i, __local half *p)
+{
+ *(__local ushort *)(p + i) = __cvt_f32_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32e"))) void vstore_half_rte(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32e"))) void vstorea_half_rte(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float);
+
+__attribute__((always_inline)) static void
+vsthpf32p(float v, size_t i, half *p)
+{
+ *(ushort *)(p + i) = __cvt_f32_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32p"))) void vstore_half_rtp(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32p"))) void vstorea_half_rtp(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float);
+
+__attribute__((always_inline)) static void
+vsthgf32p(float v, size_t i, __global half *p)
+{
+ *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32p"))) void vstore_half_rtp(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32p"))) void vstorea_half_rtp(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float);
+
+__attribute__((always_inline)) static void
+vsthlf32p(float v, size_t i, __local half *p)
+{
+ *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32p"))) void vstore_half_rtp(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32p"))) void vstorea_half_rtp(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float);
+
+__attribute__((always_inline)) static void
+vsthpf32n(float v, size_t i, half *p)
+{
+ *(ushort *)(p + i) = __cvt_f32_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32n"))) void vstore_half_rtn(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32n"))) void vstorea_half_rtn(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float);
+
+__attribute__((always_inline)) static void
+vsthgf32n(float v, size_t i, __global half *p)
+{
+ *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32n"))) void vstore_half_rtn(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32n"))) void vstorea_half_rtn(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float);
+
+__attribute__((always_inline)) static void
+vsthlf32n(float v, size_t i, __local half *p)
+{
+ *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32n"))) void vstore_half_rtn(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32n"))) void vstorea_half_rtn(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float);
+
+__attribute__((always_inline)) static void
+vsthpf32z(float v, size_t i, half *p)
+{
+ *(ushort *)(p + i) = __cvt_f32_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32z"))) void vstore_half_rtz(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32z"))) void vstorea_half_rtz(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float);
+
+__attribute__((always_inline)) static void
+vsthgf32z(float v, size_t i, __global half *p)
+{
+ *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32z"))) void vstore_half_rtz(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32z"))) void vstorea_half_rtz(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float);
+
+__attribute__((always_inline)) static void
+vsthlf32z(float v, size_t i, __local half *p)
+{
+ *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32z"))) void vstore_half_rtz(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32z"))) void vstorea_half_rtz(float, size_t, __local half *);
+#endif
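
Each rounding-mode variant of the scalar store differs only in which conversion builtin it calls (__cvt_f32_to_f16_rte/_rtp/_rtn/_rtz; the unsuffixed vstore_half goes through the _cur helper). For a value that is not exactly representable in half precision the suffixed forms can differ by one unit in the last place. A small sketch storing the same value under each mode (kernel and buffer names are illustrative):

    __kernel void round_demo(float v, __global half *out)
    {
        vstore_half_rtz(v, 0, out);   /* round toward zero */
        vstore_half_rte(v, 1, out);   /* round to nearest even */
        vstore_half_rtp(v, 2, out);   /* round toward +infinity */
        vstore_half_rtn(v, 3, out);   /* round toward -infinity */
    }
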
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double);
+
+__attribute__((always_inline)) static void
+vsthpf64c(double v, size_t i, half *p)
+{
+ *(ushort *)(p + i) = __cvt_f64_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64c"))) void vstore_half(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64c"))) void vstorea_half(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double);
+
+__attribute__((always_inline)) static void
+vsthgf64c(double v, size_t i, __global half *p)
+{
+ *(__global ushort *)(p + i) = __cvt_f64_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64c"))) void vstore_half(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64c"))) void vstorea_half(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double);
+
+__attribute__((always_inline)) static void
+vsthlf64c(double v, size_t i, __local half *p)
+{
+ *(__local ushort *)(p + i) = __cvt_f64_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64c"))) void vstore_half(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64c"))) void vstorea_half(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double);
+
+__attribute__((always_inline)) static void
+vsthpf64e(double v, size_t i, half *p)
+{
+ *(ushort *)(p + i) = __cvt_f64_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64e"))) void vstore_half_rte(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64e"))) void vstorea_half_rte(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double);
+
+__attribute__((always_inline)) static void
+vsthgf64e(double v, size_t i, __global half *p)
+{
+ *(__global ushort *)(p + i) = __cvt_f64_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64e"))) void vstore_half_rte(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64e"))) void vstorea_half_rte(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double);
+
+__attribute__((always_inline)) static void
+vsthlf64e(double v, size_t i, __local half *p)
+{
+ *(__local ushort *)(p + i) = __cvt_f64_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64e"))) void vstore_half_rte(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64e"))) void vstorea_half_rte(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double);
+
+__attribute__((always_inline)) static void
+vsthpf64p(double v, size_t i, half *p)
+{
+ *(ushort *)(p + i) = __cvt_f64_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64p"))) void vstore_half_rtp(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64p"))) void vstorea_half_rtp(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double);
+
+__attribute__((always_inline)) static void
+vsthgf64p(double v, size_t i, __global half *p)
+{
+ *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64p"))) void vstore_half_rtp(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64p"))) void vstorea_half_rtp(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double);
+
+__attribute__((always_inline)) static void
+vsthlf64p(double v, size_t i, __local half *p)
+{
+ *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64p"))) void vstore_half_rtp(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64p"))) void vstorea_half_rtp(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double);
+
+__attribute__((always_inline)) static void
+vsthpf64n(double v, size_t i, half *p)
+{
+ *(ushort *)(p + i) = __cvt_f64_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64n"))) void vstore_half_rtn(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64n"))) void vstorea_half_rtn(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double);
+
+__attribute__((always_inline)) static void
+vsthgf64n(double v, size_t i, __global half *p)
+{
+ *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64n"))) void vstore_half_rtn(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64n"))) void vstorea_half_rtn(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double);
+
+__attribute__((always_inline)) static void
+vsthlf64n(double v, size_t i, __local half *p)
+{
+ *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64n"))) void vstore_half_rtn(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64n"))) void vstorea_half_rtn(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double);
+
+__attribute__((always_inline)) static void
+vsthpf64z(double v, size_t i, half *p)
+{
+ *(ushort *)(p + i) = __cvt_f64_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64z"))) void vstore_half_rtz(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64z"))) void vstorea_half_rtz(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double);
+
+__attribute__((always_inline)) static void
+vsthgf64z(double v, size_t i, __global half *p)
+{
+ *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64z"))) void vstore_half_rtz(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64z"))) void vstorea_half_rtz(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double);
+
+__attribute__((always_inline)) static void
+vsthlf64z(double v, size_t i, __local half *p)
+{
+ *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64z"))) void vstore_half_rtz(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64z"))) void vstorea_half_rtz(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(float2 v, size_t i, half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(float2 v, size_t i, __global half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(float2 v, size_t i, __local half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(float2 v, size_t i, half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(float2 v, size_t i, __global half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(float2 v, size_t i, __local half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(float2 v, size_t i, half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(float2 v, size_t i, __global half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(float2 v, size_t i, __local half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(float2 v, size_t i, half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(float2 v, size_t i, __global half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(float2 v, size_t i, __local half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(float2 v, size_t i, half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(float2 v, size_t i, __global half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(float2 v, size_t i, __local half *p)
+{
+ vstore2(__cvt_2f32_to_2f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(float3 v, size_t i, half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(float3 v, size_t i, __global half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(float3 v, size_t i, __local half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(float3 v, size_t i, half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(float3 v, size_t i, __global half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(float3 v, size_t i, __local half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(float3 v, size_t i, half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(float3 v, size_t i, __global half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(float3 v, size_t i, __local half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(float3 v, size_t i, half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(float3 v, size_t i, __global half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(float3 v, size_t i, __local half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(float3 v, size_t i, half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(float3 v, size_t i, __global half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(float3 v, size_t i, __local half *p)
+{
+ vstore3(__cvt_3f32_to_3f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(float4 v, size_t i, half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(float4 v, size_t i, __global half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(float4 v, size_t i, __local half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(float4 v, size_t i, half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(float4 v, size_t i, __global half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(float4 v, size_t i, __local half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(float4 v, size_t i, half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(float4 v, size_t i, __global half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(float4 v, size_t i, __local half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(float4 v, size_t i, half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(float4 v, size_t i, __global half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(float4 v, size_t i, __local half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(float4 v, size_t i, half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(float4 v, size_t i, __global half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(float4 v, size_t i, __local half *p)
+{
+ vstore4(__cvt_4f32_to_4f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
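
The vector stores mirror the scalar ones: the value is narrowed in one call to the corresponding __cvt_Nf32_to_Nf16_* builtin and then written with the ordinary vstoreN over ushorts. A minimal usage sketch converting float4 data down to half (names are illustrative):

    __kernel void to_half4(__global const float4 *src, __global half *dst)
    {
        size_t gid = get_global_id(0);
        vstore_half4_rte(src[gid], gid, dst);  /* writes dst[gid*4 .. gid*4+3] with round-to-even */
    }
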
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(float8 v, size_t i, half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(float8 v, size_t i, __global half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(float8 v, size_t i, __local half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(float8 v, size_t i, half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(float8 v, size_t i, __global half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(float8 v, size_t i, __local half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(float8 v, size_t i, half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(float8 v, size_t i, __global half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(float8 v, size_t i, __local half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(float8 v, size_t i, half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(float8 v, size_t i, __global half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(float8 v, size_t i, __local half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(float8 v, size_t i, half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(float8 v, size_t i, __global half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(float8 v, size_t i, __local half *p)
+{
+ vstore8(__cvt_8f32_to_8f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(float16 v, size_t i, half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(float16 v, size_t i, __global half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(float16 v, size_t i, __local half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(float16 v, size_t i, half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(float16 v, size_t i, __global half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(float16 v, size_t i, __local half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(float16 v, size_t i, half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(float16 v, size_t i, __global half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(float16 v, size_t i, __local half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(float16 v, size_t i, half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(float16 v, size_t i, __global half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(float16 v, size_t i, __local half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(float16 v, size_t i, half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(float16 v, size_t i, __global half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(float16 v, size_t i, __local half *p)
+{
+ vstore16(__cvt_16f32_to_16f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
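+/*
+ * The same vstore_halfN_<mode> pattern repeats below for double sources:
+ * each overload converts the doubleN value directly to halfN with a
+ * __cvt_Nf64_to_Nf16_<mode> builtin and stores the bits with vstoreN.
+ */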
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(double2 v, size_t i, half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(double2 v, size_t i, __global half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(double2 v, size_t i, __local half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(double2 v, size_t i, half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(double2 v, size_t i, __global half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(double2 v, size_t i, __local half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(double2 v, size_t i, half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(double2 v, size_t i, __global half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(double2 v, size_t i, __local half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(double2 v, size_t i, half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(double2 v, size_t i, __global half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(double2 v, size_t i, __local half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(double2 v, size_t i, half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(double2 v, size_t i, __global half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(double2 v, size_t i, __local half *p)
+{
+ vstore2(__cvt_2f64_to_2f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(double3 v, size_t i, half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(double3 v, size_t i, __global half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(double3 v, size_t i, __local half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(double3 v, size_t i, half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(double3 v, size_t i, __global half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(double3 v, size_t i, __local half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(double3 v, size_t i, half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(double3 v, size_t i, __global half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(double3 v, size_t i, __local half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(double3 v, size_t i, half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(double3 v, size_t i, __global half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(double3 v, size_t i, __local half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(double3 v, size_t i, half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(double3 v, size_t i, __global half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(double3 v, size_t i, __local half *p)
+{
+ vstore3(__cvt_3f64_to_3f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(double4 v, size_t i, half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(double4 v, size_t i, __global half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(double4 v, size_t i, __local half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(double4 v, size_t i, half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(double4 v, size_t i, __global half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(double4 v, size_t i, __local half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(double4 v, size_t i, half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(double4 v, size_t i, __global half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(double4 v, size_t i, __local half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(double4 v, size_t i, half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(double4 v, size_t i, __global half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(double4 v, size_t i, __local half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(double4 v, size_t i, half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(double4 v, size_t i, __global half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(double4 v, size_t i, __local half *p)
+{
+ vstore4(__cvt_4f64_to_4f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(double8 v, size_t i, half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(double8 v, size_t i, __global half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(double8 v, size_t i, __local half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(double8 v, size_t i, half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(double8 v, size_t i, __global half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(double8 v, size_t i, __local half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(double8 v, size_t i, half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(double8 v, size_t i, __global half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(double8 v, size_t i, __local half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(double8 v, size_t i, half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(double8 v, size_t i, __global half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(double8 v, size_t i, __local half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(double8 v, size_t i, half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(double8 v, size_t i, __global half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(double8 v, size_t i, __local half *p)
+{
+ vstore8(__cvt_8f64_to_8f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(double16 v, size_t i, half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(double16 v, size_t i, __global half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(double16 v, size_t i, __local half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(double16 v, size_t i, half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(double16 v, size_t i, __global half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(double16 v, size_t i, __local half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(double16 v, size_t i, half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(double16 v, size_t i, __global half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(double16 v, size_t i, __local half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(double16 v, size_t i, half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(double16 v, size_t i, __global half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(double16 v, size_t i, __local half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(double16 v, size_t i, half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(double16 v, size_t i, __global half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(double16 v, size_t i, __local half *p)
+{
+ vstore16(__cvt_16f64_to_16f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
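+/*
+ * The vstorea_halfN ("aligned") variants start here.  Instead of vstoreN,
+ * the converted ushortN value is stored through a pointer cast of
+ * (p + i * N), which vstorea_halfn requires to be aligned to the size of
+ * the whole halfN vector.
+ */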
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(float2 v, size_t i, half *p)
+{
+
+ *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(float2 v, size_t i, __global half *p)
+{
+
+ *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(float2 v, size_t i, __local half *p)
+{
+
+ *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(float2 v, size_t i, half *p)
+{
+
+ *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(float2 v, size_t i, __global half *p)
+{
+
+ *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(float2 v, size_t i, __local half *p)
+{
+
+ *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(float2 v, size_t i, half *p)
+{
+
+ *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(float2 v, size_t i, __global half *p)
+{
+
+ *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(float2 v, size_t i, __local half *p)
+{
+
+ *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(float2 v, size_t i, half *p)
+{
+
+ *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(float2 v, size_t i, __global half *p)
+{
+
+ *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(float2 v, size_t i, __local half *p)
+{
+
+ *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(float2 v, size_t i, half *p)
+{
+
+ *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(float2 v, size_t i, __global half *p)
+{
+
+ *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(float2 v, size_t i, __local half *p)
+{
+
+ *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v);
+
+}
+#endif
+
+
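+/*
+ * For the 3-element aligned stores, the three converted values are placed
+ * in lanes .s012 of a ushort4 and the whole ushort4 is written at
+ * p + i * 4; the fourth lane is never assigned, so the padding element's
+ * value is unspecified.  This matches vstorea_half3, whose address is
+ * computed as (p + i * 4) and must be half4-aligned.
+ */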
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(float3 v, size_t i, half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_cur(v);
+ *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(float3 v, size_t i, __global half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_cur(v);
+ *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(float3 v, size_t i, __local half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_cur(v);
+ *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(float3 v, size_t i, half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rte(v);
+ *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(float3 v, size_t i, __global half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rte(v);
+ *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(float3 v, size_t i, __local half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rte(v);
+ *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(float3 v, size_t i, half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rtp(v);
+ *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(float3 v, size_t i, __global half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rtp(v);
+ *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(float3 v, size_t i, __local half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rtp(v);
+ *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(float3 v, size_t i, half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rtn(v);
+ *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(float3 v, size_t i, __global half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rtn(v);
+ *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(float3 v, size_t i, __local half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rtn(v);
+ *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(float3 v, size_t i, half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rtz(v);
+ *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(float3 v, size_t i, __global half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rtz(v);
+ *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(float3 v, size_t i, __local half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f32_to_3f16_rtz(v);
+ *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(float4 v, size_t i, half *p)
+{
+
+ *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(float4 v, size_t i, __global half *p)
+{
+
+ *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(float4 v, size_t i, __local half *p)
+{
+
+ *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(float4 v, size_t i, half *p)
+{
+
+ *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(float4 v, size_t i, __global half *p)
+{
+
+ *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(float4 v, size_t i, __local half *p)
+{
+
+ *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(float4 v, size_t i, half *p)
+{
+
+ *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(float4 v, size_t i, __global half *p)
+{
+
+ *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(float4 v, size_t i, __local half *p)
+{
+
+ *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(float4 v, size_t i, half *p)
+{
+
+ *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(float4 v, size_t i, __global half *p)
+{
+
+ *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(float4 v, size_t i, __local half *p)
+{
+
+ *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(float4 v, size_t i, half *p)
+{
+
+ *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(float4 v, size_t i, __global half *p)
+{
+
+ *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(float4 v, size_t i, __local half *p)
+{
+
+ *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(float8 v, size_t i, half *p)
+{
+
+ *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(float8 v, size_t i, __global half *p)
+{
+
+ *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(float8 v, size_t i, __local half *p)
+{
+
+ *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(float8 v, size_t i, half *p)
+{
+
+ *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(float8 v, size_t i, __global half *p)
+{
+
+ *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(float8 v, size_t i, __local half *p)
+{
+
+ *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(float8 v, size_t i, half *p)
+{
+
+ *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(float8 v, size_t i, __global half *p)
+{
+
+ *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(float8 v, size_t i, __local half *p)
+{
+
+ *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(float8 v, size_t i, half *p)
+{
+
+ *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(float8 v, size_t i, __global half *p)
+{
+
+ *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(float8 v, size_t i, __local half *p)
+{
+
+ *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(float8 v, size_t i, half *p)
+{
+
+ *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(float8 v, size_t i, __global half *p)
+{
+
+ *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(float8 v, size_t i, __local half *p)
+{
+
+ *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(float16 v, size_t i, half *p)
+{
+
+ *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(float16 v, size_t i, __global half *p)
+{
+
+ *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(float16 v, size_t i, __local half *p)
+{
+
+ *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(float16 v, size_t i, half *p)
+{
+
+ *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(float16 v, size_t i, __global half *p)
+{
+
+ *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(float16 v, size_t i, __local half *p)
+{
+
+ *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(float16 v, size_t i, half *p)
+{
+
+ *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(float16 v, size_t i, __global half *p)
+{
+
+ *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(float16 v, size_t i, __local half *p)
+{
+
+ *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(float16 v, size_t i, half *p)
+{
+
+ *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(float16 v, size_t i, __global half *p)
+{
+
+ *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(float16 v, size_t i, __local half *p)
+{
+
+ *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(float16 v, size_t i, half *p)
+{
+
+ *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(float16 v, size_t i, __global half *p)
+{
+
+ *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(float16 v, size_t i, __local half *p)
+{
+
+ *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v);
+
+}
+#endif
+
+
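+/*
+ * Aligned-store variants for double sources follow, again converting
+ * doubleN straight to halfN with the __cvt_Nf64_to_Nf16_<mode> builtins.
+ */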
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(double2 v, size_t i, half *p)
+{
+
+ *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(double2 v, size_t i, __global half *p)
+{
+
+ *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(double2 v, size_t i, __local half *p)
+{
+
+ *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(double2 v, size_t i, half *p)
+{
+
+ *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(double2 v, size_t i, __global half *p)
+{
+
+ *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(double2 v, size_t i, __local half *p)
+{
+
+ *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(double2 v, size_t i, half *p)
+{
+
+ *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(double2 v, size_t i, __global half *p)
+{
+
+ *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(double2 v, size_t i, __local half *p)
+{
+
+ *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(double2 v, size_t i, half *p)
+{
+
+ *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(double2 v, size_t i, __global half *p)
+{
+
+ *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(double2 v, size_t i, __local half *p)
+{
+
+ *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(double2 v, size_t i, half *p)
+{
+
+ *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(double2 v, size_t i, __global half *p)
+{
+
+ *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(double2 v, size_t i, __local half *p)
+{
+
+ *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(double3 v, size_t i, half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_cur(v);
+ *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(double3 v, size_t i, __global half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_cur(v);
+ *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(double3 v, size_t i, __local half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_cur(v);
+ *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(double3 v, size_t i, half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rte(v);
+ *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(double3 v, size_t i, __global half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rte(v);
+ *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(double3 v, size_t i, __local half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rte(v);
+ *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(double3 v, size_t i, half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rtp(v);
+ *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(double3 v, size_t i, __global half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rtp(v);
+ *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(double3 v, size_t i, __local half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rtp(v);
+ *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(double3 v, size_t i, half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rtn(v);
+ *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(double3 v, size_t i, __global half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rtn(v);
+ *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(double3 v, size_t i, __local half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rtn(v);
+ *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(double3 v, size_t i, half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rtz(v);
+ *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(double3 v, size_t i, __global half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rtz(v);
+ *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(double3 v, size_t i, __local half *p)
+{
+
+ ushort4 h;
+ h.s012 = __cvt_3f64_to_3f16_rtz(v);
+ *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(double4 v, size_t i, half *p)
+{
+
+ *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(double4 v, size_t i, __global half *p)
+{
+
+ *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(double4 v, size_t i, __local half *p)
+{
+
+ *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(double4 v, size_t i, half *p)
+{
+
+ *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(double4 v, size_t i, __global half *p)
+{
+
+ *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(double4 v, size_t i, __local half *p)
+{
+
+ *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(double4 v, size_t i, half *p)
+{
+
+ *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(double4 v, size_t i, __global half *p)
+{
+
+ *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(double4 v, size_t i, __local half *p)
+{
+
+ *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(double4 v, size_t i, half *p)
+{
+
+ *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(double4 v, size_t i, __global half *p)
+{
+
+ *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(double4 v, size_t i, __local half *p)
+{
+
+ *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(double4 v, size_t i, half *p)
+{
+
+ *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(double4 v, size_t i, __global half *p)
+{
+
+ *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(double4 v, size_t i, __local half *p)
+{
+
+ *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(double8 v, size_t i, half *p)
+{
+
+ *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(double8 v, size_t i, __global half *p)
+{
+
+ *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(double8 v, size_t i, __local half *p)
+{
+
+ *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(double8 v, size_t i, half *p)
+{
+
+ *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(double8 v, size_t i, __global half *p)
+{
+
+ *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(double8 v, size_t i, __local half *p)
+{
+
+ *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(double8 v, size_t i, half *p)
+{
+
+ *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(double8 v, size_t i, __global half *p)
+{
+
+ *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(double8 v, size_t i, __local half *p)
+{
+
+ *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(double8 v, size_t i, half *p)
+{
+
+ *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(double8 v, size_t i, __global half *p)
+{
+
+ *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(double8 v, size_t i, __local half *p)
+{
+
+ *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(double8 v, size_t i, half *p)
+{
+
+ *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(double8 v, size_t i, __global half *p)
+{
+
+ *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(double8 v, size_t i, __local half *p)
+{
+
+ *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(double16 v, size_t i, half *p)
+{
+
+ *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(double16 v, size_t i, __global half *p)
+{
+
+ *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(double16 v, size_t i, __local half *p)
+{
+
+ *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(double16 v, size_t i, half *p)
+{
+
+ *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(double16 v, size_t i, __global half *p)
+{
+
+ *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(double16 v, size_t i, __local half *p)
+{
+
+ *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(double16 v, size_t i, half *p)
+{
+
+ *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(double16 v, size_t i, __global half *p)
+{
+
+ *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(double16 v, size_t i, __local half *p)
+{
+
+ *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(double16 v, size_t i, half *p)
+{
+
+ *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(double16 v, size_t i, __global half *p)
+{
+
+ *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(double16 v, size_t i, __local half *p)
+{
+
+ *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(double16 v, size_t i, half *p)
+{
+
+ *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(double16 v, size_t i, __global half *p)
+{
+
+ *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(double16 v, size_t i, __local half *p)
+{
+
+ *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v);
+
+}
+#endif
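
The long run of overloads above follows one pattern: vstorea_halfN and its _rte/_rtp/_rtn/_rtz variants convert a doubleN vector to N half-precision bit patterns with a __cvt_* intrinsic and store them at an aligned element offset (the half3 case widens into a ushort4 slot). A minimal usage sketch follows; it is not part of the patch, and the kernel and buffer names are assumed.

// Hedged usage sketch: pack double4 results into an aligned half buffer
// through the vstorea_half4_rte overload added above.
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

__kernel void pack_to_half(__global const double4 *src, __global half *dst)
{
    size_t gid = get_global_id(0);
    // Element gid of dst is treated as a half4 slot (offset gid * 4 halves);
    // the _rte suffix selects round-to-nearest-even conversion.
    vstorea_half4_rte(src[gid], gid, dst);
}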
Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wg.h
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wg.h?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wg.h (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wg.h Tue Oct 7 12:10:46 2014
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// XXX The runtime computes CL_DEVICE_MAX_WORK_GROUP_SIZE as
+// XXX dev->wave_front_size * dev->max_waves_per_simd
+// XXX If max_waves_per_simd is ever raised then this code will need to be updated
+#define MAX_WAVES_PER_SIMD 4
+
+#pragma OPENCL EXTENSION cl_amd_program_scope_locals : enable
+extern __local ulong __wg_scratch[MAX_WAVES_PER_SIMD];
+
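The XXX comment above ties the scratch size to how the runtime derives CL_DEVICE_MAX_WORK_GROUP_SIZE. A hedged arithmetic sketch, assuming the usual 64-lane wavefront on this hardware:

// Assumed values, not taken from the patch:
//   wave_front_size               = 64        // lanes per wavefront/sub-group
//   max_waves_per_simd            = 4         // MAX_WAVES_PER_SIMD above
//   CL_DEVICE_MAX_WORK_GROUP_SIZE = 64 * 4    // = 256 work-items
// A work-group therefore holds at most 4 sub-groups, so one ulong slot per
// sub-group (4 in total) is enough scratch for the work-group reduce, scan,
// broadcast, and any/all helpers that include this header.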
Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wganyall.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wganyall.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wganyall.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wganyall.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
+#define GEN_AA(SUF,ID) \
+__attribute__((overloadable, always_inline)) int \
+work_group_##SUF(int predicate) \
+{ \
+ uint n = get_num_sub_groups(); \
+ int a = sub_group_##SUF(predicate); \
+ if (n == 1) \
+ return a; \
+ \
+ __local int *p = (__local int *)__wg_scratch; \
+ uint l = get_sub_group_local_id(); \
+ uint i = get_sub_group_id(); \
+ \
+ if (l == 0) \
+ p[i] = a; \
+ \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if (i == 0) { \
+ a = l < n ? p[l] : ID; \
+ a = sub_group_##SUF(a); \
+ if (l == 0) \
+ p[0] = a; \
+ } \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ a = p[0]; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+ return a; \
+}
+
+GEN_AA(all, 1U)
+GEN_AA(any, 0U)
+
+#endif
+
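GEN_AA builds work_group_any/all in two levels: each sub-group reduces its predicate with sub_group_any/all, sub-group 0 then reduces the per-sub-group partials parked in __wg_scratch, and the result is published through p[0]. A hedged usage sketch (kernel and buffer names are assumed):

// Hedged usage sketch: every work-item learns whether the predicate held on
// the entire work-group, then one work-item records the answer.
__kernel void all_in_range(__global const float *x, __global int *ok)
{
    size_t gid = get_global_id(0);
    int pred = x[gid] >= 0.0f && x[gid] <= 1.0f;
    int group_ok = work_group_all(pred);   // same value in every work-item
    if (get_local_id(0) == 0)
        ok[get_group_id(0)] = group_ok;
}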
Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wgbarrier.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wgbarrier.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wgbarrier.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wgbarrier.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+extern void __hsail_barrier(void);
+
+__attribute__((overloadable, weak, always_inline)) void
+work_group_barrier(cl_mem_fence_flags flags, memory_scope scope)
+{
+ atomic_work_item_fence(flags, memory_order_release, scope);
+ __hsail_barrier();
+ atomic_work_item_fence(flags, memory_order_acquire, scope);
+}
+
+__attribute__((overloadable, weak, always_inline)) void
+work_group_barrier(cl_mem_fence_flags flags)
+{
+ work_group_barrier(flags, memory_scope_work_group);
+}
+
+#endif
+
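The two-argument overload brackets the HSAIL barrier with release and acquire fences at the requested scope, and the flags-only overload defaults the scope to memory_scope_work_group. A hedged usage sketch (kernel and buffer names are assumed):

// Hedged usage sketch: a local-memory tile exchange synchronized by the
// flags-only work_group_barrier overload defined above.
__kernel void reverse_tile(__global const int *in, __global int *out,
                           __local int *tile)
{
    size_t lid = get_local_id(0);
    size_t lsz = get_local_size(0);
    size_t base = get_group_id(0) * lsz;

    tile[lid] = in[base + lid];
    work_group_barrier(CLK_LOCAL_MEM_FENCE);   // all stores to tile visible
    out[base + lid] = tile[lsz - 1 - lid];
}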
Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wgbcast.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wgbcast.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wgbcast.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wgbcast.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
+#define GEN_BROADCAST(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_broadcast(TYPE a, size_t local_id_x) \
+{ \
+ if (get_num_sub_groups() == 1) \
+ return sub_group_broadcast(a, local_id_x); \
+ \
+ __local TYPE *p = (__local TYPE *)__wg_scratch; \
+ if (get_local_id(0) == local_id_x) \
+ *p = a; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ a = *p; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ return a; \
+} \
+\
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_broadcast(TYPE a, size_t local_id_x, size_t local_id_y) \
+{ \
+ __local TYPE *p = (__local TYPE *)__wg_scratch; \
+ if (get_local_id(0) == local_id_x && get_local_id(1) == local_id_y) \
+ *p = a; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ a = *p; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ return a; \
+} \
+\
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_broadcast(TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z) \
+{ \
+ __local TYPE *p = (__local TYPE *)__wg_scratch; \
+ if (get_local_id(0) == local_id_x && get_local_id(1) == local_id_y && get_local_id(2) == local_id_z) \
+ *p = a; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ a = *p; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ return a; \
+}
+
+GEN_BROADCAST(uint)
+GEN_BROADCAST(int)
+GEN_BROADCAST(ulong)
+GEN_BROADCAST(long)
+GEN_BROADCAST(float)
+GEN_BROADCAST(double)
+
+#endif
+
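Each generated work_group_broadcast overload publishes the chosen work-item's value through a single __wg_scratch slot and uses two barriers so the slot can be safely reused. A hedged usage sketch (kernel and buffer names are assumed):

// Hedged usage sketch: subtract the value held by local id 0 from every
// element handled by the work-group.
__kernel void subtract_first(__global float *data)
{
    size_t gid = get_global_id(0);
    float v = data[gid];
    float base = work_group_broadcast(v, (size_t)0);   // value from local id 0
    data[gid] = v - base;
}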
Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wgreduce.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wgreduce.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wgreduce.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wgreduce.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
+#define GENA(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_reduce_add(TYPE a) \
+{ \
+ uint n = get_num_sub_groups(); \
+ a = sub_group_reduce_add(a); \
+ if (n == 1) \
+ return a; \
+ \
+ __local TYPE *p = (__local TYPE *)__wg_scratch; \
+ uint l = get_sub_group_local_id(); \
+ uint i = get_sub_group_id(); \
+ \
+ if (l == 0) \
+ p[i] = a; \
+ \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if (i == 0) { \
+ a = l < n ? p[l] : (TYPE)0; \
+ a = sub_group_reduce_add(a); \
+ if (l == 0) \
+ p[0] = a; \
+ } \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ a = p[0]; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ return a; \
+}
+
+#define GENO(TYPE,SUF,ID) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_reduce_##SUF(TYPE a) \
+{ \
+ uint n = get_num_sub_groups(); \
+ a = sub_group_reduce_##SUF(a); \
+ if (n == 1) \
+ return a; \
+ \
+ __local TYPE *p = (__local TYPE *)__wg_scratch; \
+ uint l = get_sub_group_local_id(); \
+ uint i = get_sub_group_id(); \
+ \
+ if (l == 0) \
+ p[i] = a; \
+ \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if (i == 0) { \
+ a = l < n ? p[l] : ID; \
+ a = sub_group_reduce_##SUF(a); \
+ if (l == 0) \
+ p[0] = a; \
+ } \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ a = p[0]; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ return a; \
+}
+
+GENA(int)
+GENA(uint)
+GENA(long)
+GENA(ulong)
+GENA(float)
+GENA(double)
+
+GENO(int,max,INT_MIN)
+GENO(uint,max,0U)
+GENO(long,max,LONG_MIN)
+GENO(ulong,max,0UL)
+GENO(float,max,-INFINITY)
+GENO(double,max,-(double)INFINITY)
+
+GENO(int,min,INT_MAX)
+GENO(uint,min,UINT_MAX)
+GENO(long,min,LONG_MAX)
+GENO(ulong,min,ULONG_MAX)
+GENO(float,min,INFINITY)
+GENO(double,min,(double)INFINITY)
+
+#endif
+
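GENA and GENO follow the same two-level shape as the any/all helpers: a sub-group reduction, one partial per sub-group in __wg_scratch, a second sub-group reduction over the partials in sub-group 0, and a broadcast through p[0], with the ID argument supplying the identity for non-participating lanes. A hedged usage sketch (kernel and buffer names are assumed):

// Hedged usage sketch: one partial sum per work-group, written by work-item 0.
__kernel void partial_sums(__global const float *in, __global float *out)
{
    float x = in[get_global_id(0)];
    float sum = work_group_reduce_add(x);   // same value in every work-item
    if (get_local_id(0) == 0)
        out[get_group_id(0)] = sum;
}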
Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wgscan.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wgscan.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wgscan.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wgscan.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "wg.h"
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#define GENIA(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_inclusive_add(TYPE a) \
+{ \
+ uint n = get_num_sub_groups(); \
+ a = sub_group_scan_inclusive_add(a); \
+ if (n == 1) \
+ return a; \
+ \
+ __local TYPE *p = (__local TYPE *)__wg_scratch; \
+ uint l = get_sub_group_local_id(); \
+ uint i = get_sub_group_id(); \
+ \
+ if (l == get_sub_group_size() - 1U) \
+ p[i] = a; \
+ \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if (i == 0) { \
+ TYPE t = l < n ? p[l] : (TYPE)0; \
+ t = sub_group_scan_inclusive_add(t); \
+ if (l < n) \
+ p[l] = t; \
+ } \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ TYPE ret = i == 0 ? a : a + p[i-1]; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ return ret; \
+}
+
+#define GENIO(TYPE,SUF,ID) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_inclusive_##SUF(TYPE a) \
+{ \
+ uint n = get_num_sub_groups(); \
+ a = sub_group_scan_inclusive_##SUF(a); \
+ if (n == 1) \
+ return a; \
+ \
+ __local TYPE *p = (__local TYPE *)__wg_scratch; \
+ uint l = get_sub_group_local_id(); \
+ uint i = get_sub_group_id(); \
+ \
+ if (l == get_sub_group_size() - 1U) \
+ p[i] = a; \
+ \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if (i == 0) { \
+ TYPE t = l < n ? p[l] : ID; \
+ t = sub_group_scan_inclusive_##SUF(t); \
+ if (l < n) \
+ p[l] = t; \
+ } \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ TYPE ret = i == 0 ? a : SUF(a, p[i-1]); \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ return ret; \
+}
+
+GENIA(int)
+GENIA(uint)
+GENIA(long)
+GENIA(ulong)
+GENIA(float)
+GENIA(double)
+
+GENIO(int,max,INT_MIN)
+GENIO(uint,max,0U)
+GENIO(long,max,LONG_MIN)
+GENIO(ulong,max,0UL)
+GENIO(float,max,-INFINITY)
+GENIO(double,max,-(double)INFINITY)
+
+GENIO(int,min,INT_MAX)
+GENIO(uint,min,UINT_MAX)
+GENIO(long,min,LONG_MAX)
+GENIO(ulong,min,ULONG_MAX)
+GENIO(float,min,INFINITY)
+GENIO(double,min,(double)INFINITY)
+
+#define GENEA(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_exclusive_add(TYPE a) \
+{ \
+ uint n = get_num_sub_groups(); \
+ TYPE t = sub_group_scan_exclusive_add(a); \
+ if (n == 1) \
+ return t; \
+ \
+ __local TYPE *p = (__local TYPE *)__wg_scratch; \
+ uint l = get_sub_group_local_id(); \
+ uint i = get_sub_group_id(); \
+ \
+ if (l == get_sub_group_size() - 1U) \
+ p[i] = a + t; \
+ \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if (i == 0) { \
+ TYPE s = l < n ? p[l] : (TYPE)0; \
+ s = sub_group_scan_inclusive_add(s); \
+ if (l < n) \
+ p[l] = s; \
+ } \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ TYPE ret = i == 0 ? t : t + p[i-1]; \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ return ret; \
+}
+
+#define GENEO(TYPE,SUF,ID) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_exclusive_##SUF(TYPE a) \
+{ \
+ uint n = get_num_sub_groups(); \
+ TYPE t = sub_group_scan_exclusive_##SUF(a); \
+ if (n == 1) \
+ return t; \
+ \
+ __local TYPE *p = (__local TYPE *)__wg_scratch; \
+ uint l = get_sub_group_local_id(); \
+ uint i = get_sub_group_id(); \
+ \
+ if (l == get_sub_group_size() - 1U) \
+ p[i] = SUF(a, t); \
+ \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if (i == 0) { \
+ TYPE s = l < n ? p[l] : ID; \
+ s = sub_group_scan_inclusive_##SUF(s); \
+ if (l < n) \
+ p[l] = s; \
+ } \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ TYPE ret = i == 0 ? t : SUF(t, p[i-1]); \
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ return ret; \
+}
+
+GENEA(int)
+GENEA(uint)
+GENEA(long)
+GENEA(ulong)
+GENEA(float)
+GENEA(double)
+
+GENEO(int,max,INT_MIN)
+GENEO(uint,max,0U)
+GENEO(long,max,LONG_MIN)
+GENEO(ulong,max,0UL)
+GENEO(float,max,-INFINITY)
+GENEO(double,max,-(double)INFINITY)
+
+GENEO(int,min,INT_MAX)
+GENEO(uint,min,UINT_MAX)
+GENEO(long,min,LONG_MAX)
+GENEO(ulong,min,ULONG_MAX)
+GENEO(float,min,INFINITY)
+GENEO(double,min,(double)INFINITY)
+
+#endif
+
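The scan macros compose a sub-group scan with an inclusive scan over the per-sub-group totals: each work-item then offsets its sub-group-local result by the total of all preceding sub-groups (p[i-1]). A hedged usage sketch of the exclusive-add form (kernel and buffer names are assumed):

// Hedged usage sketch: stream compaction within one work-group using
// work_group_scan_exclusive_add as the write offset.
__kernel void compact_group(__global const int *in, __global int *out,
                            __global int *count)
{
    size_t gid = get_global_id(0);
    int keep = in[gid] != 0;
    int pos = work_group_scan_exclusive_add(keep);   // slot for this item
    if (keep)
        out[get_group_id(0) * get_local_size(0) + pos] = in[gid];
    if (get_local_id(0) == get_local_size(0) - 1)
        count[get_group_id(0)] = pos + keep;         // items kept in the group
}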
Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wgscratch.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wgscratch.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wgscratch.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wgscratch.cl Tue Oct 7 12:10:46 2014
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
+// Temporary data for work group functions
+__local ulong __wg_scratch[MAX_WAVES_PER_SIMD];
+
+#endif